## Import required libraries

In [1]:
%matplotlib inline
import os, sys
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
print(os.listdir("../input"))
import matplotlib.pyplot as plt
import pickle
from random import shuffle, seed
seed(1)

['rwis224']


## Process file paths

In [2]:
# Image dimensions
image_height = 224
image_width = 224
channels = 3
display_images = True

In [3]:
label_names = ['Bare pavement', 'Partial snow coverage', 'Full snow coverage']

In [4]:
# Function that outputs the list of all files in the directory tree 
def getListOfFiles(dirName):
    # create a list of file and sub directories 
    # names in the given directory 
    listOfFile = os.listdir(dirName)
    allFiles = list()
    # Iterate over all the entries
    for entry in listOfFile:
        # Create full path
        fullPath = os.path.join(dirName, entry)
        # If entry is a directory then get the list of files in this directory 
        if os.path.isdir(fullPath):
            allFiles = allFiles + getListOfFiles(fullPath)
        else:
            if '.jpg' in fullPath :
                allFiles.append(fullPath[24:])    
    return allFiles

In [5]:
# Function that prints the total number of images per category in the given set
def print_items(list_of_files):
    cat1 = 0
    cat2 = 0
    cat3 = 0
    for ifile in list_of_files:
        if 'C1' in ifile:
            cat1 += 1
        elif 'C2' in ifile:
            cat2 += 1
        elif 'C3' in ifile:
            cat3 += 1
    total = cat1+cat2+cat3
    print ('Total images: ' + str(total))
    print (label_names[0] + ' ' + str(cat1) + ' Approx. ' + str(int(cat1/total*100)) + '%')
    print (label_names[1] + ' ' + str(cat2) + ' Approx. ' + str(int(cat2/total*100)) + '%')
    print (label_names[2] + ' ' + str(cat3) + ' Approx. ' + str(int(cat3/total*100)) + '%')

In [6]:
# Obtain filenames
folder = '../input/rwis' + str(image_height)
allfiles = getListOfFiles(folder)
print_items(allfiles)

Total images: 14000
Bare pavement 6339 Approx. 45%
Partial snow coverage 5672 Approx. 40%
Full snow coverage 1989 Approx. 14%


In [7]:
# Shuffle filenames
shuffle(allfiles)

In [8]:
# Sample of filenames
allfiles[:10]

['/ER-14/C2/ER-Hwy401_Curry_Hill-p3-1515260700.jpg',
 '/ER-32/C3/ER-Hwy416_Rideau_River-p3-1515841200.jpg',
 '/NWR-09/C2/NWR-Hwy11_at_Hwy625-p3-1512923400.jpg',
 '/CR-01/C2/CR-QEW_near_Con_Rd-p2-1514052000.jpg',
 '/WR-13/C3/WR-Hwy6_Mt_Forest-p2-1514197800.jpg',
 '/NWR-25/C1/NWR-Hwy11_near_Rock_Point_Rd-p2-1517428800.jpg',
 '/WR-06/C2/WR-Hwy21_Port_Elgin-p3-1515244500.jpg',
 '/WR-13/C2/WR-Hwy6_Mt_Forest-p2-1513088100.jpg',
 '/ER-11/C2/ER-Hwy17_Renfrew-p2-1513116000.jpg',
 '/NWR-14/C1/NWR-Hwy17_near_Raith-p2-1516467600.jpg']

In [9]:
# Split 90% for training and 10% for testing
train = allfiles[:int(len(allfiles)*0.9)].copy()
test = allfiles[int(len(allfiles)*0.9):].copy()

In [10]:
print('Images in the train set: ' + str(len(train)))
print('Images in the test set: ' + str(len(test)))

Images in the train set: 12600
Images in the test set: 1400


In [11]:
print('- Summary of images in the train set -')
print_items(train)

- Summary of images in the train set -
Total images: 12600
Bare pavement 5691 Approx. 45%
Partial snow coverage 5114 Approx. 40%
Full snow coverage 1795 Approx. 14%


In [12]:
print('- Summary of images in the test set -')
print_items(test)

- Summary of images in the test set -
Total images: 1400
Bare pavement 648 Approx. 46%
Partial snow coverage 558 Approx. 39%
Full snow coverage 194 Approx. 13%


In [13]:
# Double check for duplicates

for file in test:
    if file in train:
        print('duplicate!')

for file in train:
    if file in test:
        print('duplicate!')

print('all good')        

all good


## Save as pickle file

In [14]:
# Save train and test lists
split_dic = {'train':train,'test':test}

with open('split.pickle', 'wb') as handle:
    pickle.dump(split_dic, handle, protocol=pickle.HIGHEST_PROTOCOL)