## Image File Handling Prep
Handles two inputs: a labels spreadsheet, which includes the file list and the class label of each image in the set, and a matching directory of images.
The output moves the images into a directory tree of the form class_x/train/ and class_x/test/ based on the class label.

In [53]:
# Standard Dsci and data wrangling packages
import numpy as np
import pandas as pd

# I/O handling packages
import os
import shutil
import pickle

# For splitting the files into train and test directories
# based on labels.csv.
from sklearn.model_selection import train_test_split

# Don't need the packages below as I'll manage the data 
# set creation in the Keras_TFlow_Model file.

#import random
# import cv2

In [39]:
# labels.csv describes the image dataset: one row per image file,
# with its numeric class label and species name.

deep_df = pd.read_csv(
    filepath_or_buffer='~/python-repo/DeepWeeds_Dataset/Data/labels.csv',
)

In [40]:
deep_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17509 entries, 0 to 17508
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Filename  17509 non-null  object
 1   Label     17509 non-null  int64 
 2   Species   17509 non-null  object
dtypes: int64(1), object(2)
memory usage: 410.5+ KB


In [41]:
deep_df.head()

Unnamed: 0,Filename,Label,Species
0,20160928-140314-0.jpg,0,Chinee apple
1,20160928-140337-0.jpg,0,Chinee apple
2,20160928-140731-0.jpg,0,Chinee apple
3,20160928-140747-0.jpg,0,Chinee apple
4,20160928-141107-0.jpg,0,Chinee apple


In [42]:
# Check number of classes
deep_df['Label'].unique()

array([0, 1, 8, 7, 6, 4, 3, 5, 2])

In [43]:
# Map each integer label (0 through 8) to the name of its class
# directory, e.g. 3 -> 'class_3'.
label_dict = {label: f'class_{label}' for label in range(9)}

In [44]:
# Split the labels table into train and test frames with
# train_test_split: 20% of the rows go to test, the seed is fixed so
# the split is repeatable, and no stratification is applied.
train, test = train_test_split(
    deep_df,
    random_state=123,
    test_size=0.2,
)

In [45]:
train.shape

(14007, 3)

In [46]:
test.shape

(3502, 3)

In [47]:
train.head()

Unnamed: 0,Filename,Label,Species
820,20170210-142607-0.jpg,7,Snake weed
10592,20171219-110049-3.jpg,8,Negative
10757,20171219-113609-2.jpg,8,Negative
5300,20170920-100739-1.jpg,2,Parkinsonia
1357,20170315-085825-0.jpg,8,Negative


In [48]:
test.head()

Unnamed: 0,Filename,Label,Species
11509,20171220-095434-1.jpg,8,Negative
7207,20171109-090410-3.jpg,5,Rubber vine
1071,20170217-115719-0.jpg,0,Chinee apple
203,20170127-102206-0.jpg,1,Lantana
6584,20171102-100741-2.jpg,4,Prickly acacia


In [56]:
# Number of training rows labelled 8 (class_8, the negative class).
train.loc[train['Label'].eq(8)].count()

Filename    7287
Label       7287
Species     7287
dtype: int64

In [57]:
# Calculate % of df.
7287/14007

0.52023988005997

In [58]:
# Number of test rows labelled 8 (class_8, the negative class).
test.loc[test['Label'].eq(8)].count()

Filename    1819
Label       1819
Species     1819
dtype: int64

In [59]:
# Calculate % of df.
1819/3502

0.5194174757281553

We've maintained roughly the same share of the negative class (about 52%) in both train and test after our train/test split, even without stratification. We're ready to move the images!


In [49]:
# Order the training rows by their original index, then renumber
# them 0..n-1. sort_index returns a new frame, so the split result
# itself is never modified.
train = train.sort_index(ignore_index=True)

In [50]:
# Order the test rows by their original index, then renumber them
# 0..n-1. sort_index returns a new frame, so the split result itself
# is never modified.
test = test.sort_index(ignore_index=True)

In [54]:
# Save the train and test df's in case of need to
# reference which images are in which folders.
# Each file is opened in a `with` block so the handle is closed even
# if pickle.dump raises part-way through.

with open('train_labels.pickle', 'wb') as pickle_out:
    pickle.dump(train, pickle_out)

with open('test_labels.pickle', 'wb') as pickle_out:
    pickle.dump(test, pickle_out)

In [52]:
# Move every image listed in the train and test dataframes into its
# class directory: <data_dir>/class_x/train or <data_dir>/class_x/test.
# This runs once on an image collection; after it has run, the source
# files are gone from data_dir, so re-running it will raise.

data_dir = '/home/ksawczuk/python-repo/DeepWeeds_Dataset/Data'

# Make sure every destination directory exists before moving anything.
# shutil.move only drops the file *into* the destination when that
# destination is an existing directory; otherwise it renames the image
# to a file called 'train'/'test' and each later move overwrites it.
for class_dir in label_dict.values():
    for split_name in ('train', 'test'):
        os.makedirs(os.path.join(data_dir, class_dir, split_name), exist_ok=True)

for split_name, split_df in (('train', train), ('test', test)):
    # Walk the two columns in lockstep instead of indexing row by row.
    for image_name, label in zip(split_df['Filename'], split_df['Label']):
        dest_dir = os.path.join(data_dir, label_dict[label], split_name)
        shutil.move(os.path.join(data_dir, image_name), dest_dir)


# Stop Here!
Everything below this cell isn't needed if the model file utilizes the Keras ImageDataGenerator.

In [10]:
# data_dir = '/home/ksawczuk/python-repo/DeepWeeds_Dataset/Data'

# categories = ['class_0', 'class_1', 'class_2', 
#               'class_3', 'class_4', 'class_5', 
#               'class_6', 'class_7', 'class_8']
# img_size = 28

# for category in categories:
#     path = os.path.join(data_dir, category)
#     for img in os.listdir(path):
#         img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_COLOR)

In [11]:
# training_data = []
# def create_training_data():
#     for i, category in enumerate(categories):
#         path = os.path.join(data_dir, category)
#         class_num = categories.index(category)
#         for img in os.listdir(path):
#             img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_COLOR)
#             new_array = cv2.resize(img_array, (img_size, img_size))
#             training_data.append([new_array, class_num])


In [12]:
# create_training_data()

In [13]:
# len(training_data)

17509

In [15]:
# random.shuffle(training_data)

In [16]:
# X = []
# y = []
# for features, label_data in training_data:
#     X.append(features)
#     y.append(label_data)
    

In [17]:
# X = np.array(X).reshape(-1, img_size, img_size, 3)

In [20]:
# y = np.array(y)

In [21]:
# pickle_out = open('X.pickle', 'wb')
# pickle.dump(X, pickle_out)
# pickle_out.close()

# pickle_out = open('y.pickle', 'wb')
# pickle.dump(y, pickle_out)
# pickle_out.close()