## Data Prep

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import cv2
import random
import pickle


In [5]:
# Image dataset info is included in the labels.csv file

deep_df = pd.read_csv('~/python-repo/DeepWeeds_Dataset/Data/labels.csv')

In [6]:
deep_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17509 entries, 0 to 17508
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Filename  17509 non-null  object
 1   Label     17509 non-null  int64 
 2   Species   17509 non-null  object
dtypes: int64(1), object(2)
memory usage: 410.5+ KB


In [7]:
deep_df.head()

Unnamed: 0,Filename,Label,Species
0,20160928-140314-0.jpg,0,Chinee apple
1,20160928-140337-0.jpg,0,Chinee apple
2,20160928-140731-0.jpg,0,Chinee apple
3,20160928-140747-0.jpg,0,Chinee apple
4,20160928-141107-0.jpg,0,Chinee apple


In [8]:
# Check number of classes
deep_df['Label'].unique()

array([0, 1, 8, 7, 6, 4, 3, 5, 2])

In [9]:
# Create dictionary of class directories from the label numbers.
label_dict = {0 : 'class_0', 1 : 'class_1', 2 : 'class_2', 
              3 : 'class_3', 4 : 'class_4', 5 : 'class_5',
              6 : 'class_6', 7 : 'class_7', 8 : 'class_8'}

In [7]:
# Loop over the dataframe and move images into the appropriate class directory.

for row in range(len(deep_df)):
    image_name = deep_df.loc[row, 'Filename']
    new_path_dir = label_dict[deep_df.loc[row, 'Label']]
    shutil.move(f'/home/ksawczuk/python-repo/DeepWeeds_Dataset/Data/{image_name}',
                f'/home/ksawczuk/python-repo/DeepWeeds_Dataset/Data/{new_path_dir}')
    

Error: Destination path '/home/ksawczuk/python-repo/DeepWeeds_Dataset/Data/class_0/20160928-140314-0.jpg' already exists

In [10]:
data_dir = '/home/ksawczuk/python-repo/DeepWeeds_Dataset/Data'

categories = ['class_0', 'class_1', 'class_2', 
              'class_3', 'class_4', 'class_5', 
              'class_6', 'class_7', 'class_8']
img_size = 28

for category in categories:
    path = os.path.join(data_dir, category)
    for img in os.listdir(path):
        img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_COLOR)

In [11]:
training_data = []
def create_training_data():
    for i, category in enumerate(categories):
        path = os.path.join(data_dir, category)
        class_num = categories.index(category)
        for img in os.listdir(path):
            img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_COLOR)
            new_array = cv2.resize(img_array, (img_size, img_size))
            training_data.append([new_array, class_num])


In [12]:
create_training_data()

In [13]:
len(training_data)

17509

In [15]:
random.shuffle(training_data)

In [16]:
X = []
y = []
for features, label_data in training_data:
    X.append(features)
    y.append(label_data)
    

In [17]:
X = np.array(X).reshape(-1, img_size, img_size, 3)

In [20]:
y = np.array(y)

In [21]:
pickle_out = open('X.pickle', 'wb')
pickle.dump(X, pickle_out)
pickle_out.close()

pickle_out = open('y.pickle', 'wb')
pickle.dump(y, pickle_out)
pickle_out.close()