In [1]:
import pandas as pd
import numpy as np
import os
from PIL import Image
from tqdm import tqdm
import tensorflow as tf
from sklearn.metrics import f1_score

### Load dataframe and separate it into train and test

In [2]:
all_df = pd.read_csv("participants_dataset/participants_dataset.csv", index_col=0)
all_df.head()

Unnamed: 0_level_0,filename,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,files/e2991a66-412d-4841-8dc0-524e38338a82 Lea...,Tomato_healthy
1,files/e9392a5e-2fac-48c5-a23d-df3aa3f61048 Lea...,Tomato_healthy
2,files/87a77f4d-38bd-42dc-bdd3-5f2a1fa95ca7 Lea...,Tomato_healthy
3,files/efe6c986-b85c-40f1-8cb5-345acbb36b71 057...,Tomato_healthy
4,files/cdf10741-0ed4-4a27-a2e9-8970e4426730 035...,Tomato_healthy


In [3]:
all_df.tail()

Unnamed: 0_level_0,filename,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
16263,files/7e6c8297-dfe1-4826-826e-01d696cc263b 186...,
16264,files/36bdc44c-96a4-40bb-86f7-63a91d706e96 420...,
16265,files/46b2555a-c0a6-48c2-af13-0816e147e4de 194...,
16266,files/d287be3d-cac6-4485-8c31-674fb8b459be 197...,
16267,files/111cd9d8-4d27-4225-be9e-d29b21cf56b9 541...,


In [4]:
train_df = all_df[pd.isnull(all_df["label"])==False]
test_df = all_df[pd.isnull(all_df["label"])]

### Define and train model on train_df

In [5]:
data_path = "participants_dataset/"
categories = ["Pepper__bell___Bacterial_spot", "Pepper__bell___healthy", "Potato___Early_blight", 
              "Potato___Late_blight", "Potato___healthy", "Tomato_Bacterial_spot", "Tomato_Early_blight",
              "Tomato_Late_blight", "Tomato_Leaf_Mold", "Tomato_Septoria_leaf_spot", 
              "Tomato_Spider_mites_Two_spotted_spider_mite", "Tomato__Target_Spot", 
              "Tomato__Tomato_YellowLeaf__Curl_Virus", "Tomato__Tomato_mosaic_virus", "Tomato_healthy"]

def oneHotEncoding(category, categories):
    res = np.zeros(len(categories))
    res[categories.index(category)] = 1
    return res

In [6]:
x_train = []
y_train = []
for i in tqdm(range(0, train_df.shape[0])):
    try:
        arr = Image.open(data_path+train_df["filename"].iloc[i])
        arr = np.asarray(arr, dtype=np.float32)/255
        x_train.append(arr)
        y_train.append(oneHotEncoding(train_df["label"][i], categories))
    except Exception:
        file = data_path+train_df["filename"].iloc[i]
        print(f"There was an error loading file {file}")
        pass

100%|██████████████████████████████████████| 8216/8216 [00:26<00:00, 310.97it/s]


In [7]:
x_train = np.array(x_train)
y_train = np.array(y_train)

In [8]:
shuffler = np.random.permutation(x_train.shape[0])
x_train = x_train[shuffler]
y_train = y_train[shuffler]

In [14]:
x_train.shape

(8216, 256, 256, 3)

In [9]:
inputShape = x_train[0].shape

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Conv2D(32, (3,3), padding="same", input_shape=inputShape, activation="relu"))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(5, 5)))
model.add(tf.keras.layers.Conv2D(64, (3,3), padding="same", input_shape=inputShape, activation="relu"))
model.add(tf.keras.layers.MaxPooling2D(pool_size=(3, 3)))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(1024))
model.add(tf.keras.layers.Dense(len(categories), activation="softmax"))
model.summary()

Model: "sequential"
_________________________________________________________________


2022-10-08 21:25:44.280125: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 256, 256, 32)      896       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 51, 51, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 51, 51, 64)        18496     
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 17, 17, 64)       0         
 2D)                                                             
                                                                 
 dropout (Dropout)           (None, 17, 17, 64)        0         
                                                                 
 flatten (Flatten)           (None, 18496)             0         
                                                                 
 dense (De

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005, beta_1=0.9, beta_2=0.999, epsilon=1, decay=0.0, amsgrad=False)
model.compile(loss="categorical_crossentropy", optimizer=optimizer,metrics=["accuracy"])

In [11]:
model.fit(x=x_train, y=y_train, epochs=40, batch_size=128)

Epoch 1/40

KeyboardInterrupt: 

### Score model based on training data

In [12]:
y_pred = model.predict(x_train)

In [13]:
y_pred_binary = np.zeros(y_train.shape)

In [14]:
for i in range(0, y_train.shape[0]):
    for j in range(0, y_train.shape[1]):
        if y_pred[i,j]==max(y_pred[i,:]):
            y_pred_binary[i,j] = 1
            break

In [15]:
y_pred_binary

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [16]:
f1_score(y_train, y_pred_binary, average="weighted")

0.7951539570383875

### Apply model on test data

In [17]:
x_test = []
for i in tqdm(range(0, test_df.shape[0])):
    try:
        arr = Image.open(data_path+test_df["filename"].iloc[i])
        arr = np.asarray(arr, dtype=np.float32)/255
        x_test.append(arr)
    except Exception:
        print(f"There was an error loading file {test_df['filename'].iloc[i]}")
        pass

100%|██████████████████████████████████████| 8052/8052 [00:26<00:00, 306.80it/s]


In [18]:
x_test = np.array(x_test)

In [19]:
y_pred = model.predict(x_test)
y_pred_binary = np.zeros(y_pred.shape)
for i in range(0, y_pred.shape[0]):
    for j in range(0, y_pred.shape[1]):
        if y_pred[i,j]==max(y_pred[i,:]):
            y_pred_binary[i,j] = 1
            break

In [21]:
class_predictions = []
for i in range(0, y_pred_binary.shape[0]):
    class_predictions.append(categories[np.argwhere(y_pred_binary[i]==1)[0,0]])

In [23]:
new_test_df = test_df.copy(deep=True)
new_test_df["label"] = class_predictions

In [24]:
new_test_df.to_csv("Plant_submission.csv")