# Exploratory Data Analysis & Modeling

In [None]:
from radiant_mlhub import Dataset, Collection, client, get_session
import tarfile
from pathlib import Path
import os
import shutil
from io import BytesIO
from glob import glob
from tqdm.notebook import tqdm
import tifffile as tiff
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL as pil
import json
import seaborn as sns

## Load Data

In [None]:
# Main dataset
dataset_df = pd.read_json("dataset_df.json")

# View first five rows
dataset_df.head()

In [None]:
# Dataframe containing the keys
key_df = pd.read_json("key_df.json")

# View first five rows
key_df.head()

In [None]:
# Absolute location of the local data folder, kept both as a Path object and as a plain string
output_path = Path("./data/").resolve()
output_path_str = str(output_path)

In [None]:
# Access to the data on local system
data_root=f"{output_path}/Images" 

In [None]:
# List the entries found directly under the Images folder (one per class) and show them
selectedClasses = os.listdir(data_root)
print(selectedClasses)

In [None]:
# How many images are in the dataset
print("Total images in the dataset: ", len(dataset_df))

In [None]:
# Bar chart of how many images carry each label (for reference only)
ac_count = dataset_df["label"].value_counts()
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=ac_count.index, y=ac_count.values)
ax.set_title("Images count for each label category", fontsize=16)
ax.set_xlabel("Label", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
plt.show()

In [None]:
import keras
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf #deep learning


### Plot some sample images before any data augmentation

In [None]:
DATA_DIR = str(data_root) # data directory

In [None]:
# Generator with no augmentation configured, used only to preview images from the class folders
datagen_plot = ImageDataGenerator()
generator_plot = datagen_plot.flow_from_directory(
    directory=DATA_DIR,
    target_size=(224, 224),  # preview size (resized for better visualisation clarity)
    shuffle=True,  # randomize the order in which images are drawn
    subset="training",
)

In [None]:
#function for plotting images with labels
def plots(ims, figsize = (10,10), rows=4, interp=False, titles=None, maxNum = 10):
    """Draw up to ``maxNum`` images on a grid with ``rows`` rows.

    Parameters
    ----------
    ims : sequence of images
        If the first element is a NumPy array, the whole batch is cast to
        uint8; a 4-D batch whose last axis is not 3 is transposed from
        (N, C, H, W) to (N, H, W, C) before display.
    figsize : tuple
        Size passed to ``plt.figure``.
    rows : int
        Number of grid rows; the column count is derived from it.
    interp : bool
        If True use matplotlib's default interpolation, otherwise 'none'.
    titles : sequence or None
        Optional per-image titles, indexed like ``ims``.
    maxNum : int
        Upper bound on the number of images drawn.
    """
    if isinstance(ims[0], np.ndarray):
        ims = np.array(ims).astype(np.uint8)
        if ims.ndim == 4 and ims.shape[-1] != 3:
            ims = ims.transpose((0, 2, 3, 1))

    # Never ask for more panels than there are images.
    numShown = min(maxNum, len(ims))
    # Ceiling division so rows * cols always has room for every panel.
    cols = max(1, -(-numShown // rows))

    f = plt.figure(figsize=figsize)
    for i in range(numShown):
        sp = f.add_subplot(rows, cols, i+1)
        sp.axis('Off')
        if titles is not None:
            sp.set_title(titles[i], fontsize=12)
        # Draw on the subplot just created rather than on pyplot's "current" axes.
        sp.imshow(ims[i], interpolation = None if interp else 'none')

In [None]:
generator_plot.reset()
# Use the builtin next(): it works on iterators that no longer expose a .next() method.
imgs, labels = next(generator_plot) # images to plot

# including labels
# class_indices maps class name -> column index; invert it once for direct lookup
indexToName = {labelIndex: labelName
               for labelName, labelIndex in generator_plot.class_indices.items()}
# position of the largest entry in each label row (the 1 of a one-hot row)
labelIndices = [np.argmax(r) for r in labels]
labelNames = [indexToName[ind] for ind in labelIndices]

In [None]:
plots(imgs, rows=1, titles = labelNames, maxNum=8)


### Creating data matrices

In [None]:
#reading images from the local Images folder; reuse data_root (built from the
#resolved ./data directory) so the notebook does not depend on one user's absolute path
dataPath = str(data_root)

#array of unique labels (one entry per item in the Images folder)
labelList = os.listdir(dataPath)

#read images
numClass = len(labelList)

#number of files found in each class folder, in labelList order
lenClass = np.zeros(numClass)
for i, className in enumerate(labelList):
    lenClass[i] = len(os.listdir(dataPath + '/' + className))
#returns the number of images in each class
lenClass

In [None]:
#returns the total number of images
numImg = int(lenClass.sum())
numImg

In [None]:
#plotting one image
import PIL.Image  # importing the submodule guarantees PIL.Image is available

# The class is chosen explicitly (last entry of labelList) so this cell does not
# depend on the current value of any loop variable.
sampleClass = labelList[-1]
sampleFiles = os.listdir(dataPath + '/' + sampleClass)
# Any index from 0 to len(sampleFiles) - 1 is valid; 29 is just an arbitrary example.
sampleIdx = 29
imgSel = dataPath + '/' + sampleClass + '/' + sampleFiles[sampleIdx]
img = PIL.Image.open(imgSel, 'r')
plt.imshow(np.asarray(img))

In [None]:
np.asarray(img).shape
#200x200 and 3 color channels - does it need to be 224x224?

In [None]:
#returns total number of pixels per each image
numPixels = np.prod(np.asarray(img).shape)
numPixels

In [None]:
# Feature matrix X: one row per image, one column per flattened pixel/channel value
X = np.zeros(shape=(numImg, numPixels))

# Numeric label vector y: one entry per image
y = np.zeros(shape=numImg)

In [None]:
# Lookup from text label to its numeric index (its position in labelList)
class_dict = {labelText: labelNum for labelNum, labelText in enumerate(labelList)}
class_dict

In [None]:
# Fill X and y one image at a time, walking the class folders in labelList order.
# imgInd is the running row index into X / y.
imgInd = 0
for i, className in enumerate(labelList):
    classDir = dataPath + '/' + className
    for imgName in os.listdir(classDir):
        img = PIL.Image.open(classDir + '/' + imgName, 'r')
        imgVec = np.asarray(img).flatten()  # all pixel/channel values as one vector
        X[imgInd, :] = imgVec
        y[imgInd] = i  # numeric label = position of the class in labelList
        imgInd += 1
        print('Read img class ' + className + ' no ' + str(imgInd))

In [None]:
#shape of X before splitting
X.shape

In [None]:
#shape of y before splitting
y.shape

### KNN Model

In [None]:
# Hold out 20% of the rows for testing; stratify on y so every class keeps
# the same share in both the training and the test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
# Defining knn classifier
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=5) # Start with 5 neighbors just to get baseline accuracy

In [None]:
# Scale and transform
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Learn the per-feature mean/variance from the training rows only, then apply
# those SAME statistics to the test rows. Refitting on the test set would scale
# the two sets differently and let test data influence preprocessing.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Train the model
knn_clf.fit(X_train, y_train)

In [None]:
# Apply the model
knn_preds = knn_clf.predict(X_test)

In [None]:
# Evaluate model
from sklearn.metrics import accuracy_score
try:
    # plot_confusion_matrix was removed in scikit-learn 1.2; import it separately
    # so its absence cannot prevent accuracy_score from being imported.
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # Nothing in this notebook calls it; the confusion matrices are drawn by hand.
    pass
accuracy_score(y_test, knn_preds)

In [None]:
# Optimize parameters -- NOTE: this takes about 10 min to run
# Source: https://machinelearningknowledge.ai/knn-classifier-in-sklearn-using-gridsearchcv-with-example/
from sklearn.model_selection import GridSearchCV

# Candidate neighbour counts: every k from 1 to 30
k_range = [*range(1, 31)]
param_grid_knn = {"n_neighbors": k_range}

# 10-fold cross-validated search scored on accuracy
grid_knn = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid_knn,
    cv=10,
    scoring="accuracy",
    return_train_score=False,
    verbose=1,
)

# Run the search on the training data
grid_search_knn = grid_knn.fit(X_train, y_train)

In [None]:
#print best parameters
print(grid_search_knn.best_params_)

#10 folds for 30 candidates range(1,31) defines 27 neighbors as optimal

In [None]:
# Now that we have best params, run again

# Defining knn classifier with the neighbour count reported by the grid search.
# NOTE(review): 27 is hard-coded, so it will not follow grid_search_knn.best_params_
# if the search is re-run on different data -- confirm it still matches.
knn_clf_best = KNeighborsClassifier(n_neighbors=27) #using 27 from gridSearchCV

In [None]:
# Train the model
knn_clf_best.fit(X_train, y_train)

In [None]:
# Apply the model
knn_preds_best = knn_clf_best.predict(X_test)

In [None]:
# Evaluate model
accuracy_score(y_test, knn_preds_best)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Headline accuracy followed by the per-class report for the tuned KNN model
knn_accuracy = accuracy_score(y_test, knn_preds_best)
print("KNN accuracy score: ", knn_accuracy)
knn_report = classification_report(y_test, knn_preds_best)
print("KNN classification report \n", knn_report)

# Raw confusion counts, with rows/columns ordered like the classifier's classes_
knn_cm = confusion_matrix(y_true=y_test, y_pred=knn_preds_best, labels=knn_clf_best.classes_)

In [None]:
# Plot the confusion matrix
%matplotlib inline
cm = knn_cm.astype('float') / knn_cm.sum(axis=1)[:, np.newaxis]
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(im, ax=ax)

# Show all ticks
ax.set(xticks=np.arange(cm.shape[1]),
       yticks=np.arange(cm.shape[0]),
       # and label them with the respective list entries
       xticklabels=labelList, yticklabels=labelList,
       title='KNN Model: Normalized Confusion Matrix',
       ylabel='True label',
       xlabel='Predicted label')

# Loop over data dimensions and create text annotations
fmt = '.2f'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()

### Random Forest

In [None]:
# Defining rf classifier
from sklearn.ensemble import RandomForestClassifier
# Baseline forest: 100 trees, each limited to depth 10.
# NOTE(review): max_features=1 restricts every split to a single candidate feature,
# which looks very restrictive for flattened-pixel inputs -- confirm this is intended.
rf_clf = RandomForestClassifier(max_depth=10, n_estimators=100, max_features=1)

In [None]:
# Train the model
rf_clf.fit(X_train, y_train)

In [None]:
# Apply the model
rf_preds = rf_clf.predict(X_test)

In [None]:
# Evaluate model
accuracy_score(y_test, rf_preds)

In [None]:
# Optimize parameters - should only take <5 min to run currently
# Source: https://stackoverflow.com/questions/30102973/how-to-get-best-estimator-on-gridsearchcv-random-forest-classifier-scikit
# Candidate tree depths and forest sizes to try (3 x 3 combinations)
param_grid_rf = dict(
    max_depth=[5, 10, 15],
    n_estimators=[200, 700, 900],
)

# 10-fold cross-validated search scored on accuracy
grid_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid_rf,
    cv=10,
    scoring="accuracy",
    return_train_score=False,
    verbose=1,
)

# Run the search on the training data
grid_search_rf = grid_rf.fit(X_train, y_train)

In [None]:
# Print best parameters
print(grid_search_rf.best_params_)

In [None]:
# Defining rf classifier with best params
rf_clf_best = RandomForestClassifier(max_depth=15, n_estimators=700, max_features=1)

In [None]:
# Train the model
rf_clf_best.fit(X_train, y_train)

In [None]:
# Apply the model
rf_preds_best = rf_clf_best.predict(X_test)

In [None]:
# Evaluate model
accuracy_score(y_test, rf_preds_best)

In [None]:
# Headline accuracy followed by the per-class report for the tuned random forest
rf_accuracy = accuracy_score(y_test, rf_preds_best)
print("RF accuracy score: ", rf_accuracy)
rf_report = classification_report(y_test, rf_preds_best)
print("RF classification report \n", rf_report)

# Raw confusion counts, with rows/columns ordered like the classifier's classes_
rf_cm = confusion_matrix(y_true=y_test, y_pred=rf_preds_best, labels=rf_clf_best.classes_)

In [None]:
# Plot the confusion matrix
%matplotlib inline
cm = rf_cm.astype('float') / rf_cm.sum(axis=1)[:, np.newaxis]
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(im, ax=ax)

# Show all ticks
ax.set(xticks=np.arange(cm.shape[1]),
       yticks=np.arange(cm.shape[0]),
       # and label them with the respective list entries
       xticklabels=labelList, yticklabels=labelList,
       title='Random Forest Model: Normalized Confusion Matrix',
       ylabel='True label',
       xlabel='Predicted label')

# Loop over data dimensions and create text annotations
fmt = '.2f'
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")
fig.tight_layout()

## CNN 1st attempt

In [None]:
#reading in images and retaining rgb
import glob
import os

# Collect every .png file that sits exactly one folder below dataPath
pngPattern = dataPath + os.sep + "*" + os.sep + "*.png"
img_files = []
img_files.extend(glob.glob(pngPattern))

In [None]:
#load rgb images
from PIL import Image

imgs_rgb = []

for imgName in img_files:
    # No image-reading `io` module is imported in this notebook (only BytesIO is taken
    # from the standard io module), so decode with Pillow, which the notebook already
    # uses when building X. The with-block closes each file once it has been read.
    with Image.open(imgName) as pngFile:
        imgs_rgb.append(np.asarray(pngFile))

In [None]:
#convert to array
RGBimages = np.stack(imgs_rgb)

In [None]:
#check shape
RGBimages.shape

In [None]:
#split data
from sklearn.model_selection import train_test_split
# Half of the images are held out (test_size=0.5), stratified on the class labels.
# NOTE(review): y was filled while looping over os.listdir(), whereas RGBimages follows
# the glob order of img_files -- this split assumes both orders (and counts) match; confirm.
X_train, X_test, y_train, y_test = train_test_split(RGBimages,y,test_size=0.5, random_state=42, stratify=y)

In [None]:
#scale and transform
from sklearn.preprocessing import MinMaxScaler
scalar = MinMaxScaler()

# The scaler works on 2-D input, so flatten each image to one row, scale,
# then restore the original array shape. Min/max come from the training images only.
trainShape, testShape = X_train.shape, X_test.shape
flatTrain = X_train.reshape(trainShape[0], -1)
scalar.fit(flatTrain)
X_train = scalar.transform(flatTrain).reshape(trainShape)
flatTest = X_test.reshape(testShape[0], -1)
X_test = scalar.transform(flatTest).reshape(testShape)

In [None]:
#set global params
batch_size = 64
epochs = 12
input_shape =(200,200,3)

In [None]:
import tensorflow as tf 
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from tensorflow.keras.layers import BatchNormalization

In [None]:
# Single convolution block followed by a softmax classifier over numClass classes
cnn1 = Sequential([
    # convolution -> batch norm -> ReLU -> 2x2 max pooling
    Conv2D(64, kernel_size=(3, 3), strides=(1, 1), input_shape=input_shape),
    BatchNormalization(),
    Activation('relu'),
    MaxPooling2D((2, 2)),
    # dropout before the classifier head
    Dropout(0.25),
    # flatten the feature maps and map them to one probability per class
    Flatten(),
    Dense(numClass, activation='softmax'),
])

cnn1.summary()

In [None]:
cnn1.compile(loss=keras.losses.sparse_categorical_crossentropy,
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

In [None]:
# Train for `epochs` passes over the training split in mini-batches of `batch_size`.
# NOTE(review): the held-out test split is also passed as validation data, so it is
# already monitored during training -- confirm a separate validation split is not wanted.
cnn1.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(X_test, y_test))

In [None]:
score = cnn1.evaluate(X_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])