In [2]:
%reset -f
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Activation, Flatten, Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping, ReduceLROnPlateau, CSVLogger
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.densenet import DenseNet121, DenseNet201
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import matplotlib.pyplot as plt
import pandas as pd
import cv2, os, os, git, glob, random
import numpy as np
from insectrec.utils import get_dataset, train_generator, valid_generator, augment_trainset
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
seed = 42
np.random.seed(seed)

repo = git.Repo('.', search_parent_directories=True)
created_data_path = f'{repo.working_tree_dir}/insectrec/created_data'
path_impy_crops_export = f'{created_data_path}/impy_crops_export/'
path_images_augmented = f'{created_data_path}/images_augmented/'

if not os.path.isdir(path_images_augmented):
    raise NotImplemented("Not expanded dataset yet!")

batch_size = 128
img_dim = 90
modelname = 'xtract'

In [3]:
# Creating le for encoding labels
le = LabelEncoder()

# Creating dataframe with all the original data (x: filenames, textlabels, y: nummerical labels)
df_orig = pd.DataFrame()
df_orig['x'] = pd.Series(glob.glob(f"{path_impy_crops_export}/*/*.jpg"))
df_orig['y_text'] = df_orig['x'].apply(lambda x: x.split('/')[-2])
df_orig['y'] = le.fit_transform(df_orig.y_text)

# Splitting into train/val/test
X_train, X_test, y_train, y_test = train_test_split(df_orig.x, df_orig.y, test_size=0.2, random_state=seed, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=seed, shuffle=True)

In [4]:
# Gathering info on augmented X_train data
df_aug = pd.DataFrame()
df_aug['x'] = pd.Series(glob.glob(f"{path_images_augmented}/*/*.jpg"))
df_aug['textlabels'] = df_aug['x'].apply(lambda x: x.split('/')[-2])
df_aug['y'] = le.fit_transform(df_aug.textlabels)

In [5]:
X_train_aug = df_aug.x.tolist()
y_train_aug = df_aug.y.tolist()

c = list(zip(X_train_aug, y_train_aug))
random.shuffle(c)
X_train_aug, y_train_aug = zip(*c)
X_train_aug = list(X_train_aug)
y_train_aug = list(y_train_aug)
X_val = X_val.tolist()
y_val = y_val.tolist()
X_test = X_test.tolist()
y_test = y_test.tolist()

In [59]:
base_model = DenseNet121(include_top=False, 
                         weights='imagenet', 
                        input_shape=(img_dim,img_dim,3)) 
x = base_model.output
predictions = GlobalAveragePooling2D()(x)
# x = Dense(128, activation='relu')(x)
# x = Dropout(0.5)(x)
# predictions = Dense(6, activation='relu')(x)
# predictions = Dense(6, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=predictions)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.layers[-1].output

<tf.Tensor 'global_average_pooling2d_4/Identity:0' shape=(None, 1024) dtype=float32>

## Extracting features for our training data

In [60]:
X_pred_aug = model.predict(valid_generator(X_train_aug, y_train_aug, batch_size=batch_size, nb_classes=6, img_dim=img_dim), 
                         steps= len(X_train_aug) / batch_size, 
                         verbose=1)



In [61]:
print(X_pred_aug.shape)

(48624, 1024)


## Using extracted features with XGBoost

In [62]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate
from xgboost import XGBClassifier
from joblib import dump, load
from sklearn.metrics import confusion_matrix, balanced_accuracy_score, classification_report, make_scorer, log_loss

In [63]:
parameters = {'nthread': [8],
              'gamma': [0.1, 0], 
              'learning_rate': [0.2, 0.3, 0.4], #so called `eta` value,
              'n_estimators': [500, 750, 1000],
             'n_jobs': [-1]}

gridsearch = GridSearchCV(XGBClassifier(), 
                   parameters, 
                   n_jobs=-1, 
                   cv=5, 
                   scoring=make_scorer(balanced_accuracy_score),
                   verbose=2, 
                   refit=True)

gridsearch.fit(X_pred_aug, y_train_aug)

print(gridsearch)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 327.5min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 1838.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': [0.1, 0], 'learning_rate': [0.2, 0.3, 0.4],
                         'n_estimators': [500, 750, 1000], 'n_jobs': [-1],
                         '

In [70]:
final_estimator = gridsearch.best_estimator_
final_estimator

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.4, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=-1,
              nthread=8, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [66]:
# cv_results = cross_validate(final_estimator, X_pred_aug, y_train_aug, cv=5, 
#                             return_estimator=True, 
#                             return_train_score=True, 
#                             scoring=make_scorer(balanced_accuracy_score),
#                             verbose=1, 
#                             n_jobs=-1) 

In [67]:
gridsearch

GridSearchCV(cv=5, error_score=nan,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, verbosity=1),
             iid='deprecated', n_jobs=-1,
             param_grid={'gamma': [0.1, 0], 'learning_rate': [0.2, 0.3, 0.4],
                         'n_estimators': [500, 750, 1000], 'n_jobs': [-1],
                         '

#### Extracting test features for prediction

In [68]:
X_test_pred = model.predict(valid_generator(X_test, y_test, batch_size=batch_size, nb_classes=6, img_dim=img_dim), 
                         steps= len(X_test) / batch_size, 
                         verbose=1)



In [69]:
balanced_accuracy_score(y_test,final_estimator.predict(X_test_pred))

0.6479034652026391

In [None]:
# using a relu size 6 in the end
## include_top, untrained 0.3344455669500479
# using global avg pooling in the end 0.6479034652026391
## include_top, imagenet  