# Building the Classifier

In [11]:
import pickle
import pandas as pd
import numpy as np
from os.path import join

from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

## 1. For a single region

1.1 Read saved feature matrix and corresponding labels

In [2]:
region = 'mixco_3'

pickle_path = join('..', '..', 'pickles')
with open(join(pickle_path, 'resnet50_avg_features_' + region + '_train.pkl'), 'rb') as f:
    df_train = pickle.load(f)

### 1.2 Convert the features into a matrix and the labels into a list

In [None]:
feature_matrix = df_train['features'].to_numpy()
feature_matrix = np.column_stack(feature_matrix).transpose()

labels = df_train['label'].to_numpy().astype('int')

scaler = StandardScaler()
scaler.fit(feature_matrix)
feature_matrix_scaled = scaler.transform(feature_matrix)

1.3 Split data into train and validation set

In [None]:
features_train, features_test, labels_train, labels_test = train_test_split(feature_matrix_scaled,
                                                                            labels,
                                                                            test_size=0.33,
                                                                            random_state=43)

1.3. Train a classifier on the training set and save it to disk

In [None]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr')
clf.fit(features_train, labels_train)

#with open(join(pickle_path, 'classifier_densenet201_max.pkl' ), 'wb') as f:
#    pickle.dump(clf, f)

1.4. Predict labels on the validation set according to the classifier

In [None]:
predicted_labels = clf.predict(features_test)

1.5. Calculate and plot (to do) the confusion matrix

In [None]:
cm = confusion_matrix(labels_test, predicted_labels)
print(cm)

Train a RF to assess feature importance

In [None]:

forest = RandomForestClassifier(n_estimators=50, max_depth=16, random_state=0)
forest.fit(feature_matrix, labels) 

In [None]:
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1][:1]

In [None]:
feature_matrix_selected = feature_matrix[:, indices]

In [None]:
indices

In [None]:
feature_matrix_selected

In [None]:
features_train, features_test, labels_train, labels_test = train_test_split(feature_matrix_selected,
                                                                            labels,
                                                                            test_size=0.33,
                                                                            random_state=43)

clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr')
clf.fit(features_train, labels_train)

predicted_labels = clf.predict(features_test)
cm = confusion_matrix(labels_test, predicted_labels)
print(cm)

In [None]:
pred_probas = clf.predict_proba(features_test)

for i in range(len(predicted_labels)):
    print(pred_probas[i], labels_test[i])

## 2. For a set of regions

2.1 Load all available training features into one big dataframe

In [3]:
regions = ['borde_rural', 'borde_soacha', 'mixco_3', 'mixco_1_and_ebenezer', 'dennery']

In [4]:
columns = ['id', 'features', 'label']
df_train_global = pd.DataFrame(columns=columns)

pickle_path = join('..', '..', 'pickles')
for region in regions:
    try:
        with open(join(pickle_path, 'resnet50_avg_features_' + region + '_train.pkl'), 'rb') as f:
            df_train = pickle.load(f)
            df_train_global = pd.concat([df_train_global, df_train], ignore_index=True)
    except:
        print("Error reading training data for region ", region)
    

In [5]:
df_train_global.head()

Unnamed: 0,id,features,label
0,7a37c004,"[0.2500773, 1.10179, 0.37730777, 0.0062653143,...",0
1,7a43cf34,"[0.63164663, 0.5006275, 0.29867664, 0.00472470...",0
2,7a2beb9e,"[0.0, 1.5943071, 0.47893494, 0.033646796, 0.0,...",0
3,7a30cd44,"[0.033745307, 0.59142905, 0.051605936, 0.09569...",0
4,7a26b840,"[0.22346738, 1.1884235, 0.009053887, 0.0141087...",0


In [6]:
len(df_train_global.features[13000])

2048

2.2 Expand the dataframe such that instead of a list of features, every feature has its one column

In [7]:
#features_train = df_train_global.features.apply(lambda features: pd.Series(list(features)))
#features_train

#features_train = df_train_global.features.apply(lambda x: pd.Series(list(x)))
#features_train = features_train.astype('double')

feature_matrix_global = df_train_global['features'].to_numpy()
feature_matrix_global = np.column_stack(feature_matrix_global).transpose()

labels_global = df_train_global['label'].to_numpy().astype('int')

In [8]:
scaler = StandardScaler()
scaler.fit(feature_matrix_global)
feature_matrix_scaled = scaler.transform(feature_matrix_global)

In [9]:
print(feature_matrix_global.shape)
print(labels_global.shape)

(14853, 2048)
(14853,)


## 3. Train an SVM Classifier

In [None]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr', verbose=1)
clf.fit(feature_matrix_global, labels_global)

In [None]:
#with open(join(pickle_path, 'classifier_resnet50_avg_retrained_all_scaled.pkl' ), 'wb') as f:
#    pickle.dump(clf, f)

## 4. Train a Random Forest Model

In [21]:
forest = RandomForestClassifier(n_estimators=100, max_depth=16, random_state=0, n_jobs=6)
forest.fit(feature_matrix_global, labels_global) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=16, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=6,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [24]:
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1][:200]
feature_matrix_selected = feature_matrix_global[:, indices]

In [25]:
features_train, features_test, labels_train, labels_test = train_test_split(feature_matrix_selected,
                                                                            labels_global,
                                                                            test_size=0.33,
                                                                            random_state=43)

clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr')
clf.fit(features_train, labels_train)

predicted_labels = clf.predict(features_test)
cm = confusion_matrix(labels_test, predicted_labels)
print(cm)

[[ 354   47   12   39    2]
 [  54 2011   12  329    9]
 [  26   26   74  102    0]
 [  37  363   70 1268    7]
 [   6   22    0   12   20]]


In [26]:
materials = {'concrete_cement':0, 'healthy_metal':1, 'incomplete':2, 'irregular_metal':3, 'other':4}

In [29]:
import matplotlib.pyplot as plt
import utils

row_sums = cm.sum(axis=1, keepdims=True)
cm_norm = cm / row_sums

fig, ax = plt.subplots(figsize=(8, 8))
im, cbar = utils.heatmap(cm_norm,  materials.keys(), materials.keys(), ax=ax,
                   cmap="YlGn")
texts = utils.annotate_heatmap(im)

fig.tight_layout()
plt.show()

ModuleNotFoundError: No module named 'utils'