# Building the Classifier

In [1]:
import pickle
import pandas as pd
import numpy as np
from os.path import join

from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import StandardScaler

## 1. For a single region

1.1 Read saved feature matrix and corresponding labels

In [16]:
region = 'borde_rural'

pickle_path = join('..', '..', 'pickles')
with open(join(pickle_path, 'resnet50_avg_retrained_features_' + region + '_train.pkl'), 'rb') as f:
    df_train = pickle.load(f)

1.2 Convert the features into a matrix and the labels into a list

In [17]:
feature_matrix = df_train['features'].to_numpy()
feature_matrix = np.column_stack(feature_matrix).transpose()

labels = df_train['label'].to_numpy().astype('int')

1.3 Split data into train and validation set

In [18]:
features_train, features_test, labels_train, labels_test = train_test_split(feature_matrix,
                                                                            labels,
                                                                            test_size=0.33,
                                                                            random_state=43)

In [None]:
scaler = StandardScaler()
scaler.fit(feature_matrix)
features_train_scaled = scaler.transform(features_train)
features_test_scaled = scaler.transform(features_test)

1.3. Train a classifier on the training set and save it to disk

In [None]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr')
clf.fit(features_train_scaled, labels_train)

#with open(join(pickle_path, 'classifier_densenet201_max.pkl' ), 'wb') as f:
#    pickle.dump(clf, f)

1.4. Predict labels on the validation set according to the classifier

In [20]:
predicted_labels = clf.predict(features_test_scaled)

1.5. Calculate and plot (to do) the confusion matrix

In [21]:
cm = confusion_matrix(labels_test, predicted_labels)
print(cm)

[[ 41  12   3   6   0]
 [  2 671   3  94   0]
 [  6   4  36  35   0]
 [  3 136  17 366   0]
 [  1   1   0   0   0]]


In [22]:
pred_probas = clf.predict_proba(features_test)

for i in range(len(predicted_labels)):
    print(pred_probas[i], labels_test[i])

[2.94152779e-04 9.75224083e-01 2.30300876e-04 2.39673229e-02
 2.84139977e-04] 1
[0.00635455 0.18547294 0.03931638 0.7670976  0.00175853] 3
[0.06348166 0.0377846  0.60078586 0.28721012 0.01073776] 2
[5.16710259e-04 8.40367891e-01 2.81147229e-03 1.19027585e-01
 3.72763414e-02] 1
[1.15201630e-03 8.97998951e-01 7.72499902e-04 9.97822114e-02
 2.94321529e-04] 1
[0.70442885 0.02282522 0.23789693 0.03130311 0.00354589] 0
[0.05476056 0.02392315 0.39356647 0.52567756 0.00207227] 2
[2.39135656e-04 9.46708109e-01 5.30901050e-04 5.07489277e-02
 1.77292662e-03] 3
[0.0178599  0.46086913 0.05091284 0.46966975 0.00068838] 3
[4.94319781e-03 3.40867149e-01 1.31882153e-02 6.40466428e-01
 5.35009912e-04] 3
[8.37913398e-04 5.75328408e-02 9.27293935e-03 9.32320235e-01
 3.60709538e-05] 3
[5.27729429e-04 6.99592882e-01 8.04220808e-03 2.91450431e-01
 3.86749318e-04] 1
[1.13821024e-03 9.43908982e-01 1.27207215e-03 5.29816800e-02
 6.99055994e-04] 1
[1.28967553e-03 6.99946259e-01 9.71646832e-04 2.97505152e-01
 2.8

[9.26637120e-03 1.73500225e-01 3.25653515e-02 7.84273710e-01
 3.94341915e-04] 3
[3.45330049e-04 9.52208031e-01 4.89076426e-03 4.23737273e-02
 1.82147215e-04] 1
[4.10286446e-03 1.75252643e-02 2.00782443e-02 9.57851359e-01
 4.42268275e-04] 3
[1.23231110e-03 3.10200740e-02 1.09068900e-01 8.58601044e-01
 7.76705917e-05] 3
[0.00771996 0.19583616 0.02272062 0.77281716 0.0009061 ] 3
[3.58226067e-04 1.74309351e-01 1.01382182e-02 8.13262527e-01
 1.93167740e-03] 2
[0.00180264 0.47941011 0.00111387 0.51673341 0.00093996] 1
[0.01012612 0.13064351 0.04542203 0.81190789 0.00190046] 3
[8.07862572e-03 2.81004069e-02 8.23007399e-02 8.80890991e-01
 6.29236389e-04] 3
[1.32977366e-02 6.39450325e-01 8.77659428e-03 3.38190586e-01
 2.84758730e-04] 1
[4.86596546e-04 3.04403076e-01 1.64443357e-03 6.93239235e-01
 2.26659193e-04] 2
[2.77682492e-03 2.79055582e-01 7.59988855e-03 7.10098062e-01
 4.69642513e-04] 1
[9.23649294e-02 2.08868308e-02 6.80081393e-01 2.06297184e-01
 3.69663402e-04] 2
[2.45487310e-03 3.99333

[0.04461037 0.25197807 0.07937096 0.62129058 0.00275002] 3
[0.73510933 0.0920584  0.10596799 0.05332426 0.01354002] 0
[1.29607764e-02 7.26198941e-01 4.21007590e-03 2.55945051e-01
 6.85154852e-04] 3
[1.85274376e-04 9.37172331e-01 8.13088306e-04 2.94307048e-02
 3.23986018e-02] 1
[0.0074065  0.0666997  0.32279193 0.60129365 0.00180822] 2
[3.68010963e-04 3.62173041e-01 3.44094632e-03 6.33677979e-01
 3.40021877e-04] 3
[2.89088721e-04 5.80606360e-01 2.36307426e-03 4.13758532e-01
 2.98294553e-03] 3
[6.90261923e-04 8.06561678e-01 1.39145186e-03 1.91284734e-01
 7.18746444e-05] 1
[2.55452918e-04 9.77305474e-01 4.55433906e-04 2.18357102e-02
 1.47928536e-04] 1
[0.00154999 0.72488085 0.04106602 0.23046405 0.00203909] 1
[3.86191380e-03 2.20763978e-01 1.38151982e-03 7.73755812e-01
 2.36776442e-04] 3
[0.48306731 0.29133762 0.01248847 0.21235088 0.00075572] 1
[0.03362337 0.36027762 0.14577787 0.45361913 0.00670201] 0
[0.00409923 0.89772742 0.01918469 0.07632344 0.00266522] 1
[0.00250075 0.07591194 0.01

 7.09501480e-05] 1
[1.08949052e-04 9.83209767e-01 8.37533719e-04 1.54831165e-02
 3.60633889e-04] 1
[3.72180644e-04 7.15343760e-01 2.43384569e-03 2.81290546e-01
 5.59668124e-04] 1
[0.00098734 0.33721452 0.00407747 0.65672947 0.00099121] 3
[0.0090879  0.05603352 0.04565934 0.88359877 0.00562047] 3
[5.59683707e-05 9.89200234e-01 2.72980206e-04 1.04360824e-02
 3.47346732e-05] 1
[3.31107541e-03 2.56651794e-01 3.35185833e-02 7.05995152e-01
 5.23395037e-04] 3
[0.09208699 0.70003498 0.01689067 0.18891602 0.00207133] 1
[5.98006599e-04 9.65671760e-01 3.91069240e-03 2.95875908e-02
 2.31950461e-04] 1
[0.00339241 0.09418016 0.01857082 0.88228203 0.00157458] 3
[0.00129579 0.42674312 0.04223376 0.52907096 0.00065638] 3
[0.00410797 0.07194911 0.15882919 0.76292962 0.00218411] 3
[0.39116422 0.2962451  0.07147088 0.23775738 0.00336243] 0
[4.53725677e-04 1.15466143e-01 1.65096776e-03 8.81323922e-01
 1.10524152e-03] 3
[0.00173901 0.1848254  0.00768089 0.80480044 0.00095427] 3
[1.59226561e-03 2.54650562e-0

## 2. For a set of regions

2.1 Load all available training features into one big dataframe

In [4]:
regions = ['borde_rural', 'borde_soacha', 'mixco_3', 'mixco_1_and_ebenezer', 'dennery']

In [5]:
columns = ['id', 'features', 'label']
df_train_global = pd.DataFrame(columns=columns)

pickle_path = join('..', '..', 'pickles')
for region in regions:
    try:
        with open(join(pickle_path, 'resnet50_avg_retrained_features_' + region + '_train.pkl'), 'rb') as f:
            df_train = pickle.load(f)
            df_train_global = pd.concat([df_train_global, df_train], ignore_index=True)
    except:
        print("Error reading training data for region ", region)
    

In [6]:
df_train_global.head()

Unnamed: 0,id,features,label
0,7a37c004,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.955824, ...",0
1,7a43cf34,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.2254369,...",0
2,7a2beb9e,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.5360031,...",0
3,7a30cd44,"[0.0, 0.0, 0.0, 0.9002078, 0.0, 0.0, 0.1847459...",0
4,7a26b840,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


In [7]:
len(df_train_global.features[13000])

2048

2.2 Expand the dataframe such that instead of a list of features, every feature has its one column

In [10]:
#features_train = df_train_global.features.apply(lambda features: pd.Series(list(features)))
#features_train

#features_train = df_train_global.features.apply(lambda x: pd.Series(list(x)))
#features_train = features_train.astype('double')

feature_matrix_global = df_train_global['features'].to_numpy()
feature_matrix_global = np.column_stack(feature_matrix_global).transpose()

labels_global = df_train_global['label'].to_numpy().astype('int')

In [16]:
scaler = StandardScaler()
scaler.fit(feature_matrix_global)
feature_matrix_scaled = scaler.transform(feature_matrix_global)

In [11]:
print(feature_matrix_global.shape)
print(labels_global.shape)

(14853, 2048)
(14853,)


## 3. Train an SVM Classifier

In [None]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr', verbose=1)
clf.fit(feature_matrix_global, labels_global)

[LibSVM]

In [None]:
with open(join(pickle_path, 'classifier_resnet50_avg_retrained_all_scaled.pkl' ), 'wb') as f:
    pickle.dump(clf, f)

## 4. Train a Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=100, max_depth=32, random_state=0)
clf.fit(feature_matrix_global, labels_global) 