# Building the Classifier

In [1]:
import pickle
import pandas as pd
import numpy as np
from os.path import join

from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import StandardScaler

## 1. For a single region

1.1 Read saved feature matrix and corresponding labels

In [10]:
region = 'borde_rural'

pickle_path = join('..', '..', 'pickles')
with open(join(pickle_path, 'resnet50_avg_features_' + region + '_train.pkl'), 'rb') as f:
    df_train = pickle.load(f)

1.2 Convert the features into a matrix and the labels into a list

In [11]:
feature_matrix = df_train['features'].to_numpy()
feature_matrix = np.column_stack(feature_matrix).transpose()

labels = df_train['label'].to_numpy().astype('int')

1.3 Split data into train and validation set

In [12]:
features_train, features_test, labels_train, labels_test = train_test_split(feature_matrix,
                                                                            labels,
                                                                            test_size=0.33,
                                                                            random_state=43)

In [13]:
scaler = StandardScaler()
scaler.fit(feature_matrix)
features_train_scaled = scaler.transform(features_train)
features_test_scaled = scaler.transform(features_test)

1.3. Train a classifier on the training set and save it to disk

In [14]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr')
clf.fit(features_train_scaled, labels_train)

#with open(join(pickle_path, 'classifier_densenet201_max.pkl' ), 'wb') as f:
#    pickle.dump(clf, f)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

1.4. Predict labels on the validation set according to the classifier

In [15]:
predicted_labels = clf.predict(features_test_scaled)

1.5. Calculate and plot (to do) the confusion matrix

In [16]:
cm = confusion_matrix(labels_test, predicted_labels)
print(cm)

[[ 38  16   2   6   0]
 [  1 675   0  94   0]
 [  7  13  23  38   0]
 [  3 123   8 388   0]
 [  0   2   0   0   0]]


In [9]:
pred_probas = clf.predict_proba(features_test)

for i in range(len(predicted_labels)):
    print(pred_probas[i], labels_test[i])

[1.46850342e-04 9.82936985e-01 1.24599440e-04 1.61547724e-02
 6.36793089e-04] 1
[4.07339853e-04 8.20514935e-01 1.67431754e-03 1.76573785e-01
 8.29621977e-04] 3
[0.00157439 0.70725758 0.01217722 0.27679848 0.00219233] 2
[1.59530305e-04 9.47786410e-01 5.27353928e-04 4.90188166e-02
 2.50788875e-03] 1
[4.31829545e-04 9.69777265e-01 4.03436802e-04 2.89184826e-02
 4.68985818e-04] 1
[0.14496779 0.56611253 0.01981013 0.26333518 0.00577436] 0
[0.00253214 0.19146692 0.01186554 0.79328951 0.00084588] 2
[1.73800366e-04 9.77514021e-01 3.05273698e-04 2.06553732e-02
 1.35153180e-03] 3
[9.27783740e-04 9.28704926e-01 9.04844951e-04 6.86931643e-02
 7.69280869e-04] 3
[1.05780825e-04 8.50317532e-01 2.13031493e-04 1.48551440e-01
 8.12216458e-04] 3
[1.83542723e-04 5.76226190e-01 2.19253454e-04 4.23049342e-01
 3.21671252e-04] 3
[4.28494694e-04 9.16152794e-01 9.94466964e-04 8.17284825e-02
 6.95762272e-04] 1
[2.90943276e-04 9.73511346e-01 3.60348597e-04 2.54435671e-02
 3.93795054e-04] 1
[2.71543845e-04 9.38538

[2.45412409e-04 7.28168300e-01 1.18691087e-03 2.69022783e-01
 1.37659421e-03] 2
[1.05420141e-04 8.04463225e-01 1.30539396e-04 1.94446664e-01
 8.54151536e-04] 1
[5.73186147e-04 5.91700325e-01 1.11862122e-03 4.06029810e-01
 5.78057733e-04] 3
[0.00091342 0.65749261 0.00541478 0.33518571 0.00099347] 3
[2.13201866e-04 9.60451934e-01 1.19294374e-04 3.88115386e-02
 4.04031382e-04] 1
[2.74421063e-04 7.96907855e-01 3.82191651e-04 2.01826866e-01
 6.08665782e-04] 2
[3.03938307e-04 9.10300941e-01 5.54335008e-04 8.81519579e-02
 6.88827914e-04] 1
[2.97189573e-03 1.53765080e-01 1.71507747e-02 8.25763596e-01
 3.48654250e-04] 2
[3.75633355e-04 4.10137618e-01 1.45661493e-03 5.87430922e-01
 5.99211612e-04] 3
[1.37493922e-04 7.65449777e-01 7.04720335e-04 2.32929359e-01
 7.78649559e-04] 3
[6.13068137e-05 8.57350583e-01 1.55977728e-04 1.42119533e-01
 3.12599452e-04] 3
[1.53145393e-04 9.63151518e-01 1.13653059e-04 3.62043638e-02
 3.77319675e-04] 3
[1.05613507e-04 9.83941002e-01 1.87620877e-04 1.52694899e-02


 6.02294968e-04] 1
[1.33096222e-04 8.04103126e-01 1.71573192e-04 1.94873413e-01
 7.18791839e-04] 1
[5.44038665e-04 2.69771166e-01 1.35366622e-03 7.27855510e-01
 4.75619090e-04] 3
[1.25498237e-04 7.90588971e-01 2.43022435e-04 2.07504257e-01
 1.53825187e-03] 1
[5.62339224e-05 9.87905202e-01 2.74288497e-04 1.09980821e-02
 7.66193484e-04] 1
[4.35127701e-04 9.15219318e-01 9.33189170e-04 8.27652713e-02
 6.47094051e-04] 1
[6.40276435e-05 9.86884025e-01 1.49716324e-04 1.15055909e-02
 1.39664043e-03] 1
[1.69000536e-04 7.22897542e-01 5.67529787e-04 2.75557336e-01
 8.08592263e-04] 3
[1.09874276e-03 9.80613318e-01 1.16928147e-04 1.72788582e-02
 8.92152900e-04] 0
[1.34609702e-04 7.22029491e-01 1.83591208e-04 2.76844855e-01
 8.07453237e-04] 3
[2.53738356e-04 7.74647814e-01 3.62663796e-04 2.23999707e-01
 7.36077243e-04] 1
[1.52555154e-04 9.58453627e-01 1.11705883e-04 4.08131229e-02
 4.68988877e-04] 1
[0.67967622 0.28630031 0.00283117 0.02991302 0.00127928] 0
[4.82672403e-04 9.67970044e-01 5.15475860e

[1.11869102e-03 8.67169729e-01 1.42121995e-03 1.29484133e-01
 8.06226729e-04] 3
[1.12173479e-04 9.85163122e-01 1.65239892e-04 1.40776767e-02
 4.81788045e-04] 1
[0.26519504 0.38230806 0.06205187 0.27016485 0.02028018] 2
[3.69559593e-04 9.68860831e-01 7.84073239e-04 2.90101010e-02
 9.75435273e-04] 1
[0.00141785 0.50597861 0.03381733 0.45796232 0.00082389] 3
[4.55000666e-04 9.67405806e-01 5.49744998e-04 3.09651582e-02
 6.24289932e-04] 1
[3.98488597e-04 7.57138751e-01 1.98462827e-03 2.39872375e-01
 6.05757634e-04] 3
[1.13662791e-04 9.59658376e-01 2.48384614e-04 3.89437971e-02
 1.03577951e-03] 1
[8.42741365e-04 7.49426481e-01 9.12002525e-04 2.48373144e-01
 4.45631353e-04] 3
[1.64786107e-04 9.58521140e-01 6.66501067e-05 4.07035243e-02
 5.43899031e-04] 1
[0.00126651 0.89200457 0.00098208 0.10473352 0.00101331] 1
[4.50706233e-05 9.89481961e-01 6.30538438e-05 9.71127611e-03
 6.98638639e-04] 1
[2.66146967e-04 8.20173143e-01 7.36221239e-04 1.78333174e-01
 4.91314494e-04] 3
[0.01490739 0.64237841 

## 2. For a set of regions

2.1 Load all available training features into one big dataframe

In [None]:
regions = ['borde_rural', 'borde_soacha', 'mixco_3', 'mixco_1_and_ebenezer', 'dennery']

In [None]:
columns = ['id', 'features', 'label']
df_train_global = pd.DataFrame(columns=columns)

pickle_path = join('..', '..', 'pickles')
for region in regions:
    try:
        with open(join(pickle_path, 'resnet50_avg_retrained_features_' + region + '_train.pkl'), 'rb') as f:
            df_train = pickle.load(f)
            df_train_global = pd.concat([df_train_global, df_train], ignore_index=True)
    except:
        print("Error reading training data for region ", region)
    

In [None]:
df_train_global.head()

In [None]:
len(df_train_global.features[13000])

2.2 Expand the dataframe such that instead of a list of features, every feature has its one column

In [None]:
#features_train = df_train_global.features.apply(lambda features: pd.Series(list(features)))
#features_train

#features_train = df_train_global.features.apply(lambda x: pd.Series(list(x)))
#features_train = features_train.astype('double')

feature_matrix_global = df_train_global['features'].to_numpy()
feature_matrix_global = np.column_stack(feature_matrix_global).transpose()

labels_global = df_train_global['label'].to_numpy().astype('int')

In [None]:
scaler = StandardScaler()
scaler.fit(feature_matrix_global)
feature_matrix_scaled = scaler.transform(feature_matrix_global)

In [None]:
print(feature_matrix_global.shape)
print(labels_global.shape)

## 3. Train an SVM Classifier

In [None]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr', verbose=1)
clf.fit(feature_matrix_global, labels_global)

In [None]:
with open(join(pickle_path, 'classifier_resnet50_avg_retrained_all_scaled.pkl' ), 'wb') as f:
    pickle.dump(clf, f)

## 4. Train a Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=100, max_depth=32, random_state=0)
clf.fit(feature_matrix_global, labels_global) 