# Building the Classifier

In [52]:
import pickle
import pandas as pd
import numpy as np
from os.path import join

## 1. For a single region

1.1 Read saved feature matrix and corresponding labels

In [53]:
region = 'mixco_3'

pickle_path = join('..', '..', 'pickles')
with open(join(pickle_path, 'resnet50_features_' + region + '_train.pkl'), 'rb') as f:
    features_train = pickle.load(f)

1.2 Convert the features into a matrix and the labels into a list

In [54]:
feature_matrix = features_train['features'].to_numpy()
feature_matrix = np.column_stack(feat_matrix).transpose()

labels = features_train['label'].to_numpy().astype('int')

1.3 Split data into train and validation set

In [55]:
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels

In [56]:
features_train, features_test, labels_train, labels_test = train_test_split(feature_matrix,
                                                                            labels,
                                                                            test_size=0.33,
                                                                            random_state=43)

1.3. Train a classifier on the training set and save it to disk

In [57]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr')
clf.fit(features_train, labels_train)

with open(join(pickle_path, 'classifier.pkl' ), 'wb') as f:
    pickle.dump(clf, f)

1.4. Predict labels on the validation set according to the classifier

In [58]:
predicted_labels = clf.predict(features_test)

1.5. Calculate and plot (to do) the confusion matrix

In [59]:
cm = confusion_matrix(labels_test, predicted_labels)
print(cm)

[[17  0  0  0]
 [ 1 14  2  1]
 [ 1  0 16  0]
 [ 1  0  0  0]]


In [60]:
pred_probas = clf.predict_proba(features_test)

for i in range(len(predicted_labels)):
    print(pred_probas[i], labels_test[i])

[0.01432318 0.97646168 0.00248714 0.006728  ] 1
[0.03435104 0.03802011 0.92495716 0.0026717 ] 2
[0.0853879  0.04645862 0.85464752 0.01350596] 2
[0.51623924 0.33767721 0.09757833 0.04850522] 0
[0.00828789 0.91065825 0.0092641  0.07178976] 1
[0.14554093 0.80591284 0.02137687 0.02716936] 1
[0.01650321 0.00432187 0.97576329 0.00341163] 2
[0.0604639  0.40781854 0.49668187 0.03503569] 1
[0.97175708 0.02143286 0.00483211 0.00197796] 0
[0.91126869 0.06183233 0.01316497 0.01373401] 0
[0.94736647 0.02657177 0.01383606 0.0122257 ] 0
[0.02610183 0.02721261 0.94458762 0.00209793] 2
[0.00822071 0.00355633 0.98385746 0.0043655 ] 2
[0.04418412 0.80551715 0.05941063 0.0908881 ] 1
[0.01977288 0.01959542 0.95859802 0.00203367] 2
[0.76535632 0.07237136 0.14912748 0.01314483] 0
[0.64087794 0.2706998  0.07687663 0.01154564] 0
[0.00965878 0.00360163 0.98167693 0.00506266] 2
[0.84908375 0.04678598 0.03507736 0.06905291] 0
[0.91758135 0.02803361 0.04411353 0.01027151] 0
[0.01330783 0.91378982 0.01486418 0.0580

## 2. Build Classifier Using All Regions

2.1 Load all available training features into one big dataframe

In [70]:
regions = ['borde_rural', 'borde_soacha', 'mixco_1_and_ebenezer', 'mixco_3']

In [76]:
columns = ['id', 'features', 'label']
features_train_global = pd.DataFrame(columns=columns)

for region in regions:
    try:
        with open(join(pickle_path, 'resnet50_features_' + region + '_train.pkl'), 'rb') as f:
            features_local = pickle.load(f)
            features_local.head()
            features_train_global = pd.concat([features_train_global, features_local])
    except:
        print("No available training features for region ", region)
    

No available training features for region  borde_rural
No available training features for region  borde_soacha
No available training features for region  mixco_1_and_ebenezer


Train a classifier with this giant dataframe

In [8]:
gloabal_labels.shape

(13849,)

In [9]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr')
clf.fit(global_feature_matrix, gloabal_labels)

with open(join(pickle_path, 'classifier_global.pkl' ), 'wb') as f:
    pickle.dump(clf, f)