# Building the Classifier

In [1]:
import pickle
import pandas as pd
import numpy as np
from os.path import join

from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels

## 1. For a single region

**1.1** Read saved feature matrix and corresponding labels

In [2]:
region = 'curated'

# Directory with the pickled feature frames, relative to this notebook.
pickle_path = join('..', '..', 'pickles')
train_features_file = join(pickle_path, 'resnet50_features_' + region + '_train.pkl')

# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open(train_features_file, 'rb') as f:
    df_train = pickle.load(f)

**1.2** Convert the features into a matrix and the labels into a list

In [3]:
# Stack the per-sample feature vectors as columns, then transpose so that
# every row of the matrix corresponds to one sample
# (assumes each 'features' entry is a 1-D vector of equal length).
per_sample_features = df_train['features'].to_numpy()
feature_matrix = np.column_stack(per_sample_features).T

# Integer class labels, in the same row order as feature_matrix.
labels = df_train['label'].to_numpy().astype(int)

**1.3** Split the data into a training set and a validation set

In [4]:
# Hold out 33% of the samples for validation; the fixed random_state keeps
# the split identical between runs.
features_train, features_test, labels_train, labels_test = train_test_split(
    feature_matrix,
    labels,
    test_size=0.33,
    random_state=43,
)

1.3. Train a classifier on the training set and save it to disk

In [5]:
# Support-vector classifier with probability estimates enabled so that
# predict_proba can be called on it afterwards. With probability=True the
# estimator shuffles the data internally for calibration, so random_state is
# pinned to make the fitted probabilities reproducible across runs.
clf = svm.SVC(
    C=100,
    gamma='scale',
    probability=True,
    decision_function_shape='ovr',
    random_state=43,
)
clf.fit(features_train, labels_train)

# Persist the fitted classifier alongside the feature pickles.
with open(join(pickle_path, 'classifier.pkl'), 'wb') as f:
    pickle.dump(clf, f)

1.4. Predict labels on the validation set according to the classifier

In [6]:
# Predict a class label for every sample of the held-out split.
predicted_labels = clf.predict(features_test)

1.5. Calculate the confusion matrix (**TODO:** plot it)

In [7]:
# Rows correspond to the true labels, columns to the predicted labels.
cm = confusion_matrix(y_true=labels_test, y_pred=predicted_labels)
print(cm)

[[14  1  2  1  0]
 [ 1 18  1  1  0]
 [ 0  0 10  1  0]
 [ 0  0  1 12  1]
 [ 0  0  1  0  4]]


In [8]:
# Per-class probability estimates for every sample of the held-out split.
pred_probas = clf.predict_proba(features_test)

# Show each probability vector next to the sample's true label.
for class_probabilities, true_label in zip(pred_probas, labels_test):
    print(class_probabilities, true_label)

[0.33951308 0.34842611 0.04533576 0.04013541 0.22658964] 0
[0.02205633 0.02337387 0.09327877 0.84111365 0.02017738] 3
[0.96147448 0.00792523 0.02157417 0.00432752 0.0046986 ] 0
[0.91895387 0.02690449 0.00819583 0.01662817 0.02931764] 0
[0.00275372 0.00617738 0.00991003 0.27387434 0.70728453] 4
[0.01591606 0.01135018 0.02947518 0.91987155 0.02338703] 3
[0.00718198 0.00337585 0.00491734 0.96014231 0.02438252] 3
[0.00115004 0.00131282 0.00474257 0.986901   0.00589357] 3
[0.00910612 0.00980025 0.93135246 0.03666927 0.01307189] 2
[0.00467913 0.73691991 0.00320729 0.01206816 0.24312551] 1
[0.00403936 0.00984746 0.8087539  0.14712289 0.03023639] 2
[0.0209784  0.08765014 0.05652131 0.77496136 0.05988879] 3
[0.00272729 0.00635751 0.00676109 0.89708356 0.08707055] 3
[0.01255057 0.93933161 0.00391995 0.00748961 0.03670826] 1
[0.00477557 0.86692897 0.02149821 0.02560023 0.08119701] 1
[0.95894914 0.00945469 0.01463826 0.00266454 0.01429336] 0
[0.34346548 0.0041046  0.60992722 0.03291552 0.00958719]

## 2. Build Classifier Using All Regions

2.1 Load all available training features into one big dataframe

In [15]:
# Regions whose training features are combined into one data set.
regions = [
    'borde_rural',
    'borde_soacha',
    'mixco_3',
    'mixco_1_and_ebenezer',
    'dennery',
]

In [16]:
columns = ['id', 'features', 'label']

pickle_path = join('..', '..', 'pickles')

# Collect the per-region frames first and concatenate once at the end;
# calling pd.concat inside the loop re-copies all previous rows each time.
region_frames = []
for region in regions:
    region_file = join(pickle_path, 'resnet50_features_' + region + '_train.pkl')
    try:
        # NOTE: unpickling can execute arbitrary code; only load files produced by this project.
        with open(region_file, 'rb') as f:
            df_train = pickle.load(f)
        region_frames.append(df_train)
    except Exception as err:
        # Best effort: a missing or unreadable region is reported and skipped.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupts working and lets the actual cause be shown.
        print("Error reading training data for region ", region, "-", repr(err))

if region_frames:
    df_train_global = pd.concat(region_frames)
else:
    # No region could be read: fall back to an empty frame with the expected columns.
    df_train_global = pd.DataFrame(columns=columns)
    

In [17]:
# Preview the first rows of the combined training frame.
df_train_global.head()

Unnamed: 0,id,features,label
0,7a1c6d7c,"[0.8933228, 9.503732, 0.72881216, 3.7250175, 0...",0
1,7a1d078c,"[0.0, 2.642926, 0.5762399, 0.0, 3.139349, 1.07...",0
2,7a1d2ff0,"[1.8013268, 3.5478988, 2.0254405, 0.0, 0.78705...",0
3,7a1d570a,"[4.6264086, 3.7589073, 8.721215, 0.4236287, 0....",0
4,7a1d6042,"[1.9871483, 3.8774838, 6.796729, 0.10309696, 3...",0


2.2 Expand the list of features into a matrix such that every feature has its own column

In [18]:
# Stack the per-sample feature vectors as columns, then transpose so that
# every row of the matrix corresponds to one sample
# (assumes each 'features' entry is a 1-D vector of equal length).
per_sample_features_global = df_train_global['features'].to_numpy()
feature_matrix_global = np.column_stack(per_sample_features_global).T

# Integer class labels, in the same row order as feature_matrix_global.
labels_global = df_train_global['label'].to_numpy().astype(int)

In [19]:
# Sanity check: (number of samples, number of features per sample).
feature_matrix_global.shape

(14853, 2048)

**2.3** Train the classifier using all training data  

In [None]:
# Support-vector classifier trained on the features of all regions.
# With probability=True the estimator shuffles the data internally for
# calibration, so random_state is pinned to make the fitted probabilities
# reproducible across runs. verbose=1 prints the LibSVM training progress.
clf = svm.SVC(
    C=100,
    gamma='scale',
    probability=True,
    decision_function_shape='ovr',
    verbose=1,
    random_state=43,
)
clf.fit(feature_matrix_global, labels_global)

[LibSVM]

In [None]:
# NOTE(review): the file name abbreviates four regions (br, bs, m3, m1) while
# `regions` also lists 'dennery' - confirm whether the name is still accurate.
global_classifier_file = join(pickle_path, 'classifier_br_bs_m3_m1.pkl')
with open(global_classifier_file, 'wb') as f:
    pickle.dump(clf, f)