# Building the Classifier

In [9]:
import pickle
import pandas as pd
import numpy as np
from os.path import join

from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels

## 1. For a single region

1.1 Read saved feature matrix and corresponding labels

In [2]:
region = 'curated'

pickle_path = join('..', '..', 'pickles')
with open(join(pickle_path, 'resnet50_features_' + region + '_train.pkl'), 'rb') as f:
    df_train = pickle.load(f)

1.2 Convert the features into a matrix and the labels into a list

In [3]:
feature_matrix = df_train['features'].to_numpy()
feature_matrix = np.column_stack(feature_matrix).transpose()

labels = df_train['label'].to_numpy().astype('int')

1.3 Split data into train and validation set

In [4]:
features_train, features_test, labels_train, labels_test = train_test_split(feature_matrix,
                                                                            labels,
                                                                            test_size=0.33,
                                                                            random_state=43)

1.3. Train a classifier on the training set and save it to disk

In [5]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr')
clf.fit(features_train, labels_train)

with open(join(pickle_path, 'classifier.pkl' ), 'wb') as f:
    pickle.dump(clf, f)

1.4. Predict labels on the validation set according to the classifier

In [6]:
predicted_labels = clf.predict(features_test)

1.5. Calculate and plot (to do) the confusion matrix

In [7]:
cm = confusion_matrix(labels_test, predicted_labels)
print(cm)

[[14  1  2  1  0]
 [ 1 18  1  1  0]
 [ 0  0 10  1  0]
 [ 0  0  1 12  1]
 [ 0  0  1  0  4]]


In [8]:
pred_probas = clf.predict_proba(features_test)

for i in range(len(predicted_labels)):
    print(pred_probas[i], labels_test[i])

[0.32623592 0.33407995 0.04612867 0.04045531 0.25310015] 0
[0.02585654 0.02947882 0.12742263 0.7875693  0.02967272] 3
[0.96106545 0.00700827 0.02372455 0.00462538 0.00357635] 0
[0.92955901 0.0223956  0.00742757 0.01537196 0.02524585] 0
[0.00168519 0.0060048  0.01184194 0.26158264 0.71888544] 4
[0.01935112 0.01477218 0.04409353 0.88809381 0.03368935] 3
[0.00855545 0.00425066 0.00806734 0.94635448 0.03277207] 3
[0.00195423 0.00250486 0.00806003 0.97838135 0.00909954] 3
[0.00906023 0.00994846 0.92091904 0.04172376 0.01834851] 2
[0.00295868 0.71859338 0.00315389 0.01377523 0.26151883] 1
[0.00356877 0.01017338 0.80130179 0.14501507 0.03994099] 2
[0.02180286 0.09200722 0.07169893 0.73913857 0.07535243] 3
[0.00286082 0.0078805  0.01086957 0.8713787  0.10701043] 3
[0.01058244 0.93723426 0.00379107 0.00798312 0.04040912] 1
[0.00365825 0.85971751 0.02061064 0.02727397 0.08873963] 1
[0.96331756 0.00799609 0.01450916 0.00254226 0.01163493] 0
[0.29551851 0.00367481 0.6604802  0.03153712 0.00878936]

## 2. For a set of regions

2.1 Load all available training features into one big dataframe

In [10]:
regions = ['borde_rural', 'borde_soacha', 'mixco_3', 'mixco_1_and_ebenezer', 'dennery']

In [11]:
columns = ['id', 'features', 'label']
df_train_global = pd.DataFrame(columns=columns)

pickle_path = join('..', '..', 'pickles')
for region in regions:
    try:
        with open(join(pickle_path, 'resnet50_features_' + region + '_train.pkl'), 'rb') as f:
            df_train = pickle.load(f)
            df_train_global = pd.concat([df_train_global, df_train])
    except:
        print("Error reading training data for region ", region)
    

In [12]:
df_train_global.head()

Unnamed: 0,id,features,label
0,7a1c6d7c,"[0.8933228, 9.503732, 0.72881216, 3.7250175, 0...",0
1,7a1d078c,"[0.0, 2.642926, 0.5762399, 0.0, 3.139349, 1.07...",0
2,7a1d2ff0,"[1.8013268, 3.5478988, 2.0254405, 0.0, 0.78705...",0
3,7a1d570a,"[4.6264086, 3.7589073, 8.721215, 0.4236287, 0....",0
4,7a1d6042,"[1.9871483, 3.8774838, 6.796729, 0.10309696, 3...",0


2.2 Expand the dataframe such that instead of a list of features, every feature has its one column

In [13]:
#features_train = df_train_global.features.apply(lambda features: pd.Series(list(features)))
#features_train

#features_train = df_train_global.features.apply(lambda x: pd.Series(list(x)))
#features_train = features_train.astype('double')

feature_matrix_global = df_train_global['features'].to_numpy()
feature_matrix_global = np.column_stack(feature_matrix_global).transpose()

labels_global = df_train_global['label'].to_numpy().astype('int')

In [14]:
feature_matrix_global.shape

(14853, 2048)

## 3. Train an SVM Classifier

In [14]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr', verbose=1)
clf.fit(feature_matrix_global, labels_global)

[LibSVM]

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=1)

In [15]:
with open(join(pickle_path, 'classifier_br_bs_m3_m1_d.pkl' ), 'wb') as f:
    pickle.dump(clf, f)

## 4. Train a Random Forest Model

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=100, max_depth=32, random_state=0)
clf.fit(feature_matrix_global, labels_global) 