# Building the Classifier

In [1]:
import pickle
import pandas as pd
import numpy as np
from os.path import join

from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels

## 1. For a single region

1.1 Read saved feature matrix and corresponding labels

In [None]:
# Load the pickled training DataFrame for one region.
region = 'curated'

pickle_path = join('..', '..', 'pickles')
train_pickle = join(pickle_path, 'densenet201_avg_features_{}_train.pkl'.format(region))
# pickle.load can execute arbitrary code: only open pickles created by this project.
with open(train_pickle, 'rb') as pkl:
    df_train = pickle.load(pkl)

1.2 Convert the features into a matrix and the labels into a list

In [None]:
# Each entry of the 'features' column is stacked as one column and the result
# is transposed, so every entry ends up as one row of `feature_matrix`
# (presumably one sample per row -- confirm against the feature extraction step).
per_sample_features = df_train['features'].to_numpy()
feature_matrix = np.column_stack(per_sample_features).T

# Labels are cast to a plain integer array for scikit-learn.
labels = df_train['label'].to_numpy().astype('int')

1.3 Split the data into a training set and a validation set

In [None]:
# Hold out 33% of the samples for validation; the fixed seed keeps the split
# identical between runs.
split = train_test_split(feature_matrix, labels, test_size=0.33, random_state=43)
features_train, features_test, labels_train, labels_test = split

1.3. Train a classifier on the training set and save it to disk

In [None]:
# Fit a support vector classifier on the training part of the split.
# probability=True is required for the later clf.predict_proba call.
clf = svm.SVC(
    C=100,
    gamma='scale',
    probability=True,
    decision_function_shape='ovr',
)
clf.fit(features_train, labels_train)

# NOTE(review): the features loaded for this section come from a
# 'densenet201_avg_features_*' pickle, yet the classifier is stored under a
# '*_max' name -- confirm which pooling variant this file should be named after.
classifier_path = join(pickle_path, 'classifier_densenet201_max.pkl')
with open(classifier_path, 'wb') as out_file:
    pickle.dump(clf, out_file)

1.4. Predict labels on the validation set according to the classifier

In [None]:
# Predict a class label for every held-out sample from the train/validation split.
predicted_labels = clf.predict(features_test)

1.5. Calculate and print the confusion matrix (plotting is still to do)

In [None]:
# Confusion matrix of true vs. predicted labels on the held-out samples.
cm = confusion_matrix(y_true=labels_test, y_pred=predicted_labels)
print(cm)

In [None]:
# Print each held-out sample's class-probability vector next to its true label.
pred_probas = clf.predict_proba(features_test)

for sample_probas, true_label in zip(pred_probas, labels_test):
    print(sample_probas, true_label)

## 2. For a set of regions

2.1 Load all available training features into one big dataframe

In [2]:
# Regions whose training feature pickles are combined into one dataset.
regions = [
    'borde_rural',
    'borde_soacha',
    'mixco_3',
    'mixco_1_and_ebenezer',
    'dennery',
]

In [3]:
# Load the training features of every region and merge them into a single
# DataFrame (`df_train_global`). Regions whose pickle cannot be read are
# skipped with a message, so the result may cover fewer regions than listed.
columns = ['id', 'features', 'label']

pickle_path = join('..', '..', 'pickles')
region_frames = []
for region in regions:
    region_file = join(pickle_path, 'densenet201_max_features_' + region + '_train.pkl')
    try:
        # pickle.load can execute arbitrary code: only open pickles created
        # by this project.
        with open(region_file, 'rb') as f:
            df_train = pickle.load(f)
        if not isinstance(df_train, pd.DataFrame):
            raise TypeError('expected a pandas DataFrame, got ' + type(df_train).__name__)
    except Exception as err:
        # Best effort: keep going with the remaining regions, but report the
        # cause. Catching `Exception` instead of using a bare `except` lets
        # KeyboardInterrupt / SystemExit propagate, so the cell stays interruptible.
        print("Error reading training data for region ", region, "-", repr(err))
        continue
    region_frames.append(df_train)

# Concatenate once: growing a DataFrame with pd.concat inside the loop copies
# all previously loaded rows on every iteration.
if region_frames:
    df_train_global = pd.concat(region_frames, ignore_index=True)
else:
    # Nothing could be loaded: keep an empty frame with the expected columns.
    df_train_global = pd.DataFrame(columns=columns)
    

In [4]:
# Preview the first rows of the combined training frame (columns: id, features, label).
df_train_global.head()

Unnamed: 0,id,features,label
0,7a37c004,"[0.00042720986, 0.0044912533, 0.010839798, 0.0...",0
1,7a43cf34,"[0.00055256934, 0.0045050588, 0.01082645, 0.00...",0
2,7a2beb9e,"[0.00042150632, 0.0054836688, 0.010666521, 0.0...",0
3,7a30cd44,"[0.0004985495, 0.004545764, 0.010795763, 0.006...",0
4,7a26b840,"[0.0006316932, 0.0046592713, 0.010714476, 0.00...",0


In [5]:
# Sanity check: every row must carry a feature vector of the same length,
# otherwise the vectors cannot be stacked into one matrix. Checking all rows
# (instead of one hard-coded row label) also works when fewer rows were loaded,
# e.g. because a region's pickle could not be read.
feature_lengths = df_train_global['features'].map(len)
assert feature_lengths.nunique() == 1, (
    "expected exactly one feature-vector length, found: %s"
    % sorted(feature_lengths.unique().tolist())
)
# Display the common feature-vector length as the cell output.
int(feature_lengths.iloc[0])

1920

2.2 Expand the dataframe such that, instead of a list of features, every feature has its own column

In [6]:
# Stack the entries of the 'features' column as columns and transpose, so each
# entry becomes one row of `feature_matrix_global`.
# (The disabled alternative expanded every row with
# `df_train_global.features.apply(lambda x: pd.Series(list(x)))` and cast the
# result to 'double'.)
stacked_columns = np.column_stack(df_train_global['features'].to_numpy())
feature_matrix_global = stacked_columns.T

# Labels are cast to a plain integer array for scikit-learn.
labels_global = df_train_global['label'].to_numpy().astype('int')

In [7]:
# Show the feature-matrix shape, then the label-vector shape, one per line.
print(feature_matrix_global.shape, labels_global.shape, sep='\n')

(14853, 1920)
(14853,)


## 3. Train an SVM Classifier

In [None]:
# Fit a support vector classifier on the features of all loaded regions.
# verbose=1 makes the underlying solver log its progress while fitting.
svm_params = dict(
    C=100,
    gamma='scale',
    probability=True,
    decision_function_shape='ovr',
    verbose=1,
)
clf = svm.SVC(**svm_params)
clf.fit(feature_matrix_global, labels_global)

[LibSVM]

In [None]:
# Persist the fitted classifier in the same directory as the feature pickles.
classifier_path = join(pickle_path, 'classifier_densenet201_max_all.pkl')
with open(classifier_path, 'wb') as out_file:
    pickle.dump(clf, out_file)

## 4. Train a Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a random forest on the same multi-region features.
# NOTE: this rebinds `clf`, so the SVM held under that name is no longer
# reachable in the kernel after this cell runs.
forest_params = {'n_estimators': 100, 'max_depth': 32, 'random_state': 0}
clf = RandomForestClassifier(**forest_params)
clf.fit(feature_matrix_global, labels_global)