# Building the Classifier

In [1]:
import pickle
import pandas as pd
import numpy as np
from os.path import join

from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels

## 1. For a single region

**1.1** Read saved feature matrix and corresponding labels

In [None]:
# Region whose saved training features are loaded in this section.
region = 'curated'

# The pickles directory sits two levels above the working directory.
pickle_path = join('..', '..', 'pickles')
train_pickle_name = 'resnet50_features_' + region + '_train.pkl'

# Unpickle the stored object for this region; the following cells index it
# by the 'features' and 'label' columns.
with open(join(pickle_path, train_pickle_name), 'rb') as f:
    df_train = pickle.load(f)

**1.2** Convert the features into a matrix and the labels into a list

In [None]:
# column_stack lays every entry of the 'features' column out as one column;
# transposing afterwards gives one row per DataFrame row.
# (assumes each entry is a 1-D vector of the same length -- TODO confirm)
per_sample_features = df_train['features'].to_numpy()
feature_matrix = np.column_stack(per_sample_features).T

# Labels as an integer array, in the same row order as feature_matrix.
label_column = df_train['label'].to_numpy()
labels = label_column.astype('int')

1.3 Split the data into training and validation sets

In [None]:
# Hold out 33% of the samples for validation; the fixed random_state keeps
# the split identical from run to run.
split_parts = train_test_split(feature_matrix, labels, test_size=0.33, random_state=43)
features_train, features_test, labels_train, labels_test = split_parts

1.3. Train a classifier on the training set and save it to disk

In [None]:
# Fit a support vector classifier on the training split.
# probability=True is needed for the predict_proba call made later in this
# notebook. scikit-learn fits those probability estimates with an internal,
# randomly shuffled cross-validation, so random_state is pinned here to make
# the fitted (and pickled) model reproducible between runs.
clf = svm.SVC(gamma='scale', probability=True, C=100,
              decision_function_shape='ovr', random_state=43)
clf.fit(features_train, labels_train)

# Persist the fitted classifier next to the feature pickles.
with open(join(pickle_path, 'classifier.pkl'), 'wb') as f:
    pickle.dump(clf, f)

1.4. Predict labels on the validation set according to the classifier

In [None]:
# Predict one class label per sample of the held-out split.
predicted_labels = clf.predict(features_test)

1.5. Calculate and plot (to do) the confusion matrix

In [None]:
# Confusion matrix of the held-out split: rows follow the true labels,
# columns follow the predicted labels.
cm = confusion_matrix(y_true=labels_test, y_pred=predicted_labels)
print(cm)

In [None]:
# Per-class probability estimates for every held-out sample.
pred_probas = clf.predict_proba(features_test)

# Show each probability vector next to the sample's true label.
for sample_probas, true_label in zip(pred_probas, labels_test):
    print(sample_probas, true_label)

## 2. Build Classifier Using All Regions

2.1 Load all available training features into one big dataframe

In [2]:
# Regions whose training features are merged into the global training set.
regions = ['borde_rural'] # TODO: add the remaining regions

In [3]:
# Columns the combined frame is expected to expose; only used to build an
# empty frame when no region is configured.
columns = ['id', 'features', 'label']

pickle_path = join('..', '..', 'pickles')

# Collect one frame per region and concatenate once after the loop.
# Concatenating inside the loop re-copies all previously accumulated rows on
# every iteration, and seeding the result with an empty DataFrame relies on
# pandas' deprecated handling of empty entries in concat.
region_frames = []
for region in regions:
    # Read errors are left to propagate so that a missing or unreadable
    # region file is noticed instead of silently shrinking the training set.
    with open(join(pickle_path, 'resnet50_features_' + region + '_train.pkl'), 'rb') as f:
        df_train = pickle.load(f)
    region_frames.append(df_train)

if region_frames:
    # Row indices of the individual frames are kept as they are.
    df_train_global = pd.concat(region_frames)
else:
    # pd.concat raises on an empty list; fall back to an empty frame.
    df_train_global = pd.DataFrame(columns=columns)
    

In [4]:
# Preview the first rows of the combined training frame.
df_train_global.head()

Unnamed: 0,id,features,label
0,7a1c6d7c,"[0.8933228, 9.503732, 0.72881216, 3.7250175, 0...",0
1,7a1d078c,"[0.0, 2.642926, 0.5762399, 0.0, 3.139349, 1.07...",0
2,7a1d2ff0,"[1.8013268, 3.5478988, 2.0254405, 0.0, 0.78705...",0
3,7a1d570a,"[4.6264086, 3.7589073, 8.721215, 0.4236287, 0....",0
4,7a1d6042,"[1.9871483, 3.8774838, 6.796729, 0.10309696, 3...",0


2.2 Expand the dataframe such that instead of a list of features, every feature has its own column

In [5]:
# column_stack lays every entry of the 'features' column out as one column;
# transposing afterwards gives one row per sample and one column per feature.
# (assumes each entry is a 1-D vector of the same length -- TODO confirm)
stacked_columns = np.column_stack(df_train_global['features'].to_numpy())
feature_matrix_global = stacked_columns.T

# Labels as an integer array, in the same row order as feature_matrix_global.
label_column_global = df_train_global['label'].to_numpy()
labels_global = label_column_global.astype('int')

In [6]:
# Sanity check of the matrix dimensions: (number of rows, number of columns).
feature_matrix_global.shape

(4353, 2048)

**2.3** Train the classifier using all training data  

In [7]:
# Fit a support vector classifier on the complete training data.
# With probability=True scikit-learn fits its probability estimates using an
# internal, randomly shuffled cross-validation, so random_state is pinned to
# make the fitted (and pickled) model reproducible between runs.
clf = svm.SVC(gamma='scale', probability=True, C=100,
              decision_function_shape='ovr', random_state=43)
clf.fit(feature_matrix_global, labels_global)

# NOTE(review): the output name is hard-coded to a single region; revisit it
# once more entries are added to `regions`.
with open(join(pickle_path, 'classifier_borde_rural.pkl'), 'wb') as f:
    pickle.dump(clf, f)