# Building the Classifier

In [1]:
import pickle
import pandas as pd
import numpy as np
from os.path import join

from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import StandardScaler

## 1. For a single region

1.1 Read saved feature matrix and corresponding labels

In [24]:
region = 'borde_rural'

pickle_path = join('..', '..', 'pickles')
with open(join(pickle_path, 'resnet50_features_' + region + '_train.pkl'), 'rb') as f:
    df_train = pickle.load(f)

### 1.2 Convert the features into a matrix and the labels into a list

In [25]:
feature_matrix = df_train['features'].to_numpy()
feature_matrix = np.column_stack(feature_matrix).transpose()

labels = df_train['label'].to_numpy().astype('int')

1.3 Split data into train and validation set

In [26]:
features_train, features_test, labels_train, labels_test = train_test_split(feature_matrix,
                                                                            labels,
                                                                            test_size=0.33,
                                                                            random_state=43)

1.3. Train a classifier on the training set and save it to disk

In [27]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr')
clf.fit(features_train, labels_train)

with open(join(pickle_path, 'classifier_densenet201_max.pkl' ), 'wb') as f:
    pickle.dump(clf, f)

1.4. Predict labels on the validation set according to the classifier

In [28]:
predicted_labels = clf.predict(features_test)

1.5. Calculate and plot (to do) the confusion matrix

In [29]:
cm = confusion_matrix(labels_test, predicted_labels)
print(cm)

[[ 44   4   6   8   0]
 [  2 658   1 109   0]
 [  6   5  34  36   0]
 [  2 114  13 393   0]
 [  0   1   0   1   0]]


In [8]:
pred_probas = clf.predict_proba(features_test)

for i in range(len(predicted_labels)):
    print(pred_probas[i], labels_test[i])

[0.93800929 0.01484652 0.01814413 0.00568303 0.02331702] 0
[9.08252507e-04 2.45963026e-03 3.79420015e-03 9.86714228e-01
 6.12368935e-03] 3
[0.8838209  0.04758416 0.00903683 0.00636294 0.05319517] 0
[0.64379693 0.01603544 0.29894523 0.02332422 0.01789818] 0
[0.03330799 0.09994666 0.0182777  0.20619385 0.6422738 ] 4
[0.02131354 0.00380459 0.1348034  0.81913534 0.02094313] 3
[0.00730056 0.02320506 0.00825509 0.95464947 0.00658983] 3
[0.12315168 0.00567728 0.80841179 0.04407472 0.01868452] 3
[0.02576058 0.0063406  0.95157755 0.00667839 0.00964288] 2
[0.20671721 0.20668239 0.08844017 0.05887055 0.43928969] 1
[0.00247052 0.00249262 0.98518159 0.00245457 0.00740069] 2
[0.03900178 0.04461704 0.16073936 0.03422922 0.72141261] 3
[1.64277443e-04 5.08697701e-04 1.69996881e-03 9.91856442e-01
 5.77061396e-03] 3
[0.08807758 0.79680469 0.01264633 0.0424169  0.06005451] 1
[0.03297028 0.75743978 0.02513453 0.10560274 0.07885268] 1
[0.25559518 0.33485729 0.06191562 0.04621424 0.30141767] 0
[0.77874343 0.

## 2. For a set of regions

2.1 Load all available training features into one big dataframe

In [4]:
regions = ['borde_rural', 'borde_soacha', 'mixco_3', 'mixco_1_and_ebenezer', 'dennery']

In [5]:
columns = ['id', 'features', 'label']
df_train_global = pd.DataFrame(columns=columns)

pickle_path = join('..', '..', 'pickles')
for region in regions:
    try:
        with open(join(pickle_path, 'resnet50_avg_retrained_features_' + region + '_train.pkl'), 'rb') as f:
            df_train = pickle.load(f)
            df_train_global = pd.concat([df_train_global, df_train], ignore_index=True)
    except:
        print("Error reading training data for region ", region)
    

In [6]:
df_train_global.head()

Unnamed: 0,id,features,label
0,7a37c004,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.955824, ...",0
1,7a43cf34,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.2254369,...",0
2,7a2beb9e,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.5360031,...",0
3,7a30cd44,"[0.0, 0.0, 0.0, 0.9002078, 0.0, 0.0, 0.1847459...",0
4,7a26b840,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


In [7]:
len(df_train_global.features[13000])

2048

2.2 Expand the dataframe such that instead of a list of features, every feature has its one column

In [10]:
#features_train = df_train_global.features.apply(lambda features: pd.Series(list(features)))
#features_train

#features_train = df_train_global.features.apply(lambda x: pd.Series(list(x)))
#features_train = features_train.astype('double')

feature_matrix_global = df_train_global['features'].to_numpy()
feature_matrix_global = np.column_stack(feature_matrix_global).transpose()

labels_global = df_train_global['label'].to_numpy().astype('int')

In [16]:
scaler = StandardScaler()
scaler.fit(feature_matrix_global)
feature_matrix_scaled = scaler.transform(feature_matrix_global)

In [11]:
print(feature_matrix_global.shape)
print(labels_global.shape)

(14853, 2048)
(14853,)


## 3. Train an SVM Classifier

In [12]:
clf = svm.SVC(gamma='scale', probability=True, C=100, decision_function_shape='ovr', verbose=1)
clf.fit(feature_matrix_global, labels_global)

[LibSVM]

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=1)

In [13]:
with open(join(pickle_path, 'classifier_resnet50_avg_retrained_all_scaled.pkl' ), 'wb') as f:
    pickle.dump(clf, f)

## 4. Train a Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=100, max_depth=32, random_state=0)
clf.fit(feature_matrix_global, labels_global) 

In [4]:
import numpy as np

In [9]:
a = np.arange(10).tolist()

In [10]:
a

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [12]:
for elem in a[:-3]:
    print(elem)

0
1
2
3
4
5
6
