# Classifiers for feature vectors

As seen in the literature, the models used for trajectory classification are SVM, KNN, DBSCAN and KMEANS. In this notebook we are going to test the effectiveness of these models.

Firstly, we are going to load the vectors where the trajectories are described by their characteristics.

In [1]:
import feature_vec as fv

metadata = fv.get_selected_data()
feat_vectors, clss_mask, clss = fv.get_feat_vectors(metadata)

100.00%

Now, we split the data into 70% for model training and the other 30% for testing.

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(feat_vectors, clss, stratify=clss, 
                                                  random_state= 0, test_size=0.30)

X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, stratify=y_test, 
                                                  random_state= 0, test_size=0.15)

## Decision Tree Classifier

In [3]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion='entropy')
dtc.fit(X_train, y_train)
dtc.score(X_val, y_val)

# 0.81

0.8378109452736319

## Random Forest Classifier

In [55]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

rfc = RandomForestClassifier(criterion='entropy', max_features='log2', bootstrap=False, random_state=0)
rfc.fit(X_train, y_train)
rfc.score(X_val, y_val)

rfc.feature_importances_
# 0.88

array([4.32315254e-02, 1.09496755e-01, 3.80367493e-02, 3.67029815e-03,
       2.68469618e-02, 6.46146064e-02, 4.64163751e-06, 2.70882605e-02,
       1.18737432e-01, 6.42636589e-02, 8.40335715e-02, 1.68211507e-03,
       4.77180709e-03, 1.21925244e-02, 1.06031490e-02, 9.83357264e-03,
       0.00000000e+00, 6.53226430e-03, 8.79268238e-03, 1.01901558e-03,
       4.87328334e-03, 1.12855711e-02, 1.20712225e-02, 1.18387340e-02,
       2.44605792e-05, 7.22697269e-03, 1.44800413e-02, 8.68416143e-03,
       6.51619583e-03, 1.51198900e-02, 2.42488009e-03, 1.02698681e-02,
       1.21514845e-02, 8.10018909e-03, 1.36575258e-02, 5.81634638e-03,
       5.12107005e-03, 8.52070208e-03, 1.29449704e-02, 1.45540400e-02,
       1.23779043e-02, 6.48456420e-03, 5.50124069e-02, 6.11912752e-03,
       5.59064209e-03, 9.62740605e-03, 1.00533193e-02, 1.22760803e-02,
       8.95032224e-03, 6.67525664e-03, 4.56997698e-02])

In [None]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pipeline = Pipeline([('pca', PCA(n_components = 14)), 
                     ('Random_Forest', 
                      RandomForestClassifier(criterion='entropy', max_features='log2', bootstrap=False))])

pipeline.fit(X_train, y_train)
pipeline.score(X_val, y_val)

### Looking for a good combination of hyperparameters.

Grid Search based on out-of-bag score

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid

import pandas as pd

param_grid = ParameterGrid(
                {'n_estimators': [150],
                 'max_features': [5, 7, 9, 15, 25, 'log2'],
                 'max_depth'   : [None, 3, 10, 20, 30],
                 'criterion'   : ['gini', 'entropy']
                }
            )

results = {'params': [], 'oob_accuracy': []}

for params in param_grid:
    
    model = RandomForestClassifier(
                oob_score    = True,
                n_jobs       = -1,
                random_state = 0,
                ** params
             )
    
    model.fit(X_train, y_train)
    
    results['params'].append(params)
    results['oob_accuracy'].append(modelo.oob_score_)
    print(f"Modelo: {params} \u2713")

results = pd.DataFrame(results)
results = pd.concat([results, results['params'].apply(pd.Series)], axis=1)
results = results.sort_values('oob_accuracy', ascending=False)
results = results.drop(columns = 'params')
results.head(5)

Grid Search based on cross validation

In [54]:
from sklearn.model_selection import RepeatedKFold
import multiprocessing

param_grid = {'n_estimators': [150],
            'max_features': [5, 7, 9, 15, 25, 'log2'],
            'max_depth'   : [None, 3, 10, 20, 30],
            'criterion'   : ['gini', 'entropy']
            }

grid = GridSearchCV(
        estimator  = RandomForestClassifier(random_state = 0),
        param_grid = param_grid,
        scoring    = 'accuracy',
        n_jobs     = multiprocessing.cpu_count() - 1,
        cv         = RepeatedKFold(n_splits=5, n_repeats=3, random_state=0), 
        refit      = True,
        verbose    = 0,
        return_train_score = True
       )

grid.fit(X = X_train, y = y_train)

results = pd.DataFrame(grid.cv_results_)
results.filter(regex = '(param*|mean_t|std_t)') \
    .drop(columns = 'params') \
    .sort_values('mean_test_score', ascending = False) \
    .head(4)

Unnamed: 0,param_criterion,param_max_depth,param_max_features,param_n_estimators,mean_test_score,std_test_score,mean_train_score,std_train_score
55,entropy,30.0,7,150,0.88913,0.009949,1.0,0.0
49,entropy,20.0,7,150,0.88913,0.010166,1.0,0.0
31,entropy,,7,150,0.88913,0.009949,1.0,0.0
19,gini,20.0,7,150,0.888889,0.012632,1.0,0.0


## K-Nearest Neighbors

In [5]:
from sklearn.neighbors import KNeighborsClassifier

Whith number of neighbors by default for kneighbors queries.

In [6]:
knn = KNeighborsClassifier(weights='distance')
knn.fit(X_train, y_train)
knn.score(X_val, y_val)

# 0.70

0.7054726368159204

The results with the weights parameter with value 'distance' are better.

Whith number of neighbors in 20.

In [7]:
knn = KNeighborsClassifier(weights='distance', n_neighbors=20)
knn.fit(X_train, y_train)
knn.score(X_val, y_val)

# 0.71

0.7223880597014926

## Support Vector Machine

In [9]:
from sklearn.svm import SVC

Standardizing the data and using the rbf kernel.

In [10]:
svm = make_pipeline(StandardScaler(), SVC(kernel = 'rbf', gamma='auto', probability=True))
svm.fit(X_train, y_train)

svm.score(X_val, y_val)

# 0.85

0.83681592039801

Let's evaluate the model

In [11]:
acc_score = accuracy_score(y_test, y_pred=svm.predict(X_test))
auc_score = roc_auc_score(y_test, svm.predict_proba(X_test)[:], multi_class='ovr')
print(f"Accuracy: {acc_score:0.4f}")
print(f"AUC: {auc_score:0.4f}")

Accuracy: 0.8371
AUC: 0.9582


In [12]:
svm = make_pipeline(StandardScaler(), SVC(kernel='poly', degree=3, gamma='scale'))
svm.fit(X_train, y_train)

svm.score(X_val, y_val)

#0.72

0.7114427860696517

In [13]:
svm = make_pipeline(StandardScaler(), SVC(kernel='sigmoid', gamma='auto'))
svm.fit(X_train, y_train)

svm.score(X_val, y_val)

# 0.71

0.7522388059701492

## Neural Networks

In [None]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

model = keras.models.Sequential()

model.add(keras.layers.Dense(100, activation = 'sigmoid'))
model.add(keras.layers.Dense(300, activation = 'relu'))
model.add(keras.layers.Dense(100, activation = 'relu'))
model.add(keras.layers.Dense(5, activation= 'softmax'))

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="sgd",
              metrics=["accuracy"])
history = model.fit(np.array(X_train), np.array(y_train), epochs=30)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import multiprocessing

model1 = MLPClassifier(
                hidden_layer_sizes=(5),
                learning_rate_init=0.01,
                solver = 'lbfgs',
                max_iter = 1000,
                random_state = 123
            )

model2 = MLPClassifier(
                hidden_layer_sizes=(10),
                learning_rate_init=0.01,
                solver = 'lbfgs',
                max_iter = 1000,
                random_state = 123
            )

model3 = MLPClassifier(
                hidden_layer_sizes=(20, 20),
                learning_rate_init=0.01,
                solver = 'lbfgs',
                max_iter = 5000,
                random_state = 123
            )

model4 = MLPClassifier(
                hidden_layer_sizes=(50, 50, 50),
                learning_rate_init=0.01,
                solver = 'lbfgs',
                max_iter = 5000,
                random_state = 123
            )

model1.fit(X=X_train, y=y_train)
model2.fit(X=X_train, y=y_train)
model3.fit(X=X_train, y=y_train)
model4.fit(X=X_train, y=y_train)

In [None]:
model1.score(X_val, y_val)
model2.score(X_val, y_val)
model3.score(X_val, y_val)
model4.score(X_val, y_val)

0.2407960199004975