# Classification Model

**Purpose of script:**

- Test regression model suitability in data fusion context
- Test different regressors

## Data Prep

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

In [2]:
df_path = r"../Data/combined/"
df = pd.read_parquet(df_path + 'melt_2019-07-01_extended.parquet.gzip')
df = df.fillna(-1)

In [3]:
X = df[['x', 'y', 'mw_value', 'col', 'row', 'v1', 'v2', 'v3', 'v4', 'v6', 'v7', 'v8', 'v9', 'mean']] # v5 is duplicated
y = df[['opt_value']]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training, 20% test and 10% validation
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.3, random_state=1)

In [14]:
y_train_binary = y_train["opt_value"].apply(lambda x: 1 if x >= 0.64 else 0)
y_test_binary = y_test["opt_value"].apply(lambda x: 1 if x >= 0.64 else 0)
y_val_binary = y_val["opt_value"].apply(lambda x: 1 if x >= 0.64 else 0)

In [5]:
y_train_buckets = y_train.copy()
y_train_buckets['binned_opt_value'] = pd.cut(y_train_buckets['opt_value'], 
                                    list(np.arange(0, 0.41, 0.2)) 
                                    + [0.64] 
                                    + list(np.arange(0.8, 2.01, 0.2)) 
                                    + [7.0])

buckets = list(y_train_buckets['binned_opt_value'].unique())
buckets.sort()
num_buckets = len(buckets)
value_bucket_lookup = dict(zip(buckets, range(num_buckets)))
y_train_buckets['binned_opt_value_code'] = y_train_buckets['binned_opt_value'].replace(value_bucket_lookup).values

In [6]:
# same for test
y_test_buckets = y_test.copy()
y_test_buckets['binned_opt_value'] = pd.cut(y_test_buckets['opt_value'], 
                                    list(np.arange(0, 0.41, 0.2)) 
                                    + [0.64] 
                                    + list(np.arange(0.8, 2.01, 0.2)) 
                                    + [7.0])

buckets_test = list(y_test_buckets['binned_opt_value'].unique())
buckets_test.sort()
num_buckets_test = len(buckets_test)
value_bucket_lookup_test = dict(zip(buckets_test, range(num_buckets_test)))
y_test_buckets['binned_opt_value_code'] = y_test_buckets['binned_opt_value'].replace(value_bucket_lookup_test).values

In [74]:
y_val_buckets = y_val.copy()
y_val_buckets['binned_opt_value'] = pd.cut(y_val_buckets['opt_value'], 
                                    list(np.arange(0, 0.41, 0.2)) 
                                    + [0.64] 
                                    + list(np.arange(0.8, 2.01, 0.2)) 
                                    + [7.0])

buckets_val = list(y_val_buckets['binned_opt_value'].unique())
buckets_val.sort()
num_buckets_val = len(buckets_val)
value_bucket_lookup_val = dict(zip(buckets_val, range(num_buckets_val)))
y_val_buckets['binned_opt_value_code'] = y_val_buckets['binned_opt_value'].replace(value_bucket_lookup_val).values

## Decision Tree Classifier

### Binary Classification

In [70]:
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train_binary)

y_predicted = classifier.predict(X_test)

In [46]:
rmse = np.sqrt(mean_squared_error(y_test_binary, y_predicted))
rmse

0.17007349119759516

In [47]:
accuracy = accuracy_score(y_test_binary, y_predicted)
accuracy

0.9710750075918615

### Multiclass (buckets) classification

In [87]:
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train_buckets['binned_opt_value_code'])

y_predicted = classifier.predict(X_test)

In [88]:
rmse = np.sqrt(mean_squared_error(y_test_buckets["binned_opt_value_code"], y_predicted))
rmse

0.7727960835954906

In [89]:
accuracy = accuracy_score(y_test_binary, y_predicted)
accuracy

0.28574501467759894

## Random Forest Classifier

In [93]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [94]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train_binary)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


### Binary Classification - BEST

In [172]:
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train_binary)

y_predicted = classifier.predict(X_test)

In [174]:
rmse = np.sqrt(mean_squared_error(y_test_binary, y_predicted))
rmse

0.15735984771292208

In [175]:
accuracy = accuracy_score(y_test_binary, y_predicted)
accuracy

0.975237878327766

### Multiclass (buckets) Classification

In [178]:
classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train_buckets['binned_opt_value_code'])

y_predicted = classifier.predict(X_test)

In [180]:
rmse = np.sqrt(mean_squared_error(y_test_buckets["binned_opt_value_code"], y_predicted))
rmse

0.7264454196061586

In [181]:
accuracy = accuracy_score(y_test_binary, y_predicted)
accuracy

0.28989523231096265

## Logistic Regression

### Binary Classification

In [184]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train_binary)

y_predicted = classifier.predict(X_test)

In [186]:
rmse = np.sqrt(mean_squared_error(y_test_binary, y_predicted))
rmse

0.4755445038683857

In [187]:
accuracy = accuracy_score(y_test_binary, y_predicted)
accuracy

0.773857424840571

### Multiclass (buckets) Classification

In [193]:
classifier = LogisticRegression(random_state=0, max_iter=500)
classifier.fit(X_train, y_train_buckets['binned_opt_value_code'])

In [194]:
y_predicted = classifier.predict(X_test)

In [195]:
rmse = np.sqrt(mean_squared_error(y_test_binary, y_predicted))
rmse

0.7612984709552048

In [196]:
accuracy = accuracy_score(y_test_binary, y_predicted)
accuracy

0.42042463812126735

## k-nearest Neighbors Classification

### Binary Classification

In [None]:
parameters = {"n_neighbors": range(1, 10)}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train_binary)
gridsearch.best_params_

In [207]:
classifier = KNeighborsRegressor(n_neighbors=5)
classifier.fit(X_train, y_train_binary)

y_predicted = classifier.predict(X_test).astype(int)

In [208]:
rmse = np.sqrt(mean_squared_error(y_test_binary, y_predicted))
rmse

0.24455038163703965

In [209]:
accuracy = accuracy_score(y_test_binary, y_predicted)
accuracy

0.9401951108411782

### Multiclass (buckets) Classification

In [None]:
parameters = {"n_neighbors": range(1, 10)}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train_buckets['binned_opt_value_code'])
gridsearch.best_params_

In [None]:
classifier = KNeighborsRegressor(n_neighbors=4)
classifier.fit(X_train, y_train_buckets['binned_opt_value_code'])

y_predicted = classifier.predict(X_test).astype(int)

In [None]:
rmse = np.sqrt(mean_squared_error(y_test_buckets['binned_opt_value_code'], y_predicted))
rmse

In [None]:
accuracy = accuracy_score(y_test_buckets['binned_opt_value_code'], y_predicted)
accuracy