# Classification Model

**Purpose of script:**

- Test regression model suitability in data fusion context
- Test different regressors

## Data Prep

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

# custom functions:
from functions import read_and_prep_parquet, make_binary_labels, make_multiclass_labels, save_prediction_tif
#from functions import read_and_prep_parquet

In [2]:
df_path = r"../Data/combined/pandas_extended/"
out_path =  r"../Data/results/classification/"

In [3]:
# Train:
train_datelist = pd.date_range(start="2019-06-01",end="2019-06-30")
train_datelist = [str(day.date()) for day in train_datelist]

X_train_df_list = []
y_train_df_list = []

for day in train_datelist:
    try: # bc some days are empty
        X_train, y_train = read_and_prep_parquet(df_path + 'melt_' + day + '_extended.parquet.gzip', 'train')
        X_train_df_list.append(X_train)
        y_train_df_list.append(y_train)
    except:
        continue


X_train = pd.concat(X_train_df_list, axis=0)
y_train = pd.concat(y_train_df_list, axis=0)

del X_train_df_list
del y_train_df_list

In [4]:
# Test:

test_datelist = pd.date_range(start="2019-07-01",end="2019-07-14")
test_datelist = [str(day.date()) for day in test_datelist]

X_test_df_list = []
y_test_df_list = []

for day in test_datelist:
    try: # bc some days are empty
        X_train, y_train = read_and_prep_parquet(df_path + 'melt_' + day + '_extended.parquet.gzip', 'test')
        X_test_df_list.append(X_train)
        y_test_df_list.append(y_train)
    except:
        continue


X_test = pd.concat(X_test_df_list, axis=0)
y_test = pd.concat(y_test_df_list, axis=0)

del X_test_df_list
del y_test_df_list

## Decision Tree Classifier

### Binary Classification

In [56]:
classifier = DecisionTreeClassifier(random_state=0, criterion="log_loss")
classifier.fit(X_train, make_binary_labels(y_train))

y_predicted = classifier.predict(X_test)

In [57]:
accuracy = accuracy_score(make_binary_labels(y_test), y_predicted)
accuracy

0.8492624287097301

In [64]:
prediction_date = '2019-07-31' # do one next day (2019-07-15) and one for tha last day of July (2019-07-31)

# get accuracy
X_get_acc, y_get_acc = read_and_prep_parquet(df_path + 'melt_' + prediction_date + '_extended.parquet.gzip', 'validate')
y_get_acc = make_binary_labels(y_get_acc)
print(f'Nan percentage: {1-(len(X_get_acc)/(2663*1462))}')
y_predicted_get_acc = classifier.predict(X_get_acc)
X_get_acc['prediction'] = y_predicted_get_acc 
acc = accuracy_score(y_get_acc, y_predicted_get_acc)
print(f'Accuracy: {acc} for {prediction_date}')

# write prediction:
X_pred1 = read_and_prep_parquet(df_path + 'melt_' + prediction_date + '_extended.parquet.gzip', 'predict') 
y_predicted_out1 = classifier.predict(X_pred1)
path_out = out_path + 'decision_tree_classifier/' + 'dtrc_binary_' + prediction_date + '.tif'
save_prediction_tif(X_pred1, y_predicted_out1, path_out)

Nan percentage: 0.7577539499849228
Accuracy:0.644337308007948 for 2019-07-31


2278725it [04:44, 8009.55it/s] 


Nan percentage: 0.6277636538201723

Accuracy: 0.8433002352973648 for 2019-07-15

-- 

Nan percentage: 0.7577539499849228

Accuracy: 0.644337308007948 for 2019-07-31

### Multiclass (buckets) classification

In [66]:
classifier = DecisionTreeClassifier(random_state=0, criterion="log_loss")
classifier.fit(X_train, make_multiclass_labels(y_train)["binned_opt_value_code"])

y_predicted = classifier.predict(X_test)

In [74]:
accuracy = accuracy_score(make_multiclass_labels(y_test)["binned_opt_value_code"], y_predicted)
accuracy

0.5568754092536725

In [93]:
prediction_date = '2019-07-31' # do one next day (2019-07-15) and one for tha last day of July (2019-07-31)

# get accuracy
X_get_acc, y_get_acc = read_and_prep_parquet(df_path + 'melt_' + prediction_date + '_extended.parquet.gzip', 'validate')
y_get_acc = make_multiclass_labels(y_get_acc)["binned_opt_value_code"]
print(f'Nan percentage: {1-(len(X_get_acc)/(2663*1462))}')
y_predicted_get_acc = classifier.predict(X_get_acc)
X_get_acc['prediction'] = y_predicted_get_acc 
acc = accuracy_score(y_get_acc, y_predicted_get_acc)
print(f'Accuracy: {acc} for {prediction_date}')

# write prediction:
X_pred1 = read_and_prep_parquet(df_path + 'melt_' + prediction_date + '_extended.parquet.gzip', 'predict') 
y_predicted_out1 = classifier.predict(X_pred1)
path_out = out_path + 'decision_tree_classifier/' + 'dtc_multiclass_' + prediction_date + '.tif'
save_prediction_tif(X_pred1, y_predicted_out1, path_out)

Nan percentage: 0.7577539499849228
Accuracy:0.2862709380811716 for 2019-07-31


2278725it [05:48, 6531.88it/s] 


Nan percentage: 0.6277636538201723

Accuracy: 0.6226444387709336 for 2019-07-15

--

Nan percentage: 0.7577539499849228

Accuracy: 0.2862709380811716 for 2019-07-31


## Logistic Regression

### Binary Classification

In [5]:
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, make_binary_labels(y_train))

y_predicted = classifier.predict(X_test)

In [7]:
accuracy = accuracy_score(make_binary_labels(y_test), y_predicted)
accuracy

0.7967810303360895

In [10]:
prediction_date = '2019-07-31' # do one next day (2019-07-15) and one for tha last day of July (2019-07-31)

# get accuracy
X_get_acc, y_get_acc = read_and_prep_parquet(df_path + 'melt_' + prediction_date + '_extended.parquet.gzip', 'validate')
y_get_acc = make_binary_labels(y_get_acc)
print(f'Nan percentage: {1-(len(X_get_acc)/(2663*1462))}')
y_predicted_get_acc = classifier.predict(X_get_acc)
X_get_acc['prediction'] = y_predicted_get_acc 
acc = accuracy_score(y_get_acc, y_predicted_get_acc)
print(f'Accuracy: {acc} for {prediction_date}')

# write prediction:
X_pred1 = read_and_prep_parquet(df_path + 'melt_' + prediction_date + '_extended.parquet.gzip', 'predict') 
y_predicted_out1 = classifier.predict(X_pred1)
path_out = out_path + 'logistic_regression/' + 'logr_binary_' + prediction_date + '.tif'
save_prediction_tif(X_pred1, y_predicted_out1, path_out)

Nan percentage: 0.7577539499849228
Accuracy:0.42979076232746427 for 2019-07-31


2278725it [02:19, 16352.06it/s]


Nan percentage: 0.6277636538201723

Accuracy: 0.798468842074757 for 2019-07-15

-- 

Nan percentage: 0.7577539499849228

Accuracy: 0.42979076232746427 for 2019-07-31

### Multiclass (buckets) Classification

In [9]:
classifier = LogisticRegression(random_state=0, solver='liblinear')
classifier.fit(X_train, make_multiclass_labels(y_train)["binned_opt_value_code"])

y_predicted = classifier.predict(X_test)

In [10]:
y_predicted = classifier.predict(X_test)

In [11]:
accuracy = accuracy_score(make_multiclass_labels(y_test)["binned_opt_value_code"], y_predicted)
accuracy

0.5728542644471752

In [13]:
prediction_date = '2019-07-31' # do one next day (2019-07-15) and one for tha last day of July (2019-07-31)

# get accuracy
X_get_acc, y_get_acc = read_and_prep_parquet(df_path + 'melt_' + prediction_date + '_extended.parquet.gzip', 'validate')
y_get_acc = make_multiclass_labels(y_get_acc)["binned_opt_value_code"]
print(f'Nan percentage: {1-(len(X_get_acc)/(2663*1462))}')
y_predicted_get_acc = classifier.predict(X_get_acc)
X_get_acc['prediction'] = y_predicted_get_acc 
acc = accuracy_score(y_get_acc, y_predicted_get_acc)
print(f'Accuracy: {acc} for {prediction_date}')

# write prediction:
X_pred1 = read_and_prep_parquet(df_path + 'melt_' + prediction_date + '_extended.parquet.gzip', 'predict') 
y_predicted_out1 = classifier.predict(X_pred1)
path_out = out_path + 'logistic_regression/' + 'logr_multiclass_' + prediction_date + '.tif'
save_prediction_tif(X_pred1, y_predicted_out1, path_out)

Nan percentage: 0.7577539499849228
Accuracy:0.269387936866079 for 2019-07-31


2278725it [02:20, 16255.76it/s]


Nan percentage: 0.6277636538201723

Accuracy: 0.5146243177411453 for 2019-07-15

--

Nan percentage: 0.7577539499849228

Accuracy: 0.269387936866079 for 2019-07-31