# Classification Model

**Purpose of script:**

Load the feature-engineered data (tif) to investigate whether classification is a suitable approach and to test different models

- In: combined and extended dataframe (tif), read from the per-day `melt_<date>_extended.parquet.gzip` files
- Out: predictions (tif) for 2019-07-15 and 2019-07-31
    - decision tree classification with multiclass labels
    - decision tree classification with binary labels
    - logistic regression with multiclass labels
    - logistic regression with binary labels

imports

In [1]:
import os

import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# custom functions:
from functions import read_and_prep_parquet, make_binary_labels, make_multiclass_labels, save_prediction_tif

## Data Prep

Relevant Paths

In [2]:
# Folder the per-day extended dataframes are read from.
df_path = "../Data/combined/dataframe_extended/"
# Base folder the prediction files are written to.
out_path = "../Data/results/classification/"

Create the training set from all available days of June 2019

In [3]:
# Training days: every calendar day of June 2019 as an ISO date string.
train_datelist = pd.date_range(start="2019-06-01", end="2019-06-30")
train_datelist = [str(day.date()) for day in train_datelist]

X_train_df_list = []
y_train_df_list = []

for day in train_datelist:
    day_file = df_path + 'melt_' + day + '_extended.parquet.gzip'
    try:
        # Some days are missing in the data files (e.g. 2019-06-04).
        X_day, y_day = read_and_prep_parquet(day_file, 'train')
    except OSError as err:
        # Skip only I/O failures such as a missing file, and say so. A bare
        # ``except`` would also hide real bugs and keyboard interrupts.
        print(f"Skipping {day}: {err}")
        continue
    X_train_df_list.append(X_day)
    y_train_df_list.append(y_day)

# Fail with a clear message instead of pandas' generic concat error.
if not X_train_df_list:
    raise ValueError(f"No training data could be loaded from {df_path}")

X_train = pd.concat(X_train_df_list, axis=0)
y_train = pd.concat(y_train_df_list, axis=0)

# The per-day frames are no longer needed once they are concatenated.
del X_train_df_list
del y_train_df_list

Create the test set from July 1st to July 14th, 2019

In [4]:
# Test days: 2019-07-01 through 2019-07-14 as ISO date strings.
test_datelist = pd.date_range(start="2019-07-01", end="2019-07-14")
test_datelist = [str(day.date()) for day in test_datelist]

X_test_df_list = []
y_test_df_list = []

for day in test_datelist:
    day_file = df_path + 'melt_' + day + '_extended.parquet.gzip'
    try:
        # Individual days may be missing in the data files.
        X_day, y_day = read_and_prep_parquet(day_file, 'test')
    except OSError as err:
        # Skip only I/O failures such as a missing file, and say so. A bare
        # ``except`` would also hide real bugs and keyboard interrupts.
        print(f"Skipping {day}: {err}")
        continue
    X_test_df_list.append(X_day)
    y_test_df_list.append(y_day)

# Fail with a clear message instead of pandas' generic concat error.
if not X_test_df_list:
    raise ValueError(f"No test data could be loaded from {df_path}")

X_test = pd.concat(X_test_df_list, axis=0)
y_test = pd.concat(y_test_df_list, axis=0)

# The per-day frames are no longer needed once they are concatenated.
del X_test_df_list
del y_test_df_list

## Decision Tree Classifier

### Binary Classification

Train the binary classifier on the full training set (June 2019)

In [34]:
# Decision tree with the log-loss split criterion, fitted on binary labels
# derived from the training targets.
classifier = DecisionTreeClassifier(criterion="log_loss", random_state=0)
classifier.fit(X=X_train, y=make_binary_labels(y_train))

Calculate the accuracy on the full test set (July 1st to July 14th)

In [35]:
# Predict the test features and compare against the binary test labels.
y_predicted = classifier.predict(X=X_test)
accuracy = accuracy_score(y_true=make_binary_labels(y_test), y_pred=y_predicted)
accuracy

0.8492624287097301

Predict binary labels for 2019-07-15 and 2019-07-31 and calculate accuracy for each

In [36]:
# One run for the day after the test window (2019-07-15) and one for the
# last day of July (2019-07-31).
for prediction_date in ('2019-07-15', '2019-07-31'):
    parquet_file = f"{df_path}melt_{prediction_date}_extended.parquet.gzip"

    # get accuracy ('validate' mode yields features together with targets)
    X, y = read_and_prep_parquet(parquet_file, 'validate')
    y_predicted_binary = classifier.predict(X)
    acc = accuracy_score(make_binary_labels(y), y_predicted_binary)
    print(prediction_date)
    # NOTE(review): 2663*1462 is presumably the pixel count of the full
    # raster -- confirm.
    print(f'Missing data percentage: {1-(len(X)/(2663*1462))}')
    print(f'Accuracy: {acc}')

    # write prediction ('predict' mode yields the features only)
    X_predict = read_and_prep_parquet(parquet_file, 'predict')
    y_predicted_binary = classifier.predict(X_predict)
    path_out = f"{out_path}decision_tree_classifier/dtrc_binary_{prediction_date}.tif"
    save_prediction_tif(X_predict, y_predicted_binary, path_out)

    print()

2019-07-15
Missing data percentage: 0.6277636538201723
Accuracy: 0.8433002352973648
Nan percentage: 0.36401715871814283


2278725it [02:01, 18694.81it/s]



2019-07-31
Missing data percentage: 0.7577539499849228
Accuracy: 0.644337308007948
Nan percentage: 0.5861115316679283


2278725it [02:08, 17667.01it/s]







### Multiclass (buckets) classification

Train the multiclass classifier on the full training set (June 2019)

In [6]:
# Decision tree with the log-loss split criterion, fitted on the bucket codes
# ("binned_opt_value_code") derived from the training targets.
classifier = DecisionTreeClassifier(criterion="log_loss", random_state=0)
classifier.fit(X=X_train, y=make_multiclass_labels(y_train)["binned_opt_value_code"])

Calculate the accuracy on the full test set (July 1st to July 14th)

In [74]:
# Predict the test features and compare against the test bucket codes.
y_predicted = classifier.predict(X=X_test)
accuracy = accuracy_score(
    y_true=make_multiclass_labels(y_test)["binned_opt_value_code"],
    y_pred=y_predicted,
)
accuracy

0.5568754092536725

Predict multiclass labels for 2019-07-15 and 2019-07-31 and calculate accuracy for each

In [7]:
# One run for the day after the test window (2019-07-15) and one for the
# last day of July (2019-07-31).
for prediction_date in ('2019-07-15', '2019-07-31'):
    parquet_file = f"{df_path}melt_{prediction_date}_extended.parquet.gzip"

    # get accuracy ('validate' mode yields features together with targets)
    X, y = read_and_prep_parquet(parquet_file, 'validate')
    y_predicted_multiclass = classifier.predict(X)
    acc = accuracy_score(make_multiclass_labels(y)["binned_opt_value_code"], y_predicted_multiclass)
    print(prediction_date)
    # NOTE(review): 2663*1462 is presumably the pixel count of the full
    # raster -- confirm.
    print(f'Missing data percentage: {1-(len(X)/(2663*1462))}')
    print(f'Accuracy: {acc}')

    # write prediction ('predict' mode yields the features only)
    X_predict = read_and_prep_parquet(parquet_file, 'predict')
    y_predicted_multiclass = classifier.predict(X_predict)
    path_out = f"{out_path}decision_tree_classifier/dtrc_multiclass_{prediction_date}.tif"
    save_prediction_tif(X_predict, y_predicted_multiclass, path_out)

    print()


2019-07-15
Missing data percentage: 0.6277636538201723
Accuracy: 0.4125949642223802
Nan percentage: 0.36401715871814283

2019-07-31
Missing data percentage: 0.7577539499849228
Accuracy: 0.303987327411259
Nan percentage: 0.5861115316679283



"OLD" VALUES:

Nan percentage: 0.6277636538201723

Accuracy: 0.6226444387709336 for 2019-07-15

--

Nan percentage: 0.7577539499849228

Accuracy: 0.2862709380811716 for 2019-07-31


## Logistic Regression

### Binary Classification

Train the binary classifier on the full training set (June 2019)

In [19]:
# Logistic regression fitted on binary labels derived from the training
# targets; replaces the previously fitted classifier.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=make_binary_labels(y_train))

Calculate the accuracy on the full test set (July 1st to July 14th)

In [20]:
# Predict the test features and compare against the binary test labels.
y_predicted = classifier.predict(X=X_test)
accuracy = accuracy_score(y_true=make_binary_labels(y_test), y_pred=y_predicted)
accuracy

0.7690444825617601

Predict binary labels for 2019-07-15 and 2019-07-31 and calculate accuracy for each

In [22]:
# One run for the day after the test window (2019-07-15) and one for the
# last day of July (2019-07-31).
for prediction_date in ['2019-07-15', '2019-07-31']:
    parquet_file = df_path + 'melt_' + prediction_date + '_extended.parquet.gzip'

    # get accuracy ('validate' mode yields features together with targets)
    X, y = read_and_prep_parquet(parquet_file, 'validate')
    y_predicted_binary = classifier.predict(X)
    acc = accuracy_score(make_binary_labels(y), y_predicted_binary)
    print(prediction_date)
    # NOTE(review): 2663*1462 is presumably the pixel count of the full
    # raster -- confirm.
    print(f'Missing data percentage: {1-(len(X)/(2663*1462))}')
    print(f'Accuracy: {acc}')

    # write prediction ('predict' mode yields the features only)
    X_predict = read_and_prep_parquet(parquet_file, 'predict')
    y_predicted_binary = classifier.predict(X_predict)
    # Logistic regression gets its own folder and file prefix so that it
    # cannot overwrite the decision tree predictions.
    path_out = out_path + 'logistic_regression/' + 'logr_binary_' + prediction_date + '.tif'
    # Make sure the target folder exists before writing.
    os.makedirs(os.path.dirname(path_out), exist_ok=True)
    save_prediction_tif(X_predict, y_predicted_binary, path_out)

    print()

2019-07-15
Missing data percentage: 0.6277636538201723
Accuracy: 0.7505068208634931
Nan percentage: 0.36401715871814283

2019-07-31
Missing data percentage: 0.7577539499849228
Accuracy: 0.3668582964529051
Nan percentage: 0.5861115316679283



### Multiclass (buckets) Classification

Train the multiclass classifier on the full training set (June 2019)

In [8]:
# Logistic regression (liblinear solver) fitted on the bucket codes
# ("binned_opt_value_code") derived from the training targets.
classifier = LogisticRegression(solver='liblinear', random_state=0)
classifier.fit(X=X_train, y=make_multiclass_labels(y_train)["binned_opt_value_code"])

Calculate the accuracy on the full test set (July 1st to July 14th)

In [11]:
# Predict the test features and compare against the test bucket codes.
y_predicted = classifier.predict(X=X_test)
accuracy = accuracy_score(
    y_true=make_multiclass_labels(y_test)["binned_opt_value_code"],
    y_pred=y_predicted,
)
accuracy

0.5728542644471752

Predict multiclass labels for 2019-07-15 and 2019-07-31 and calculate accuracy for each

In [None]:
# One run for the day after the test window (2019-07-15) and one for the
# last day of July (2019-07-31).
for prediction_date in ['2019-07-15', '2019-07-31']:
    parquet_file = df_path + 'melt_' + prediction_date + '_extended.parquet.gzip'

    # get accuracy ('validate' mode yields features together with targets)
    X, y = read_and_prep_parquet(parquet_file, 'validate')
    y_predicted_multiclass = classifier.predict(X)
    acc = accuracy_score(make_multiclass_labels(y)["binned_opt_value_code"], y_predicted_multiclass)
    print(prediction_date)
    # NOTE(review): 2663*1462 is presumably the pixel count of the full
    # raster -- confirm.
    print(f'Missing data percentage: {1-(len(X)/(2663*1462))}')
    print(f'Accuracy: {acc}')

    # write prediction ('predict' mode yields the features only)
    X_predict = read_and_prep_parquet(parquet_file, 'predict')
    y_predicted_multiclass = classifier.predict(X_predict)
    # Logistic regression gets its own folder and file prefix so that it
    # cannot overwrite the decision tree predictions.
    path_out = out_path + 'logistic_regression/' + 'logr_multiclass_' + prediction_date + '.tif'
    # Make sure the target folder exists before writing.
    os.makedirs(os.path.dirname(path_out), exist_ok=True)
    # With a model-specific path the result can be saved safely, so the
    # prediction computed above is no longer discarded.
    save_prediction_tif(X_predict, y_predicted_multiclass, path_out)

    print()


"OLD" VALUES:

Nan percentage: 0.6277636538201723

Accuracy: 0.5146243177411453 for 2019-07-15

--

Nan percentage: 0.7577539499849228

Accuracy: 0.269387936866079 for 2019-07-31