In [1]:
import os
import pandas as pd
# Raise the DataFrame display limit to 250 columns.
pd.set_option('display.max_columns', 250)

import machine_learning as ml
from preprocessing import separate_features_target

In [2]:
# Input location: CSV files live under data/csv/ relative to the current working directory.
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

# CSV file names keyed by feature family. Within each list, index 0 is the
# balanced split and index 1 is the imbalanced split.
train = dict(stylometric=['style_train_balanced.csv', 'style_train_imbalanced.csv'])
test = dict(stylometric=['style_test_balanced.csv', 'style_test_imbalanced.csv'])

# Balanced Dataset

In [3]:
# Read the balanced train/test CSVs (index 0 of each file list).
# index_col=0 uses the first CSV column as the row index.
style_train_balanced_complete = pd.read_csv(
    os.path.join(csv_path, train['stylometric'][0]),
    index_col=0,
)
style_test_balanced_complete = pd.read_csv(
    os.path.join(csv_path, test['stylometric'][0]),
    index_col=0,
)

In [4]:
# Split each complete balanced frame with separate_features_target; the results
# are indexed by 'features' and 'target' in the training and test cells below.
style_train_balanced, style_test_balanced = map(
    separate_features_target,
    (style_train_balanced_complete, style_test_balanced_complete),
)

## Train

### Logistic Regression

In [5]:
# Train logistic regression on the balanced split. The helper's result is indexed
# by 'model' and 'scaler'; keep it under its own name instead of rebinding
# lr_style_balanced from the result container to the model.
# NOTE(review): the recorded output for this cell shows the solver stopping at its
# iteration limit (see the warning below the cell) -- the fit may not have converged.
lr_style_balanced_fit = ml.train_logistic_regression(
    style_train_balanced['features'],
    style_train_balanced['target'],
    show_train_accuracy=1,
)
lr_style_balanced = lr_style_balanced_fit['model']
lr_style_balanced_scaler = lr_style_balanced_fit['scaler']

Train accuracy: 0.8596881959910914


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [6]:
# Train a decision tree on the balanced training split.
dt_style_balanced = ml.train_decision_tree(
    style_train_balanced['features'],
    style_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.8611729769858946


### Random Forest

In [7]:
# Train a random forest on the balanced training split.
rf_style_balanced = ml.train_random_forest(
    style_train_balanced['features'],
    style_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.8904974016332591


### Gradient Boosting Tree

In [8]:
# Train gradient boosting on the balanced training split.
gb_style_balanced = ml.train_gradient_boost(
    style_train_balanced['features'],
    style_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9450631031922792


### Naive Bayes

In [9]:
# Train naive Bayes on the balanced split. As with logistic regression, the helper's
# result is indexed by 'model' and 'scaler', so it is kept under its own name.
# NOTE(review): remove_negatives=True presumably makes the helper handle negative
# feature values before fitting -- confirm against machine_learning.train_naive_bayes.
nb_style_balanced_fit = ml.train_naive_bayes(
    style_train_balanced['features'],
    style_train_balanced['target'],
    remove_negatives=True,
    show_train_accuracy=1,
)
nb_style_balanced = nb_style_balanced_fit['model']
nb_style_balanced_scaler = nb_style_balanced_fit['scaler']

Train accuracy: 0.8006681514476615


## Test

In [10]:
# Display labels and the fitted balanced models, listed in the same order.
# Both names are rebound later for the imbalanced models.
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]
models = [
    lr_style_balanced,
    dt_style_balanced,
    rf_style_balanced,
    gb_style_balanced,
    nb_style_balanced,
]

In [11]:
# Evaluate every balanced model on the balanced test split. The logistic-regression
# and naive-Bayes scalers are passed along as the last two arguments.
results_style_balanced = ml.multi_model_results(
    models,
    names,
    style_test_balanced['features'],
    style_test_balanced['target'],
    lr_style_balanced_scaler,
    nb_style_balanced_scaler,
)

In [12]:
# Show the per-model test metrics for the balanced dataset.
results_style_balanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.872404,0.880814,0.87069,0.875723,0.125767,0.12931,0.872461
Decision Tree,0.847181,0.863501,0.836207,0.849635,0.141104,0.163793,0.847551
Random Forest,0.857567,0.857955,0.867816,0.862857,0.153374,0.132184,0.857221
Gradient Boosting Tree,0.89911,0.9,0.905172,0.902579,0.107362,0.094828,0.898905
Naive Bayes,0.799703,0.829721,0.770115,0.798808,0.168712,0.229885,0.800702


# Imbalanced Dataset

In [13]:
style_train_balanced_complete = pd.read_csv(os.path.join(csv_path, train['stylometric'][1]), index_col=0)
style_test_balanced_complete = pd.read_csv(os.path.join(csv_path, test['stylometric'][1]), index_col=0)

In [14]:
style_train_imbalanced = separate_features_target(style_train_balanced_complete)
style_test_imbalanced = separate_features_target(style_test_balanced_complete)

## Train

### Logistic Regression

In [15]:
# Train logistic regression on the imbalanced split. The helper's result is indexed
# by 'model' and 'scaler'; keep it under its own name instead of rebinding
# lr_style_imbalanced from the result container to the model.
lr_style_imbalanced_fit = ml.train_logistic_regression(
    style_train_imbalanced['features'],
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)
lr_style_imbalanced = lr_style_imbalanced_fit['model']
lr_style_imbalanced_scaler = lr_style_imbalanced_fit['scaler']

Train accuracy: 0.9412758480875794


### Decision Tree

In [16]:
# Train a decision tree on the imbalanced training split.
dt_style_imbalanced = ml.train_decision_tree(
    style_train_imbalanced['features'],
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9497229355318286


### Random Forest

In [17]:
# Train a random forest on the imbalanced training split.
rf_style_imbalanced = ml.train_random_forest(
    style_train_imbalanced['features'],
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9326260305446682


### Gradient Boosting Tree

In [18]:
# Train gradient boosting on the imbalanced training split.
gb_style_imbalanced = ml.train_gradient_boost(
    style_train_imbalanced['features'],
    style_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9634410055412893


### Naive Bayes

In [19]:
# Train naive Bayes on the imbalanced split. The helper's result is indexed by
# 'model' and 'scaler', so it is kept under its own name.
# NOTE(review): remove_negatives=True presumably makes the helper handle negative
# feature values before fitting -- confirm against machine_learning.train_naive_bayes.
nb_style_imbalanced_fit = ml.train_naive_bayes(
    style_train_imbalanced['features'],
    style_train_imbalanced['target'],
    remove_negatives=True,
    show_train_accuracy=1,
)
nb_style_imbalanced = nb_style_imbalanced_fit['model']
nb_style_imbalanced_scaler = nb_style_imbalanced_fit['scaler']

Train accuracy: 0.9080281119070145


## Test

In [20]:
# Display labels and the fitted imbalanced models, listed in the same order.
# This rebinds the `models` and `names` used for the balanced evaluation.
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]
models = [
    lr_style_imbalanced,
    dt_style_imbalanced,
    rf_style_imbalanced,
    gb_style_imbalanced,
    nb_style_imbalanced,
]

In [21]:
# Evaluate every imbalanced model on the imbalanced test split. The logistic-regression
# and naive-Bayes scalers are passed along as the last two arguments.
results_style_imbalanced = ml.multi_model_results(
    models,
    names,
    style_test_imbalanced['features'],
    style_test_imbalanced['target'],
    lr_style_imbalanced_scaler,
    nb_style_imbalanced_scaler,
)

  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Show the per-model test metrics for the imbalanced dataset.
# NOTE(review): in the recorded run the Naive Bayes row has precision, recall and
# false-positive rate of 0.0 and a false-negative rate of 1.0, i.e. it predicted
# no positive samples on this test set; its accuracy is not meaningful on its own.
results_style_imbalanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.942703,0.78,0.481481,0.59542,0.013033,0.518519,0.734224
Decision Tree,0.947027,0.760163,0.57716,0.65614,0.017476,0.42284,0.779842
Random Forest,0.934865,0.988235,0.259259,0.410758,0.000296,0.740741,0.629482
Gradient Boosting Tree,0.958378,0.872807,0.614198,0.721014,0.00859,0.385802,0.802804
Naive Bayes,0.912432,0.0,0.0,0.0,0.0,1.0,0.5
