In [1]:
import os
import pandas as pd
pd.options.display.max_columns = 250

import machine_learning as ml

from ast import literal_eval

In [2]:
# Directory that holds the pre-computed feature CSVs.
csv_path = './data/csv/'

# CSV file names for each feature representation, keyed by representation name.
# Each list holds one file per dataset variant; the loading cell visible in this
# notebook reads position 0.
train = dict(
    tfidf_sel=['tfidf_chi2_train_1.csv', 'tfidf_chi2_train_2.csv'],
    word2vec=['word2vec_train_1.csv', 'word2vec_train_2.csv'],
)
test = dict(
    tfidf_sel=['tfidf_chi2_test_1.csv', 'tfidf_chi2_test_2.csv'],
    word2vec=['word2vec_test_1.csv', 'word2vec_test_2.csv'],
)

# Balanced Dataset

## Train TF-IDF

In [3]:
# Load the first chi2-selected TF-IDF train/test CSVs (list position 0 in the file maps).
# The first CSV column is used as the DataFrame index. Train is read before test.
tfidf_train_1, tfidf_test_1 = (
    pd.read_csv(os.path.join(csv_path, split_files['tfidf_sel'][0]), index_col=0)
    for split_files in (train, test)
)

In [4]:
# Split each loaded frame into the object the later cells index with
# ['features'] and ['target'].
#
# The names are rebound in place, so without a guard, re-running this cell alone
# would pass the already-split result back into the helper. Only convert while
# the name still refers to the raw DataFrame produced by the loading cell.
# NOTE(review): this assumes ml.separate_features_target returns something other
# than a DataFrame (e.g. a dict) — confirm. If it does return a DataFrame, the
# guard is always true and the cell behaves exactly as it did before.
if isinstance(tfidf_train_1, pd.DataFrame):
    tfidf_train_1 = ml.separate_features_target(tfidf_train_1)
if isinstance(tfidf_test_1, pd.DataFrame):
    tfidf_test_1 = ml.separate_features_target(tfidf_test_1)

### Logistic Regression

In [5]:
# Fit logistic regression on the TF-IDF training split.
# show_train_accuracy=1 is passed through to the helper; the training accuracy
# appears in this cell's output.
lr = ml.train_logistic_regression(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9944893460690669


### Decision Tree

In [6]:
# Fit a decision tree on the TF-IDF training split.
# show_train_accuracy=1 is passed through to the helper; the training accuracy
# appears in this cell's output.
dt = ml.train_decision_tree(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9357090374724467


### Random Forest

In [7]:
# Fit a random forest on the TF-IDF training split.
# show_train_accuracy=1 is passed through to the helper; the training accuracy
# appears in this cell's output.
rf = ml.train_random_forest(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9474650991917708


### Gradient Boosting Tree

In [8]:
# Fit a gradient boosting model on the TF-IDF training split.
# show_train_accuracy=1 is passed through to the helper; the training accuracy
# appears in this cell's output.
gb = ml.train_gradient_boost(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9757531227038942


### Naive Bayes

In [9]:
# Fit naive Bayes on the TF-IDF training split.
# show_train_accuracy=1 is passed through to the helper; the training accuracy
# appears in this cell's output.
nb = ml.train_naive_bayes(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9717119764878765


## Results

### TF-IDF

In [10]:
# Each display label is written next to its fitted model so the two lists
# cannot drift out of order. zip() transposes the pairs into one sequence of
# labels and one of models, and each is turned into a list.
names, models = (
    list(column)
    for column in zip(
        ('Logistic Regression', lr),
        ('Decision Tree', dt),
        ('Random Forest', rf),
        ('Gradient Boosting Tree', gb),
        ('Naive Bayes', nb),
    )
)

In [11]:
# Evaluate every fitted model on the TF-IDF test split. `names` supplies the
# label for each entry of `models`, in the same order.
results_tfidf = ml.multi_model_results(
    models,
    names,
    tfidf_test_1['features'],
    tfidf_test_1['target'],
)

In [12]:
# Show the per-model metrics computed on the TF-IDF test split (one row per model).
results_tfidf

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.958884,0.964179,0.952802,0.958457,0.017621,0.023495,0.958857
Decision Tree,0.917768,0.954984,0.876106,0.913846,0.020558,0.061674,0.917585
Random Forest,0.933921,0.962264,0.902655,0.931507,0.017621,0.048458,0.933784
Gradient Boosting Tree,0.939794,0.968553,0.908555,0.937595,0.014684,0.045521,0.939657
Naive Bayes,0.963289,0.959064,0.967552,0.963289,0.020558,0.016153,0.963308
