In [1]:
import os
import pandas as pd
pd.options.display.max_columns = 250

import machine_learning as ml

from ast import literal_eval

In [2]:
# Directory holding the pre-computed feature CSVs, plus the file names
# available for each feature representation (keyed by representation).
csv_path = './data/csv/'

train = dict(
    tfidf_sel=['tfidf_chi2_train_1.csv', 'tfidf_chi2_train_2.csv'],
    word2vec=['word2vec_train_1.csv', 'word2vec_train_2.csv'],
)
test = dict(
    tfidf_sel=['tfidf_chi2_test_1.csv', 'tfidf_chi2_test_2.csv'],
    word2vec=['word2vec_test_1.csv', 'word2vec_test_2.csv'],
)

# Balanced Dataset

## Train TF-IDF

In [3]:
# Read the first TF-IDF train/test CSVs (entry 0 of the 'tfidf_sel' lists),
# using the first CSV column as the DataFrame index.
tfidf_train_1 = pd.read_csv(
    os.path.join(csv_path, train['tfidf_sel'][0]),
    index_col=0,
)
tfidf_test_1 = pd.read_csv(
    os.path.join(csv_path, test['tfidf_sel'][0]),
    index_col=0,
)

In [4]:
# Split each loaded frame into the parts later read as ['features'] and
# ['target']. The names are rebound to the split result, so only split while
# they still hold the raw DataFrame from pd.read_csv: this keeps the cell safe
# to re-run on its own instead of feeding already-split data back into the
# helper.
# NOTE(review): assumes ml.separate_features_target returns something other
# than a DataFrame (e.g. a dict) -- if it does return a DataFrame the guard is
# always true and the behaviour is unchanged from before.
if isinstance(tfidf_train_1, pd.DataFrame):
    tfidf_train_1 = ml.separate_features_target(tfidf_train_1)
if isinstance(tfidf_test_1, pd.DataFrame):
    tfidf_test_1 = ml.separate_features_target(tfidf_test_1)

### Logistic Regression

In [5]:
# Fit logistic regression on the TF-IDF training split; with
# show_train_accuracy=1 the cell output reports the training accuracy.
lr_tfidf = ml.train_logistic_regression(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9944893460690669


### Decision Tree

In [6]:
# Fit the decision-tree model on the TF-IDF training split and report its
# training accuracy in the cell output.
dt_tfidf = ml.train_decision_tree(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9357090374724467


### Random Forest

In [7]:
# Fit the random-forest model on the TF-IDF training split and report its
# training accuracy in the cell output.
rf_tfidf = ml.train_random_forest(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9474650991917708


### Gradient Boosting Tree

In [8]:
# Fit the gradient-boosting model on the TF-IDF training split and report its
# training accuracy in the cell output.
gb_tfidf = ml.train_gradient_boost(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9757531227038942


### Naive Bayes

In [9]:
# Fit the naive-Bayes model on the TF-IDF training split and report its
# training accuracy in the cell output.
nb_tfidf = ml.train_naive_bayes(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9717119764878765


## Train Word2Vec

In [10]:
# Read the first Word2Vec train/test CSVs (entry 0 of the 'word2vec' lists),
# using the first CSV column as the DataFrame index.
word2vec_train_1 = pd.read_csv(
    os.path.join(csv_path, train['word2vec'][0]),
    index_col=0,
)
word2vec_test_1 = pd.read_csv(
    os.path.join(csv_path, test['word2vec'][0]),
    index_col=0,
)

In [11]:
# Split each loaded frame into the parts later read as ['features'] and
# ['target']. The names are rebound to the split result, so only split while
# they still hold the raw DataFrame from pd.read_csv: this keeps the cell safe
# to re-run on its own instead of feeding already-split data back into the
# helper.
# NOTE(review): assumes ml.separate_features_target returns something other
# than a DataFrame (e.g. a dict) -- if it does return a DataFrame the guard is
# always true and the behaviour is unchanged from before.
if isinstance(word2vec_train_1, pd.DataFrame):
    word2vec_train_1 = ml.separate_features_target(word2vec_train_1)
if isinstance(word2vec_test_1, pd.DataFrame):
    word2vec_test_1 = ml.separate_features_target(word2vec_test_1)

### Logistic Regression

In [12]:
# Fit logistic regression on the Word2Vec training split and report its
# training accuracy in the cell output.
# NOTE(review): the recorded output of this cell contains a scikit-learn
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warning, i.e. the solver stopped at
# its iteration cap. Scaling the features or raising max_iter inside
# ml.train_logistic_regression may be needed -- TODO confirm.
lr_word2vec = ml.train_logistic_regression(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9919177075679647


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [13]:
# Fit the decision-tree model on the Word2Vec training split and report its
# training accuracy in the cell output.
dt_word2vec = ml.train_decision_tree(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9856722997795738


### Random Forest

In [14]:
# Fit the random-forest model on the Word2Vec training split and report its
# training accuracy in the cell output.
rf_word2vec = ml.train_random_forest(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9805290227773695


### Gradient Boosting Tree

In [15]:
# Fit the gradient-boosting model on the Word2Vec training split and report
# its training accuracy in the cell output.
gb_word2vec = ml.train_gradient_boost(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9955914768552535


### Naive Bayes

In [16]:
# Fit the naive-Bayes model on the Word2Vec training split and report its
# training accuracy in the cell output.
# remove_negatives=True is passed only here (the TF-IDF naive-Bayes call omits
# it); presumably because Word2Vec vectors can contain negative values that
# the naive-Bayes variant used by the helper cannot accept -- TODO confirm in
# machine_learning, including whether the same transform reaches the test set.
nb_word2vec = ml.train_naive_bayes(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    remove_negatives=True,
    show_train_accuracy=1,
)

Train accuracy: 0.9434239529757531


## Results

### TF-IDF

In [17]:
# Display names and the fitted TF-IDF models, listed in the same order so
# they can be passed together to the evaluation helper.
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]
models = [
    lr_tfidf,
    dt_tfidf,
    rf_tfidf,
    gb_tfidf,
    nb_tfidf,
]

In [18]:
# Evaluate every TF-IDF model on the held-out TF-IDF test split and collect
# the metrics into a single results object.
results_tfidf = ml.multi_model_results(
    models,
    names,
    tfidf_test_1['features'],
    tfidf_test_1['target'],
)

In [19]:
# Show the TF-IDF test-set metrics (one row per model) as the cell output.
results_tfidf

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.9442,0.980831,0.905605,0.941718,0.008811,0.04699,0.94403
Decision Tree,0.917768,0.954984,0.876106,0.913846,0.020558,0.061674,0.917585
Random Forest,0.933921,0.962264,0.902655,0.931507,0.017621,0.048458,0.933784
Gradient Boosting Tree,0.939794,0.968553,0.908555,0.937595,0.014684,0.045521,0.939657
Naive Bayes,0.963289,0.959064,0.967552,0.963289,0.020558,0.016153,0.963308


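### Word2Vec

**Note:** On the Word2Vec features, Logistic Regression and Naive Bayes reach about 0.99 and 0.94 training accuracy but only about 0.54 and 0.62 test accuracy (false positive rates of about 0.44 and 0.38), while the tree-based models score above 0.96 on the same test set. The Logistic Regression fit also stopped at the solver's iteration limit. TODO: investigate before relying on these two rows; possible causes include unscaled features or a preprocessing step applied to the training data but not to the test data.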

In [20]:
# Display names and the fitted Word2Vec models, listed in the same order so
# they can be passed together to the evaluation helper.
# NOTE: `models` and `names` are rebound here; earlier in the notebook they
# held the TF-IDF models, so the TF-IDF evaluation cell must not be re-run
# after this one without first re-running the cell that builds its lists.
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]
models = [
    lr_word2vec,
    dt_word2vec,
    rf_word2vec,
    gb_word2vec,
    nb_word2vec,
]

In [21]:
# Evaluate every Word2Vec model on the held-out Word2Vec test split and
# collect the metrics into a single results object.
results_word2vec = ml.multi_model_results(
    models,
    names,
    word2vec_test_1['features'],
    word2vec_test_1['target'],
)

In [22]:
# Show the Word2Vec test-set metrics (one row per model) as the cell output.
results_word2vec

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.537445,0.519048,0.964602,0.674923,0.444934,0.017621,0.539318
Decision Tree,0.960352,0.945714,0.976401,0.960813,0.0279,0.011747,0.960423
Random Forest,0.963289,0.959064,0.967552,0.963289,0.020558,0.016153,0.963308
Gradient Boosting Tree,0.964758,0.95389,0.976401,0.965015,0.023495,0.011747,0.964809
Naive Bayes,0.619677,0.567114,0.99705,0.722995,0.378855,0.001468,0.621332
