In [1]:
import os
import pandas as pd
pd.options.display.max_columns = 250

import machine_learning as ml
from preprocessing import separate_features_target

from ast import literal_eval

In [2]:
# Path
# CSV inputs are looked up under data/csv/ in the current working directory.
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/csv/')

# CSV file names per feature set: index 0 is the balanced dataset,
# index 1 the imbalanced one.
train = dict(
    tfidf_sel=[
        'tfidf_chi2_train_balanced.csv',
        'tfidf_chi2_train_imbalanced.csv',
    ],
    word2vec=[
        'word2vec_train_balanced.csv',
        'word2vec_train_imbalanced.csv',
    ],
)
test = dict(
    tfidf_sel=[
        'tfidf_chi2_test_balanced.csv',
        'tfidf_chi2_test_imbalanced.csv',
    ],
    word2vec=[
        'word2vec_test_balanced.csv',
        'word2vec_test_imbalanced.csv',
    ],
)

# Balanced Dataset

## Train TF-IDF

In [3]:
# Load the balanced TF-IDF train and test CSVs (entry 0 of each file list);
# the first CSV column becomes the DataFrame index.
tfidf_train_balanced, tfidf_test_balanced = (
    pd.read_csv(os.path.join(csv_path, file_names['tfidf_sel'][0]), index_col=0)
    for file_names in (train, test)
)

In [4]:
# Replace each loaded frame with the result of separate_features_target, which
# later cells index by 'features' and 'target'.
# NOTE: the names are rebound in place, so re-running this cell on its own
# passes the already-split result back in; re-run the load cell first.
tfidf_train_balanced, tfidf_test_balanced = (
    separate_features_target(tfidf_train_balanced),
    separate_features_target(tfidf_test_balanced),
)

### Logistic Regression

In [5]:
# Train logistic regression on the balanced TF-IDF training split.
# The trainer's result is indexed by 'model' and 'scaler'; keep that raw result
# under its own name so `lr_tfidf_balanced` only ever refers to the model
# (previously it was first bound to the result and then rebound).
lr_tfidf_balanced_fit = ml.train_logistic_regression(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)
lr_tfidf_balanced = lr_tfidf_balanced_fit['model']
# The scaler is handed to ml.multi_model_results when scoring on the test split.
lr_tfidf_balanced_scaler = lr_tfidf_balanced_fit['scaler']

Train accuracy: 0.9911039223615042


### Decision Tree

In [6]:
# Train a decision tree on the balanced TF-IDF training split;
# show_train_accuracy=1 prints the training accuracy.
dt_tfidf_balanced = ml.train_decision_tree(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9255964415689446


### Random Forest

In [7]:
# Train a random forest on the balanced TF-IDF training split;
# show_train_accuracy=1 prints the training accuracy.
rf_tfidf_balanced = ml.train_random_forest(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9397492923574605


### Gradient Boosting Tree

In [8]:
# Train gradient boosting on the balanced TF-IDF training split;
# show_train_accuracy=1 prints the training accuracy.
gb_tfidf_balanced = ml.train_gradient_boost(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.976546704407602


### Naive Bayes

In [9]:
# Train naive Bayes on the balanced TF-IDF training split;
# show_train_accuracy=1 prints the training accuracy.
nb_tfidf_balanced = ml.train_naive_bayes(
    tfidf_train_balanced['features'],
    tfidf_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9627982207844723


## Train Word2Vec

In [10]:
# Load the balanced Word2Vec train and test CSVs (entry 0 of each file list);
# the first CSV column becomes the DataFrame index.
word2vec_train_balanced, word2vec_test_balanced = (
    pd.read_csv(os.path.join(csv_path, file_names['word2vec'][0]), index_col=0)
    for file_names in (train, test)
)

In [11]:
# Replace each loaded frame with the result of separate_features_target, which
# later cells index by 'features' and 'target'.
# NOTE: the names are rebound in place, so re-running this cell on its own
# passes the already-split result back in; re-run the load cell first.
word2vec_train_balanced, word2vec_test_balanced = (
    separate_features_target(word2vec_train_balanced),
    separate_features_target(word2vec_test_balanced),
)

### Logistic Regression

In [12]:
# Train logistic regression on the balanced Word2Vec training split.
# The trainer's result is indexed by 'model' and 'scaler'; keep that raw result
# under its own name so `lr_word2vec_balanced` only ever refers to the model
# (previously it was first bound to the result and then rebound).
lr_word2vec_balanced_fit = ml.train_logistic_regression(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    show_train_accuracy=1,
)
lr_word2vec_balanced = lr_word2vec_balanced_fit['model']
# The scaler is handed to ml.multi_model_results when scoring on the test split.
lr_word2vec_balanced_scaler = lr_word2vec_balanced_fit['scaler']

Train accuracy: 0.9951475940153659


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [13]:
# Train a decision tree on the balanced Word2Vec training split;
# show_train_accuracy=1 prints the training accuracy.
dt_word2vec_balanced = ml.train_decision_tree(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9781641730691468


### Random Forest

In [14]:
# Train a random forest on the balanced Word2Vec training split;
# show_train_accuracy=1 prints the training accuracy.
rf_word2vec_balanced = ml.train_random_forest(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.978568540234533


### Gradient Boosting Tree

In [15]:
# Train gradient boosting on the balanced Word2Vec training split;
# show_train_accuracy=1 prints the training accuracy.
gb_word2vec_balanced = ml.train_gradient_boost(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9943388596845936


### Naive Bayes

In [16]:
# Train naive Bayes on the balanced Word2Vec training split.
# Unlike the TF-IDF runs, remove_negatives=True is passed here.
# NOTE(review): presumably this deals with negative embedding values before
# fitting; confirm the exact handling in machine_learning.train_naive_bayes.
nb_word2vec_balanced = ml.train_naive_bayes(
    word2vec_train_balanced['features'],
    word2vec_train_balanced['target'],
    remove_negatives=True,
    show_train_accuracy=1,
)

Train accuracy: 0.9320663162151234


## Results

### TF-IDF

In [17]:
# Balanced TF-IDF models to evaluate, with display names in the same order.
# NOTE: `models` and `names` are generic names reassigned by every results
# section; run this cell immediately before the evaluation cell below.
models = [
    lr_tfidf_balanced,
    dt_tfidf_balanced,
    rf_tfidf_balanced,
    gb_tfidf_balanced,
    nb_tfidf_balanced,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [18]:
# Score the current `models` on the balanced TF-IDF test split. The scaler
# returned by the logistic-regression trainer is passed as the last argument.
results_tfidf = ml.multi_model_results(
    models,
    names,
    tfidf_test_balanced['features'],
    tfidf_test_balanced['target'],
    lr_tfidf_balanced_scaler,
)

In [19]:
# Display the metrics computed on the balanced TF-IDF test split.
results_tfidf

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.951535,0.953333,0.94702,0.950166,0.044164,0.05298,0.951428
Decision Tree,0.914378,0.936842,0.884106,0.90971,0.056782,0.115894,0.913662
Random Forest,0.928918,0.944828,0.907285,0.925676,0.050473,0.092715,0.928406
Gradient Boosting Tree,0.946688,0.946844,0.943709,0.945274,0.050473,0.056291,0.946618
Naive Bayes,0.956381,0.931034,0.983444,0.956522,0.069401,0.016556,0.957022


### Word2Vec

In [20]:
# Balanced Word2Vec models to evaluate, with display names in the same order.
# NOTE: `models` and `names` are generic names reassigned by every results
# section; run this cell immediately before the evaluation cell below.
models = [
    lr_word2vec_balanced,
    dt_word2vec_balanced,
    rf_word2vec_balanced,
    gb_word2vec_balanced,
    nb_word2vec_balanced,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [21]:
# Score the current `models` on the balanced Word2Vec test split. The scaler
# returned by the logistic-regression trainer is passed as the last argument.
results_word2vec = ml.multi_model_results(
    models,
    names,
    word2vec_test_balanced['features'],
    word2vec_test_balanced['target'],
    lr_word2vec_balanced_scaler,
)

In [22]:
# Display the metrics computed on the balanced Word2Vec test split.
results_word2vec

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.961228,0.945513,0.976821,0.960912,0.053628,0.023179,0.961597
Decision Tree,0.951535,0.947368,0.953642,0.950495,0.050473,0.046358,0.951585
Random Forest,0.969305,0.954984,0.983444,0.969005,0.044164,0.016556,0.96964
Gradient Boosting Tree,0.972536,0.961165,0.983444,0.972177,0.037855,0.016556,0.972794
Naive Bayes,0.634895,0.572797,0.990066,0.725728,0.70347,0.009934,0.643298


# Imbalanced Dataset

## Train TF-IDF

In [23]:
# Load the imbalanced TF-IDF train and test CSVs (entry 1 of each file list);
# the first CSV column becomes the DataFrame index.
tfidf_train_imbalanced, tfidf_test_imbalanced = (
    pd.read_csv(os.path.join(csv_path, file_names['tfidf_sel'][1]), index_col=0)
    for file_names in (train, test)
)

In [24]:
# Replace each loaded frame with the result of separate_features_target, which
# later cells index by 'features' and 'target'.
# NOTE: the names are rebound in place, so re-running this cell on its own
# passes the already-split result back in; re-run the load cell first.
tfidf_train_imbalanced, tfidf_test_imbalanced = (
    separate_features_target(tfidf_train_imbalanced),
    separate_features_target(tfidf_test_imbalanced),
)

### Logistic Regression

In [25]:
# Train logistic regression on the imbalanced TF-IDF training split.
# The trainer's result is indexed by 'model' and 'scaler'; keep that raw result
# under its own name so `lr_tfidf_imbalanced` only ever refers to the model
# (previously it was first bound to the result and then rebound).
lr_tfidf_imbalanced_fit = ml.train_logistic_regression(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)
lr_tfidf_imbalanced = lr_tfidf_imbalanced_fit['model']
# The scaler is handed to ml.multi_model_results when scoring on the test split.
lr_tfidf_imbalanced_scaler = lr_tfidf_imbalanced_fit['scaler']

Train accuracy: 0.9907271121577863


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [26]:
# Train a decision tree on the imbalanced TF-IDF training split;
# show_train_accuracy=1 prints the training accuracy.
dt_tfidf_imbalanced = ml.train_decision_tree(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.966588166028849


### Random Forest

In [27]:
# Train a random forest on the imbalanced TF-IDF training split;
# show_train_accuracy=1 prints the training accuracy.
rf_tfidf_imbalanced = ml.train_random_forest(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9514277303503091


### Gradient Boosting Tree

In [28]:
# Train gradient boosting on the imbalanced TF-IDF training split;
# show_train_accuracy=1 prints the training accuracy.
gb_tfidf_imbalanced = ml.train_gradient_boost(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9840300264939653


### Naive Bayes

In [29]:
# Train naive Bayes on the imbalanced TF-IDF training split;
# show_train_accuracy=1 prints the training accuracy.
nb_tfidf_imbalanced = ml.train_naive_bayes(
    tfidf_train_imbalanced['features'],
    tfidf_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9765969973506035


## Train Word2Vec

In [30]:
# Load the imbalanced Word2Vec train and test CSVs (entry 1 of each file list);
# the first CSV column becomes the DataFrame index.
word2vec_train_imbalanced, word2vec_test_imbalanced = (
    pd.read_csv(os.path.join(csv_path, file_names['word2vec'][1]), index_col=0)
    for file_names in (train, test)
)

In [31]:
# Replace each loaded frame with the result of separate_features_target, which
# later cells index by 'features' and 'target'.
# NOTE: the names are rebound in place, so re-running this cell on its own
# passes the already-split result back in; re-run the load cell first.
word2vec_train_imbalanced, word2vec_test_imbalanced = (
    separate_features_target(word2vec_train_imbalanced),
    separate_features_target(word2vec_test_imbalanced),
)

### Logistic Regression

In [32]:
# Train logistic regression on the imbalanced Word2Vec training split.
# The trainer's result is indexed by 'model' and 'scaler'; keep that raw result
# under its own name so `lr_word2vec_imbalanced` only ever refers to the model
# (previously it was first bound to the result and then rebound).
lr_word2vec_imbalanced_fit = ml.train_logistic_regression(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    show_train_accuracy=1,
)
lr_word2vec_imbalanced = lr_word2vec_imbalanced_fit['model']
# The scaler is handed to ml.multi_model_results when scoring on the test split.
lr_word2vec_imbalanced_scaler = lr_word2vec_imbalanced_fit['scaler']

Train accuracy: 0.9871945834559905


### Decision Tree

In [33]:
# Train a decision tree on the imbalanced Word2Vec training split;
# show_train_accuracy=1 prints the training accuracy.
dt_word2vec_imbalanced = ml.train_decision_tree(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9798351486605829


### Random Forest

In [34]:
# Train a random forest on the imbalanced Word2Vec training split;
# show_train_accuracy=1 prints the training accuracy.
rf_word2vec_imbalanced = ml.train_random_forest(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9813806299676184


### Gradient Boosting Tree

In [35]:
# Train gradient boosting on the imbalanced Word2Vec training split;
# show_train_accuracy=1 prints the training accuracy.
gb_word2vec_imbalanced = ml.train_gradient_boost(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9938180747718576


### Naive Bayes

In [36]:
# Train naive Bayes on the imbalanced Word2Vec training split.
# Unlike the TF-IDF runs, remove_negatives=True is passed here.
# NOTE(review): presumably this deals with negative embedding values before
# fitting; confirm the exact handling in machine_learning.train_naive_bayes.
nb_word2vec_imbalanced = ml.train_naive_bayes(
    word2vec_train_imbalanced['features'],
    word2vec_train_imbalanced['target'],
    remove_negatives=True,
    show_train_accuracy=1,
)

Train accuracy: 0.9096261407123933


## Results

### TF-IDF

In [37]:
# Imbalanced TF-IDF models to evaluate, with display names in the same order.
# NOTE: `models` and `names` are generic names reassigned by every results
# section; run this cell immediately before the evaluation cell below.
models = [
    lr_tfidf_imbalanced,
    dt_tfidf_imbalanced,
    rf_tfidf_imbalanced,
    gb_tfidf_imbalanced,
    nb_tfidf_imbalanced,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [38]:
# Score the current `models` on the imbalanced TF-IDF test split. The scaler
# returned by the logistic-regression trainer is passed as the last argument.
results_tfidf_imbalanced = ml.multi_model_results(
    models,
    names,
    tfidf_test_imbalanced['features'],
    tfidf_test_imbalanced['target'],
    lr_tfidf_imbalanced_scaler,
)

In [39]:
# Display the metrics computed on the imbalanced TF-IDF test split.
results_tfidf_imbalanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.983809,0.923323,0.903125,0.913112,0.0078,0.096875,0.947663
Decision Tree,0.964675,0.87594,0.728125,0.795222,0.010725,0.271875,0.8587
Random Forest,0.94554,1.0,0.421875,0.593407,0.0,0.578125,0.710938
Gradient Boosting Tree,0.976155,0.944238,0.79375,0.862479,0.004875,0.20625,0.894438
Naive Bayes,0.979688,0.966543,0.8125,0.882852,0.002925,0.1875,0.904788


### Word2Vec

In [40]:
# Imbalanced Word2Vec models to evaluate, with display names in the same order.
# NOTE: `models` and `names` are generic names reassigned by every results
# section; run this cell immediately before the evaluation cell below.
models = [
    lr_word2vec_imbalanced,
    dt_word2vec_imbalanced,
    rf_word2vec_imbalanced,
    gb_word2vec_imbalanced,
    nb_word2vec_imbalanced,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [41]:
# Score the current `models` on the imbalanced Word2Vec test split. The scaler
# returned by the logistic-regression trainer is passed as the last argument.
results_word2vec_imbalanced = ml.multi_model_results(
    models,
    names,
    word2vec_test_imbalanced['features'],
    word2vec_test_imbalanced['target'],
    lr_word2vec_imbalanced_scaler,
)

In [42]:
# Display the metrics computed on the imbalanced Word2Vec test split.
results_word2vec_imbalanced

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.986753,0.931034,0.928125,0.929577,0.00715,0.071875,0.960488
Decision Tree,0.977922,0.891374,0.871875,0.881517,0.01105,0.128125,0.930413
Random Forest,0.98116,0.960432,0.834375,0.892977,0.003575,0.165625,0.9154
Gradient Boosting Tree,0.986459,0.941935,0.9125,0.926984,0.00585,0.0875,0.953325
Naive Bayes,0.947012,0.652838,0.934375,0.768638,0.051674,0.065625,0.941351
