In [1]:
import os
import pandas as pd
# Allow up to 250 columns to be shown when a DataFrame is rendered.
pd.set_option('display.max_columns', 250)

import machine_learning as ml

from ast import literal_eval

In [2]:
# Directory the CSV inputs are read from
csv_path = './data/csv/'

# CSV file names per feature set. Each list holds two files: index 0 is loaded in
# the "Balanced Dataset" section and index 1 in the "Imbalanced Dataset" section.
train = dict(
    tfidf_sel=['tfidf_chi2_train_1.csv', 'tfidf_chi2_train_2.csv'],
    word2vec=['word2vec_train_1.csv', 'word2vec_train_2.csv'],
)
test = dict(
    tfidf_sel=['tfidf_chi2_test_1.csv', 'tfidf_chi2_test_2.csv'],
    word2vec=['word2vec_test_1.csv', 'word2vec_test_2.csv'],
)

# Balanced Dataset

## Train TF-IDF

In [3]:
# Load the index-0 TF-IDF train/test CSVs; the first CSV column is used as the index.
tfidf_train_1 = pd.read_csv(
    os.path.join(csv_path, train['tfidf_sel'][0]),
    index_col=0,
)
tfidf_test_1 = pd.read_csv(
    os.path.join(csv_path, test['tfidf_sel'][0]),
    index_col=0,
)

In [4]:
# Replace each loaded table with the helper's split result, which later cells
# index by 'features' and 'target'.
# NOTE(review): the names are rebound to the helper's output, so re-running this
# cell on its own feeds that output back into the helper — re-run the load cell first.
tfidf_train_1, tfidf_test_1 = (
    ml.separate_features_target(tfidf_train_1),
    ml.separate_features_target(tfidf_test_1),
)

### Logistic Regression

In [5]:
# Fit logistic regression on the TF-IDF training split (balanced dataset section).
# The helper's result is indexed by 'model' and 'scaler'; it is kept under its own
# name so that `lr_tfidf` only ever refers to the fitted model.
# NOTE(review): the recorded output of this cell shows the solver stopping at its
# iteration limit; any fix (e.g. a higher max_iter) belongs in the helper, which is
# not visible from this notebook.
lr_tfidf_fit = ml.train_logistic_regression(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)
lr_tfidf = lr_tfidf_fit['model']
lr_tfidf_scaler = lr_tfidf_fit['scaler']

Train accuracy: 0.9930198383541513


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [6]:
# Decision tree on the TF-IDF training split; the recorded output below the cell
# is the reported train accuracy.
dt_tfidf = ml.train_decision_tree(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9357090374724467


### Random Forest

In [7]:
# Random forest on the TF-IDF training split.
rf_tfidf = ml.train_random_forest(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9474650991917708


### Gradient Boosting Tree

In [8]:
# Gradient boosting on the TF-IDF training split.
gb_tfidf = ml.train_gradient_boost(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9757531227038942


### Naive Bayes

In [9]:
# Naive Bayes on the TF-IDF training split (no remove_negatives flag is passed here,
# unlike the Word2Vec Naive Bayes cells).
nb_tfidf = ml.train_naive_bayes(
    tfidf_train_1['features'],
    tfidf_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9717119764878765


## Train Word2Vec

In [10]:
# Load the index-0 Word2Vec train/test CSVs; the first CSV column is used as the index.
word2vec_train_1 = pd.read_csv(
    os.path.join(csv_path, train['word2vec'][0]),
    index_col=0,
)
word2vec_test_1 = pd.read_csv(
    os.path.join(csv_path, test['word2vec'][0]),
    index_col=0,
)

In [11]:
# Replace each loaded table with the helper's split result, which later cells
# index by 'features' and 'target'.
# NOTE(review): the names are rebound to the helper's output, so re-running this
# cell on its own feeds that output back into the helper — re-run the load cell first.
word2vec_train_1, word2vec_test_1 = (
    ml.separate_features_target(word2vec_train_1),
    ml.separate_features_target(word2vec_test_1),
)

### Logistic Regression

In [12]:
# Fit logistic regression on the Word2Vec training split (balanced dataset section).
# The helper's result is indexed by 'model' and 'scaler'; it is kept under its own
# name so that `lr_word2vec` only ever refers to the fitted model.
# NOTE(review): the recorded output of this cell shows the solver stopping at its
# iteration limit; any fix (e.g. a higher max_iter) belongs in the helper, which is
# not visible from this notebook.
lr_word2vec_fit = ml.train_logistic_regression(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    show_train_accuracy=1,
)
lr_word2vec = lr_word2vec_fit['model']
lr_word2vec_scaler = lr_word2vec_fit['scaler']

Train accuracy: 0.967670830271859


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [13]:
# Decision tree on the Word2Vec training split.
dt_word2vec = ml.train_decision_tree(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9856722997795738


### Random Forest

In [14]:
# Random forest on the Word2Vec training split.
rf_word2vec = ml.train_random_forest(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9805290227773695


### Gradient Boosting Tree

In [15]:
# Gradient boosting on the Word2Vec training split.
gb_word2vec = ml.train_gradient_boost(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9955914768552535


### Naive Bayes

In [16]:
# Naive Bayes on the Word2Vec training split.
# remove_negatives=True is passed only for the Word2Vec features — presumably because
# embedding values can be negative; confirm against the helper in machine_learning.
nb_word2vec = ml.train_naive_bayes(
    word2vec_train_1['features'],
    word2vec_train_1['target'],
    remove_negatives=True,
    show_train_accuracy=1,
)

Train accuracy: 0.9434239529757531


## Results

### TF-IDF

In [17]:
# Fitted TF-IDF models and their display names, listed in the same order.
models = [
    lr_tfidf,
    dt_tfidf,
    rf_tfidf,
    gb_tfidf,
    nb_tfidf,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [18]:
# Evaluate every TF-IDF model on the TF-IDF test split.
# The logistic-regression scaler is passed as the last argument; how the helper
# applies it (to which models) is not visible here — confirm in machine_learning.
results_tfidf = ml.multi_model_results(
    models,
    names,
    tfidf_test_1['features'],
    tfidf_test_1['target'],
    lr_tfidf_scaler,
)

In [19]:
# Display the test-split metrics table for the TF-IDF models (balanced dataset section).
results_tfidf

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.957416,0.96131,0.952802,0.957037,0.01909,0.023495,0.957395
Decision Tree,0.917768,0.954984,0.876106,0.913846,0.020558,0.061674,0.917585
Random Forest,0.933921,0.962264,0.902655,0.931507,0.017621,0.048458,0.933784
Gradient Boosting Tree,0.939794,0.968553,0.908555,0.937595,0.014684,0.045521,0.939657
Naive Bayes,0.963289,0.959064,0.967552,0.963289,0.020558,0.016153,0.963308


### Word2Vec

In [20]:
# Fitted Word2Vec models and their display names, listed in the same order.
# NOTE(review): this rebinds `models`/`names` from the TF-IDF results cell.
models = [
    lr_word2vec,
    dt_word2vec,
    rf_word2vec,
    gb_word2vec,
    nb_word2vec,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [21]:
# Evaluate every Word2Vec model on the Word2Vec test split.
# The logistic-regression scaler is passed as the last argument; how the helper
# applies it (to which models) is not visible here — confirm in machine_learning.
results_word2vec = ml.multi_model_results(
    models,
    names,
    word2vec_test_1['features'],
    word2vec_test_1['target'],
    lr_word2vec_scaler,
)

In [22]:
# Display the test-split metrics table for the Word2Vec models (balanced dataset section).
results_word2vec

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.967695,0.956772,0.979351,0.96793,0.022026,0.010279,0.967746
Decision Tree,0.960352,0.945714,0.976401,0.960813,0.0279,0.011747,0.960423
Random Forest,0.963289,0.959064,0.967552,0.963289,0.020558,0.016153,0.963308
Gradient Boosting Tree,0.964758,0.95389,0.976401,0.965015,0.023495,0.011747,0.964809
Naive Bayes,0.619677,0.567114,0.99705,0.722995,0.378855,0.001468,0.621332


# Imbalanced Dataset

## Train TF-IDF

In [23]:
# Load the index-1 TF-IDF train/test CSVs; the first CSV column is used as the index.
tfidf_train_2 = pd.read_csv(
    os.path.join(csv_path, train['tfidf_sel'][1]),
    index_col=0,
)
tfidf_test_2 = pd.read_csv(
    os.path.join(csv_path, test['tfidf_sel'][1]),
    index_col=0,
)

In [24]:
# Replace each loaded table with the helper's split result, which later cells
# index by 'features' and 'target'.
# NOTE(review): the names are rebound to the helper's output, so re-running this
# cell on its own feeds that output back into the helper — re-run the load cell first.
tfidf_train_2, tfidf_test_2 = (
    ml.separate_features_target(tfidf_train_2),
    ml.separate_features_target(tfidf_test_2),
)

### Logistic Regression

In [25]:
# Fit logistic regression on the TF-IDF training split (imbalanced dataset section).
# The helper's result is indexed by 'model' and 'scaler'; it is kept under its own
# name so that `lr_tfidf_2` only ever refers to the fitted model.
# NOTE(review): the recorded output of this cell shows the solver stopping at its
# iteration limit; any fix (e.g. a higher max_iter) belongs in the helper, which is
# not visible from this notebook.
lr_tfidf_2_fit = ml.train_logistic_regression(
    tfidf_train_2['features'],
    tfidf_train_2['target'],
    show_train_accuracy=1,
)
lr_tfidf_2 = lr_tfidf_2_fit['model']
lr_tfidf_2_scaler = lr_tfidf_2_fit['scaler']

Train accuracy: 0.9893023255813953


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [26]:
# Decision tree on the TF-IDF training split (imbalanced dataset section).
dt_tfidf_2 = ml.train_decision_tree(
    tfidf_train_2['features'],
    tfidf_train_2['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9705647840531562


### Random Forest

In [27]:
# Random forest on the TF-IDF training split (imbalanced dataset section).
rf_tfidf_2 = ml.train_random_forest(
    tfidf_train_2['features'],
    tfidf_train_2['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9498338870431894


### Gradient Boosting Tree

In [28]:
# Gradient boosting on the TF-IDF training split (imbalanced dataset section).
gb_tfidf_2 = ml.train_gradient_boost(
    tfidf_train_2['features'],
    tfidf_train_2['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.986046511627907


### Naive Bayes

In [29]:
# Naive Bayes on the TF-IDF training split (imbalanced dataset section).
nb_tfidf_2 = ml.train_naive_bayes(
    tfidf_train_2['features'],
    tfidf_train_2['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.978936877076412


## Train Word2Vec

In [30]:
# Load the index-1 Word2Vec train/test CSVs; the first CSV column is used as the index.
word2vec_train_2 = pd.read_csv(
    os.path.join(csv_path, train['word2vec'][1]),
    index_col=0,
)
word2vec_test_2 = pd.read_csv(
    os.path.join(csv_path, test['word2vec'][1]),
    index_col=0,
)

In [31]:
# Replace each loaded table with the helper's split result, which later cells
# index by 'features' and 'target'.
# NOTE(review): the names are rebound to the helper's output, so re-running this
# cell on its own feeds that output back into the helper — re-run the load cell first.
word2vec_train_2, word2vec_test_2 = (
    ml.separate_features_target(word2vec_train_2),
    ml.separate_features_target(word2vec_test_2),
)

### Logistic Regression

In [32]:
# Fit logistic regression on the Word2Vec training split (imbalanced dataset section).
# The helper's result is indexed by 'model' and 'scaler'; it is kept under its own
# name so that `lr_word2vec_2` only ever refers to the fitted model.
# NOTE(review): the recorded output of this cell shows the solver stopping at its
# iteration limit; any fix (e.g. a higher max_iter) belongs in the helper, which is
# not visible from this notebook.
lr_word2vec_2_fit = ml.train_logistic_regression(
    word2vec_train_2['features'],
    word2vec_train_2['target'],
    show_train_accuracy=1,
)
lr_word2vec_2 = lr_word2vec_2_fit['model']
lr_word2vec_2_scaler = lr_word2vec_2_fit['scaler']

Train accuracy: 0.9833887043189369


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [33]:
# Decision tree on the Word2Vec training split (imbalanced dataset section).
dt_word2vec_2 = ml.train_decision_tree(
    word2vec_train_2['features'],
    word2vec_train_2['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9828571428571429


### Random Forest

In [34]:
# Random forest on the Word2Vec training split (imbalanced dataset section).
rf_word2vec_2 = ml.train_random_forest(
    word2vec_train_2['features'],
    word2vec_train_2['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.9823255813953489


### Gradient Boosting Tree

In [35]:
# Gradient boosting on the Word2Vec training split (imbalanced dataset section).
gb_word2vec_2 = ml.train_gradient_boost(
    word2vec_train_2['features'],
    word2vec_train_2['target'],
    show_train_accuracy=1,
)

Train accuracy: 0.993421926910299


### Naive Bayes

In [36]:
# Naive Bayes on the Word2Vec training split (imbalanced dataset section).
# remove_negatives=True is passed only for the Word2Vec features — presumably because
# embedding values can be negative; confirm against the helper in machine_learning.
nb_word2vec_2 = ml.train_naive_bayes(
    word2vec_train_2['features'],
    word2vec_train_2['target'],
    remove_negatives=True,
    show_train_accuracy=1,
)

Train accuracy: 0.9113621262458472


## Results

### TF-IDF

In [37]:
# Fitted TF-IDF models (imbalanced dataset section) and their display names,
# listed in the same order.
# NOTE(review): this rebinds `models`/`names` used by earlier results cells.
models = [
    lr_tfidf_2,
    dt_tfidf_2,
    rf_tfidf_2,
    gb_tfidf_2,
    nb_tfidf_2,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [38]:
# Evaluate every TF-IDF model from the imbalanced section on its TF-IDF test split.
# The logistic-regression scaler is passed as the last argument; how the helper
# applies it (to which models) is not visible here — confirm in machine_learning.
results_tfidf_2 = ml.multi_model_results(
    models,
    names,
    tfidf_test_2['features'],
    tfidf_test_2['target'],
    lr_tfidf_2_scaler,
)

In [39]:
# Display the test-split metrics table for the TF-IDF models (imbalanced dataset section).
results_tfidf_2

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.982992,0.921739,0.895775,0.908571,0.007175,0.009833,0.943926
Decision Tree,0.966782,0.873377,0.757746,0.811463,0.010364,0.022854,0.873151
Random Forest,0.945788,1.0,0.425352,0.596838,0.0,0.054212,0.712676
Gradient Boosting Tree,0.977943,0.962585,0.797183,0.872111,0.002923,0.019134,0.896978
Naive Bayes,0.977943,0.965753,0.794366,0.871716,0.002657,0.019399,0.895716


### Word2Vec

In [40]:
# Fitted Word2Vec models (imbalanced dataset section) and their display names,
# listed in the same order.
# NOTE(review): this rebinds `models`/`names` used by earlier results cells.
models = [
    lr_word2vec_2,
    dt_word2vec_2,
    rf_word2vec_2,
    gb_word2vec_2,
    nb_word2vec_2,
]
names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting Tree',
    'Naive Bayes',
]

In [41]:
# Evaluate every Word2Vec model from the imbalanced section on its Word2Vec test split.
# The logistic-regression scaler is passed as the last argument; how the helper
# applies it (to which models) is not visible here — confirm in machine_learning.
results_word2vec_2 = ml.multi_model_results(
    models,
    names,
    word2vec_test_2['features'],
    word2vec_test_2['target'],
    lr_word2vec_2_scaler,
)

In [42]:
# Display the test-split metrics table for the Word2Vec models (imbalanced dataset section).
results_word2vec_2

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,False Positive Rate,False Negative Rate,Area Under ROC Curve
Logistic Regression,0.980866,0.927492,0.864789,0.895044,0.006378,0.012756,0.928873
Decision Tree,0.973691,0.880952,0.833803,0.856729,0.01063,0.015679,0.911033
Random Forest,0.979006,0.963087,0.808451,0.87902,0.002923,0.018071,0.902612
Gradient Boosting Tree,0.985915,0.960366,0.887324,0.922401,0.003455,0.01063,0.941755
Naive Bayes,0.927452,0.569257,0.949296,0.711721,0.067765,0.004783,0.937236
