# Text Classification with TF-IDF: Training and Testing

In [1]:
import pandas as pd
# Read the cleaned PubMed article corpus (path is relative to this notebook)
data_df = pd.read_csv(
    '../Output/PubMed_CleanArticles_Top1-10_cancerTypes.csv',
    encoding='utf-8',
)

In [2]:
# Keep just the raw text, the cleaned text, and the numeric / named target labels
columns_to_keep = ['Article', 'Clean Article', 'Target Label', 'Target Name']
data_df = data_df[columns_to_keep]

# Preview the first ten rows
data_df.head(10)

Unnamed: 0,Article,Clean Article,Target Label,Target Name
0,Understanding the symptoms experienced by indi...,understand symptom experience individual lung ...,1,Lung
1,Do statins improve outcomes for patients with ...,statin improve outcome patient non small cell ...,1,Lung
2,"Lung cancer epidemiology, risk factors, and pr...",lung cancer epidemiology risk factor preventio...,1,Lung
3,[Modern Nanomedicine in Treatment of Lung Carc...,modern nanomedicine treatment lung carcinomas ...,1,Lung
4,[Nineteen multiple primary cancer cases of 100...,nineteen multiple primary cancer case patient ...,1,Lung
5,Image-guided radiotherapy and motion managemen...,image guide radiotherapy motion management lun...,1,Lung
6,[III. Immune Checkpoint Inhibitor as a Standar...,iii immune checkpoint inhibitor standard treat...,1,Lung
7,Radiotherapy for small-cell lung cancer-Where ...,radiotherapy small cell lung cancer head radio...,1,Lung
8,Coagulation-fibrinolytic analysis in patients ...,coagulation fibrinolytic analysis patient lung...,1,Lung
9,Revisiting the debate: the use of new agents i...,revisit debate use new agent previously untrea...,1,Lung


In [3]:
import numpy as np
from sklearn.model_selection import train_test_split

# Pull the cleaned text and both label encodings out of the frame as arrays
articles = np.array(data_df['Clean Article'])
label_nums = np.array(data_df['Target Label'])
label_names = np.array(data_df['Target Name'])

# Hold out 33% of the articles for testing; the fixed seed keeps the split repeatable
(train_corpus, test_corpus,
 train_label_nums, test_label_nums,
 train_label_names, test_label_names) = train_test_split(
    articles, label_nums, label_names, test_size=0.33, random_state=42)

# Show how many articles landed in each split
train_corpus.shape, test_corpus.shape

((6700,), (3300,))

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary and IDF weights from the training articles only,
# and convert those articles into a feature matrix in the same pass
tv = TfidfVectorizer(
    use_idf=True,
    min_df=0.0,
    max_df=1.0,
)
tv_train_features = tv.fit_transform(train_corpus)

In [5]:
# Encode the held-out articles with the vectorizer already fitted on the training set
# (transform only -- nothing is re-fitted on test data)
tv_test_features = tv.transform(raw_documents=test_corpus)

In [6]:
# Both matrices should report the same number of columns (the fitted vocabulary size)
print('TFIDF model:> Train features shape:', tv_train_features.shape,
      ' Test features shape:', tv_test_features.shape)

TFIDF model:> Train features shape: (6700, 20634)  Test features shape: (3300, 20634)


# Using Naive Bayes

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import time
from sklearn.feature_extraction.text import TfidfVectorizer

# Multinomial Naive Bayes on the TF-IDF features
mnb = MultinomialNB(alpha=1)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short durations; time.time() is wall-clock and can jump if the
# system clock is adjusted.
start = time.perf_counter()
mnb.fit(tv_train_features, train_label_names)
stop = time.perf_counter()
time_mnb = str(round(stop - start, 3)) + 's'
print('Training time: ', time_mnb)

# 5-fold cross-validation on the training split.
# NOTE(review): the TF-IDF vectorizer was fitted on the full training split, so
# each CV fold sees IDF weights influenced by its validation rows; wrapping the
# vectorizer and model in a Pipeline would avoid that -- confirm whether it matters here.
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)

# Accuracy of the model fitted above on the held-out test split
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

Training time:  0.036s
CV Accuracy (5-fold): [0.94253731 0.94850746 0.94328358 0.95       0.95373134]
Mean CV Accuracy: 0.9476119402985075
Test Accuracy: 0.943030303030303


# Using Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression on the TF-IDF features
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short durations; time.time() is wall-clock and can jump if the
# system clock is adjusted.
start = time.perf_counter()
lr.fit(tv_train_features, train_label_names)
stop = time.perf_counter()
time_lr = str(round(stop - start, 3)) + 's'
print('Training time: ', time_lr)

# 5-fold cross-validation on the training split
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)

# Accuracy of the model fitted above on the held-out test split
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)

Training time:  3.043s
CV Accuracy (5-fold): [0.98731343 0.98432836 0.9858209  0.98134328 0.9880597 ]
Mean CV Accuracy: 0.9853731343283583
Test Accuracy: 0.9860606060606061


# Using Linear SVM

In [10]:
from sklearn.svm import LinearSVC

# Linear support vector classifier with an L2 penalty on the TF-IDF features
svm = LinearSVC(penalty='l2', C=1, random_state=42)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short durations; time.time() is wall-clock and can jump if the
# system clock is adjusted.
start = time.perf_counter()
svm.fit(tv_train_features, train_label_names)
stop = time.perf_counter()
time_svm = str(round(stop - start, 3)) + 's'
print('Training time: ', time_svm)

# 5-fold cross-validation on the training split
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)

# Accuracy of the model fitted above on the held-out test split
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

Training time:  0.288s
CV Accuracy (5-fold): [0.98880597 0.98656716 0.98656716 0.98358209 0.98507463]
Mean CV Accuracy: 0.9861194029850745
Test Accuracy: 0.9866666666666667


# Using Linear SVM (SGD)

In [11]:
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss, L2 penalty) trained with stochastic gradient descent.
# NOTE(review): max_iter=5 caps training at five passes over the data; scikit-learn
# may warn about non-convergence at this setting -- confirm the cap is intentional.
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short durations; time.time() is wall-clock and can jump if the
# system clock is adjusted.
start = time.perf_counter()
svm_sgd.fit(tv_train_features, train_label_names)
stop = time.perf_counter()
time_sgd = str(round(stop - start, 3)) + 's'
print('Training time: ', time_sgd)

# 5-fold cross-validation on the training split
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)

# Accuracy of the model fitted above on the held-out test split
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)



Training time:  0.116s
CV Accuracy (5-fold): [0.98955224 0.98507463 0.98955224 0.98283582 0.9858209 ]
Mean CV Accuracy: 0.9865671641791044
Test Accuracy: 0.9875757575757576


# Using Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 10 trees on the TF-IDF features
rfc = RandomForestClassifier(n_estimators=10, random_state=42)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short durations; time.time() is wall-clock and can jump if the
# system clock is adjusted.
start = time.perf_counter()
rfc.fit(tv_train_features, train_label_names)
stop = time.perf_counter()
time_rfc = str(round(stop - start, 3)) + 's'
print('Training time: ', time_rfc)

# 5-fold cross-validation on the training split
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)

# Accuracy of the model fitted above on the held-out test split
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

Training time:  0.782s
CV Accuracy (5-fold): [0.93134328 0.92985075 0.92014925 0.9141791  0.94029851]
Mean CV Accuracy: 0.9271641791044776
Test Accuracy: 0.93


# Using Gradient Boosted Machines

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosted trees with 10 boosting stages on the TF-IDF features
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)

# Time only the fit. perf_counter() is a monotonic, high-resolution clock intended
# for measuring short durations; time.time() is wall-clock and can jump if the
# system clock is adjusted.
start = time.perf_counter()
gbc.fit(tv_train_features, train_label_names)
stop = time.perf_counter()
time_gbc = str(round(stop - start, 3)) + 's'
print('Training time: ', time_gbc)

# 5-fold cross-validation on the training split
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)

# Accuracy of the model fitted above on the held-out test split
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

Training time:  16.515s
CV Accuracy (5-fold): [0.97686567 0.97835821 0.96865672 0.97238806 0.97462687]
Mean CV Accuracy: 0.974179104477612
Test Accuracy: 0.9684848484848485


# TF-IDF Classification Model Comparison

In [15]:
# Side-by-side summary of every model: fit time, mean 5-fold CV accuracy on the
# training split, and accuracy on the held-out test split.
result = pd.DataFrame(
    [
        ['Naive Bayes',
         time_mnb, mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
        ['Logistic Regression',
         time_lr, lr_tfidf_cv_mean_score, lr_tfidf_test_score],
        ['Linear SVM',
         time_svm, svm_tfidf_cv_mean_score, svm_tfidf_test_score],
        ['Linear SVM (SGD)',
         time_sgd, svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
        ['Random Forest',
         time_rfc, rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
        ['Gradient Boosted Machines',
         time_gbc, gbc_tfidf_cv_mean_score, gbc_tfidf_test_score],
    ],
    # Column labels must be unique and describe their contents: the third column
    # is the mean cross-validation accuracy, the fourth is the test accuracy.
    columns=['Model', 'Train Time', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)'],
)
result

Unnamed: 0,Model,Train Time,Train Score (TF-IDF),Train Score (TF-IDF).1
0,Naive Bayes,0.036s,0.947612,0.94303
1,Logistic Regression,3.043s,0.985373,0.986061
2,Linear SVM,0.288s,0.986119,0.986667
3,Linear SVM (SGD),0.116s,0.986567,0.987576
4,Random Forest,0.782s,0.927164,0.93
5,Gradient Boosted Machines,16.515s,0.974179,0.968485


## Accuracy, Precision, Recall, and F1 Score

In [16]:
import model_evaluation_utils as meu
# Report test-set metrics via meu.get_metrics for every fitted model.
# model_var must list the estimators in the same order as the rows of `result`.
model_name = result['Model'].tolist()
model_var = [mnb, lr, svm, svm_sgd, rfc, gbc]

# The set of class names does not depend on the model, so build it once
# instead of on every loop iteration.
unique_classes = list(set(test_label_names))

for current_name, current_model in zip(model_name, model_var):
    # NOTE: despite its name, this holds the predictions of whichever model is
    # current; the name is kept so any later cell that reads it still works.
    mnb_predictions = current_model.predict(tv_test_features)
    print(current_name)
    meu.get_metrics(true_labels=test_label_names, predicted_labels=mnb_predictions)
    print()

Naive Bayes
Accuracy: 0.943
Precision: 0.9439
Recall: 0.943
F1 Score: 0.943

Logistic Regression
Accuracy: 0.9861
Precision: 0.9861
Recall: 0.9861
F1 Score: 0.9861

Linear SVM
Accuracy: 0.9867
Precision: 0.9867
Recall: 0.9867
F1 Score: 0.9867

Linear SVM (SGD)
Accuracy: 0.9876
Precision: 0.9876
Recall: 0.9876
F1 Score: 0.9876

Random Forest
Accuracy: 0.93
Precision: 0.931
Recall: 0.93
F1 Score: 0.9299

Gradient Boosted Machines
Accuracy: 0.9685
Precision: 0.9691
Recall: 0.9685
F1 Score: 0.9686

