# Text Classification TF Training Testing

In [1]:
import pandas as pd
data_df = pd.read_csv('../Output/PubMed_CleanArticles_Top1-10_cancerTypes.csv', encoding='utf-8')

In [2]:
data_df = data_df[['Article', 'Clean Article', 'Target Label', 'Target Name']]
data_df.head(10)

Unnamed: 0,Article,Clean Article,Target Label,Target Name
0,Understanding the symptoms experienced by indi...,understand symptom experience individual lung ...,1,Lung
1,Do statins improve outcomes for patients with ...,statin improve outcome patient non small cell ...,1,Lung
2,"Lung cancer epidemiology, risk factors, and pr...",lung cancer epidemiology risk factor preventio...,1,Lung
3,[Modern Nanomedicine in Treatment of Lung Carc...,modern nanomedicine treatment lung carcinomas ...,1,Lung
4,[Nineteen multiple primary cancer cases of 100...,nineteen multiple primary cancer case patient ...,1,Lung
5,Image-guided radiotherapy and motion managemen...,image guide radiotherapy motion management lun...,1,Lung
6,[III. Immune Checkpoint Inhibitor as a Standar...,iii immune checkpoint inhibitor standard treat...,1,Lung
7,Radiotherapy for small-cell lung cancer-Where ...,radiotherapy small cell lung cancer head radio...,1,Lung
8,Coagulation-fibrinolytic analysis in patients ...,coagulation fibrinolytic analysis patient lung...,1,Lung
9,Revisiting the debate: the use of new agents i...,revisit debate use new agent previously untrea...,1,Lung


In [3]:
from sklearn.model_selection import train_test_split
import numpy as np

train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names =\
                                 train_test_split(np.array(data_df['Clean Article']), np.array(data_df['Target Label']),
                                                       np.array(data_df['Target Name']), test_size=0.33, random_state=42)

train_corpus.shape, test_corpus.shape

((6700,), (3300,))

In [4]:
train_label_nums, test_label_nums

(array([ 9,  6, 10, ...,  6,  1,  8], dtype=int64),
 array([ 7,  5,  2, ...,  7,  9, 10], dtype=int64))

In [5]:
train_label_names, test_label_names

(array(['Thyroid', 'Liver', 'Bladder', ..., 'Liver', 'Lung',
        'Cervix Uteri'], dtype=object),
 array(['Oesophagus', 'Stomach', 'Breast', ..., 'Oesophagus', 'Thyroid',
        'Bladder'], dtype=object))

In [6]:
from collections import Counter

trd = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))

(pd.DataFrame([[key, trd[key], tsd[key]] for key in trd], 
             columns=['Target Label', 'Train Count', 'Test Count'])
.sort_values(by=['Train Count', 'Test Count'],
             ascending=False))

Unnamed: 0,Target Label,Train Count,Test Count
2,Bladder,693,307
5,Colorectal,685,315
4,Oesophagus,681,319
8,Cervix Uteri,680,320
1,Liver,677,323
3,Prostate,668,332
9,Breast,665,335
7,Lung,655,345
6,Stomach,649,351
0,Thyroid,647,353


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

# build BOW features on train articles
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)

In [8]:
print(cv_train_features.shape)

(6700, 20634)


In [9]:
print(cv_train_features)

  (0, 15060)	1
  (0, 16872)	1
  (0, 12872)	4
  (0, 1366)	8
  (0, 1125)	3
  (0, 1727)	3
  (0, 2491)	3
  (0, 8641)	2
  (0, 1393)	1
  (0, 6720)	1
  (0, 1571)	1
  (0, 4718)	1
  (0, 15608)	1
  (0, 18694)	4
  (0, 10824)	3
  (0, 12926)	1
  (0, 19556)	2
  (0, 17104)	2
  (0, 6779)	3
  (0, 10256)	1
  (0, 8945)	2
  (0, 17416)	1
  (0, 2618)	1
  (0, 17446)	1
  (0, 777)	1
  :	:
  (6699, 3802)	3
  (6699, 9535)	1
  (6699, 2683)	2
  (6699, 7227)	1
  (6699, 14967)	1
  (6699, 14687)	1
  (6699, 4593)	2
  (6699, 15865)	1
  (6699, 18919)	1
  (6699, 17970)	1
  (6699, 16699)	2
  (6699, 13210)	1
  (6699, 15930)	1
  (6699, 17606)	2
  (6699, 19884)	1
  (6699, 19791)	1
  (6699, 16951)	2
  (6699, 4937)	1
  (6699, 8451)	3
  (6699, 12462)	1
  (6699, 15942)	1
  (6699, 3586)	1
  (6699, 2536)	1
  (6699, 476)	3
  (6699, 15160)	3


In [10]:
# transform test articles into features
cv_test_features = cv.transform(test_corpus)
print(cv_test_features.shape)
print(cv_train_features)

(3300, 20634)
  (0, 15060)	1
  (0, 16872)	1
  (0, 12872)	4
  (0, 1366)	8
  (0, 1125)	3
  (0, 1727)	3
  (0, 2491)	3
  (0, 8641)	2
  (0, 1393)	1
  (0, 6720)	1
  (0, 1571)	1
  (0, 4718)	1
  (0, 15608)	1
  (0, 18694)	4
  (0, 10824)	3
  (0, 12926)	1
  (0, 19556)	2
  (0, 17104)	2
  (0, 6779)	3
  (0, 10256)	1
  (0, 8945)	2
  (0, 17416)	1
  (0, 2618)	1
  (0, 17446)	1
  (0, 777)	1
  :	:
  (6699, 3802)	3
  (6699, 9535)	1
  (6699, 2683)	2
  (6699, 7227)	1
  (6699, 14967)	1
  (6699, 14687)	1
  (6699, 4593)	2
  (6699, 15865)	1
  (6699, 18919)	1
  (6699, 17970)	1
  (6699, 16699)	2
  (6699, 13210)	1
  (6699, 15930)	1
  (6699, 17606)	2
  (6699, 19884)	1
  (6699, 19791)	1
  (6699, 16951)	2
  (6699, 4937)	1
  (6699, 8451)	3
  (6699, 12462)	1
  (6699, 15942)	1
  (6699, 3586)	1
  (6699, 2536)	1
  (6699, 476)	3
  (6699, 15160)	3


In [11]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)

BOW model:> Train features shape: (6700, 20634)  Test features shape: (3300, 20634)


# Using Naive Bayes

In [14]:
from sklearn.naive_bayes import MultinomialNB
import time
mnb = MultinomialNB(alpha=1)
start = time.time()
mnb.fit(cv_train_features, train_label_names)
stop = time.time()
time_mnb = str(round(stop-start,3)) + 's'
print('Training time: ', time_mnb)

mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)

mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score)

Training time:  0.041s
CV Accuracy (5-fold): [0.93432836 0.94179104 0.93731343 0.94328358 0.94253731]
Mean CV Accuracy: 0.9398507462686567
Test Accuracy: 0.9387878787878788


# Using Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
start = time.time()
lr.fit(cv_train_features, train_label_names)
stop = time.time()
time_lr = str(round(stop-start,3)) + 's'
print('Training time: ', time_lr)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training time:  4.737s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


CV Accuracy (5-fold): [0.98731343 0.98432836 0.98208955 0.97761194 0.98432836]
Mean CV Accuracy: 0.983134328358209
Test Accuracy: 0.9854545454545455


# Using Linear SVM

In [16]:
from sklearn.svm import LinearSVC

svm = LinearSVC(penalty='l2', C=1, random_state=42)
start = time.time()
svm.fit(cv_train_features, train_label_names)
stop = time.time()
time_svm = str(round(stop-start,3)) + 's'
print('Training time: ', time_svm)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

Training time:  0.33s
CV Accuracy (5-fold): [0.98731343 0.98358209 0.97835821 0.9761194  0.98059701]
Mean CV Accuracy: 0.9811940298507462
Test Accuracy: 0.9833333333333333


# Using Linear SVM (SGD)

In [17]:
from sklearn.linear_model import SGDClassifier

svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)
start = time.time()
svm_sgd.fit(cv_train_features, train_label_names)
stop = time.time()
time_sgd = str(round(stop-start,3)) + 's'
print('Training time: ', time_sgd)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)



Training time:  0.155s
CV Accuracy (5-fold): [0.98432836 0.97835821 0.97761194 0.97686567 0.98283582]
Mean CV Accuracy: 0.9800000000000001
Test Accuracy: 0.9821212121212122


# Using Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10, random_state=42)
start = time.time()
rfc.fit(cv_train_features, train_label_names)
stop = time.time()
time_rfc = str(round(stop-start,3)) + 's'
print('Training time: ', time_rfc)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)

Training time:  0.841s
CV Accuracy (5-fold): [0.93955224 0.94402985 0.9238806  0.93731343 0.90970149]
Mean CV Accuracy: 0.9308955223880597
Test Accuracy: 0.926060606060606


# Using Gradient Boosted Machine

In [20]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
start = time.time()
gbc.fit(cv_train_features, train_label_names)
stop = time.time()
time_gbc = str(round(stop-start,3)) + 's'
print('Training time: ', time_gbc)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

Training time:  8.96s
CV Accuracy (5-fold): [0.97910448 0.98358209 0.97537313 0.97910448 0.98134328]
Mean CV Accuracy: 0.9797014925373133
Test Accuracy: 0.9766666666666667


# TF Classification Model Comparison

In [24]:
result = pd.DataFrame([['Naive Bayes', time_mnb, mnb_bow_cv_mean_score, mnb_bow_test_score],
              ['Logistic Regression', time_lr,lr_bow_cv_mean_score, lr_bow_test_score],
              ['Linear SVM', time_svm,svm_bow_cv_mean_score, svm_bow_test_score],
              ['Linear SVM (SGD)', time_sgd,svmsgd_bow_cv_mean_score, svmsgd_bow_test_score],
              ['Random Forest', time_rfc,rfc_bow_cv_mean_score, rfc_bow_test_score],
              ['Gradient Boosted Machines',time_gbc, gbc_bow_cv_mean_score, gbc_bow_test_score]],
             columns=['Model', 'Train Time', 'Train Score (TF)', 'Test Score (TF)'],
             )
result

Unnamed: 0,Model,Train Time,Train Score (TF),Test Score (TF)
0,Naive Bayes,0.041s,0.939851,0.938788
1,Logistic Regression,4.737s,0.983134,0.985455
2,Linear SVM,0.33s,0.981194,0.983333
3,Linear SVM (SGD),0.155s,0.98,0.982121
4,Random Forest,0.841s,0.930896,0.926061
5,Gradient Boosted Machines,8.96s,0.979701,0.976667


In [25]:
result.T

Unnamed: 0,0,1,2,3,4,5
Model,Naive Bayes,Logistic Regression,Linear SVM,Linear SVM (SGD),Random Forest,Gradient Boosted Machines
Train Time,0.041s,4.737s,0.33s,0.155s,0.841s,8.96s
Train Score (TF),0.939851,0.983134,0.981194,0.98,0.930896,0.979701
Test Score (TF),0.938788,0.985455,0.983333,0.982121,0.926061,0.976667


## Accuracy, Precision, Recall, and F1 Score

In [27]:
import model_evaluation_utils as meu
model_name = result['Model'].tolist()
model_var = [mnb, lr, svm, svm_sgd, rfc, gbc]
for i in range(len(model_name)):
    mnb_predictions = model_var[i].predict(cv_test_features)
    unique_classes = list(set(test_label_names))
    print(model_name[i])
    meu.get_metrics(true_labels=test_label_names, predicted_labels=mnb_predictions)
    print()

Naive Bayes
Accuracy: 0.9388
Precision: 0.9394
Recall: 0.9388
F1 Score: 0.9388

Logistic Regression
Accuracy: 0.9855
Precision: 0.9855
Recall: 0.9855
F1 Score: 0.9854

Linear SVM
Accuracy: 0.9833
Precision: 0.9834
Recall: 0.9833
F1 Score: 0.9833

Linear SVM (SGD)
Accuracy: 0.9821
Precision: 0.9824
Recall: 0.9821
F1 Score: 0.9822

Random Forest
Accuracy: 0.9261
Precision: 0.9271
Recall: 0.9261
F1 Score: 0.9262

Gradient Boosted Machines
Accuracy: 0.9767
Precision: 0.9772
Recall: 0.9767
F1 Score: 0.9768

