# Text Classification TF-IDF Training Testing

In [1]:
import pandas as pd
data_df = pd.read_csv('../Output/PubMed_CleanArticles_Top1-10_cancerTypes.csv', encoding='utf-8')

In [3]:
data_df = data_df[['Article', 'Clean Article', 'Target Label', 'Target Name']]
data_df.head(10)

Unnamed: 0,Article,Clean Article,Target Label,Target Name
0,Understanding the symptoms experienced by indi...,understand symptom experience individual lung ...,1,Lung
1,Do statins improve outcomes for patients with ...,statin improve outcome patient non small cell ...,1,Lung
2,"Lung cancer epidemiology, risk factors, and pr...",lung cancer epidemiology risk factor preventio...,1,Lung
3,[Modern Nanomedicine in Treatment of Lung Carc...,modern nanomedicine treatment lung carcinomas ...,1,Lung
4,[Nineteen multiple primary cancer cases of 100...,nineteen multiple primary cancer case patient ...,1,Lung
5,Image-guided radiotherapy and motion managemen...,image guide radiotherapy motion management lun...,1,Lung
6,[III. Immune Checkpoint Inhibitor as a Standar...,iii immune checkpoint inhibitor standard treat...,1,Lung
7,Radiotherapy for small-cell lung cancer-Where ...,radiotherapy small cell lung cancer head radio...,1,Lung
8,Coagulation-fibrinolytic analysis in patients ...,coagulation fibrinolytic analysis patient lung...,1,Lung
9,Revisiting the debate: the use of new agents i...,revisit debate use new agent previously untrea...,1,Lung


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split

train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names =\
                                 train_test_split(np.array(data_df['Clean Article']), np.array(data_df['Target Label']),
                                                       np.array(data_df['Target Name']), test_size=0.33, random_state=42)

train_corpus.shape, test_corpus.shape

((6700,), (3300,))

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# build BOW features on train articles
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

In [6]:
# transform test articles into features
tv_test_features = tv.transform(test_corpus)

In [7]:
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

TFIDF model:> Train features shape: (6700, 20634)  Test features shape: (3300, 20634)


# Using Naive Bayes

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

mnb = MultinomialNB(alpha=1)
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

CV Accuracy (5-fold): [0.94192107 0.9485842  0.94328358 0.95067265 0.95287958]
Mean CV Accuracy: 0.9474682167780435
Test Accuracy: 0.943030303030303


# Using Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
lr.fit(tv_train_features, train_label_names)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)



CV Accuracy (5-fold): [0.98734177 0.98435171 0.98507463 0.98355755 0.98803291]
Mean CV Accuracy: 0.9856717141912659
Test Accuracy: 0.9848484848484849


# Using Linear SVM

In [10]:
from sklearn.svm import LinearSVC

svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(tv_train_features, train_label_names)
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

CV Accuracy (5-fold): [0.98883098 0.98658718 0.98656716 0.98355755 0.98504114]
Mean CV Accuracy: 0.9861168016738626
Test Accuracy: 0.9866666666666667


# Using Linear SVM (SGD)

In [11]:
from sklearn.linear_model import SGDClassifier

svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)



CV Accuracy (5-fold): [0.98808637 0.98733234 0.99029851 0.98430493 0.98578908]
Mean CV Accuracy: 0.9871622467618817
Test Accuracy: 0.9875757575757576


# Using Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

CV Accuracy (5-fold): [0.92926284 0.93964232 0.93507463 0.93721973 0.92296185]
Mean CV Accuracy: 0.9328322763945776
Test Accuracy: 0.93


# Using Gradient Boosted Machines

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

CV Accuracy (5-fold): [0.97766195 0.97988077 0.96716418 0.9715994  0.97382199]
Mean CV Accuracy: 0.9740256593089966
Test Accuracy: 0.9684848484848485


# TF-IDF Classification Model Comparison

In [15]:
pd.DataFrame([['Naive Bayes', 
               mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
              ['Logistic Regression', 
               lr_tfidf_cv_mean_score, lr_tfidf_test_score],
              ['Linear SVM', 
               svm_tfidf_cv_mean_score, svm_tfidf_test_score],
              ['Linear SVM (SGD)',  
               svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
              ['Random Forest', 
               rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
              ['Gradient Boosted Machines', 
               gbc_tfidf_cv_mean_score, gbc_tfidf_test_score]],
             columns=['Model',  'CV Score (TF-IDF)', 'Test Score (TF-IDF)'],
             )

Unnamed: 0,Model,CV Score (TF-IDF),Test Score (TF-IDF)
0,Naive Bayes,0.947468,0.94303
1,Logistic Regression,0.985672,0.984848
2,Linear SVM,0.986117,0.986667
3,Linear SVM (SGD),0.987162,0.987576
4,Random Forest,0.932832,0.93
5,Gradient Boosted Machines,0.974026,0.968485
