# Load PubMed

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt

In [3]:
# Excel workbook holding the PubMed records (PUMID, Title, Abstract, CancerType).
PubMed_Articles_Location = "../Src/pubmed-CancerType_Top1-10-set_Output.xlsx"

# Load the workbook into a DataFrame and preview the first rows.
PubMed_Articles = pd.read_excel(io=PubMed_Articles_Location)
PubMed_Articles.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,PUMID,Title,Abstract,CancerType
0,0,0,29790681,Multiple primary lung cancer: A literature rev...,"Nowadays, lung cancer is a leading cause of de...",Lung
1,1,1,27261907,Epidemiology of Lung Cancer.,Lung cancer has been transformed from a rare d...,Lung
2,2,2,29635240,Heterogeneity in Lung Cancer.,Lung cancer diagnosis is a challenge since it ...,Lung
3,3,3,30955514,Lung Cancer.,Lung cancer is the world's leading cause of ca...,Lung
4,4,4,26667338,Lung Cancer in Never Smokers.,Lung cancer is predominantly associated with c...,Lung


In [11]:
# Build a single text column: the title followed by the abstract.
# String concatenation with a missing value yields NaN, so a record that has
# only a title (or only an abstract) would otherwise lose all of its text.
# Missing parts are therefore treated as empty strings, and the separator
# left dangling by an empty part is stripped off again.
title_text = PubMed_Articles['Title'].fillna('')
abstract_text = PubMed_Articles['Abstract'].fillna('')
PubMed_Articles['Content'] = (title_text + ' ' + abstract_text).str.strip()
PubMed_Articles

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,PUMID,Title,Abstract,CancerType,Content
0,0,0,29790681,Multiple primary lung cancer: A literature rev...,"Nowadays, lung cancer is a leading cause of de...",Lung,Multiple primary lung cancer: A literature rev...
1,1,1,27261907,Epidemiology of Lung Cancer.,Lung cancer has been transformed from a rare d...,Lung,Epidemiology of Lung Cancer. Lung cancer has b...
2,2,2,29635240,Heterogeneity in Lung Cancer.,Lung cancer diagnosis is a challenge since it ...,Lung,Heterogeneity in Lung Cancer. Lung cancer diag...
3,3,3,30955514,Lung Cancer.,Lung cancer is the world's leading cause of ca...,Lung,Lung Cancer. Lung cancer is the world's leadin...
4,4,4,26667338,Lung Cancer in Never Smokers.,Lung cancer is predominantly associated with c...,Lung,Lung Cancer in Never Smokers. Lung cancer is p...
...,...,...,...,...,...,...,...
99995,9995,9995,1942705,[A clinico-pathological study of 69 cases afte...,A clinico-pathological study was conducted on ...,Bladder,[A clinico-pathological study of 69 cases afte...
99996,9996,9996,16325414,Continent urinary diversion.,During the last decade continent urinary diver...,Bladder,Continent urinary diversion. During the last d...
99997,9997,9997,28039452,Preoperative neutrophil-lymphocyte ratio can s...,The prognostic role of systemic inflammatory r...,Bladder,Preoperative neutrophil-lymphocyte ratio can s...
99998,9998,9998,13687923,[Indication and results of partial bladder res...,The prognostic role of systemic inflammatory r...,Bladder,[Indication and results of partial bladder res...


In [12]:
# Keep only these columns, in this order; any column not listed here
# (e.g. the "Unnamed: *" columns visible in the loaded sheet) is dropped.
ordered_columns = ['PUMID', 'Title', 'Abstract', 'Content', 'CancerType']
PubMed_Articles = PubMed_Articles.reindex(columns=ordered_columns)
PubMed_Articles

Unnamed: 0,PUMID,Title,Abstract,Content,CancerType
0,29790681,Multiple primary lung cancer: A literature rev...,"Nowadays, lung cancer is a leading cause of de...",Multiple primary lung cancer: A literature rev...,Lung
1,27261907,Epidemiology of Lung Cancer.,Lung cancer has been transformed from a rare d...,Epidemiology of Lung Cancer. Lung cancer has b...,Lung
2,29635240,Heterogeneity in Lung Cancer.,Lung cancer diagnosis is a challenge since it ...,Heterogeneity in Lung Cancer. Lung cancer diag...,Lung
3,30955514,Lung Cancer.,Lung cancer is the world's leading cause of ca...,Lung Cancer. Lung cancer is the world's leadin...,Lung
4,26667338,Lung Cancer in Never Smokers.,Lung cancer is predominantly associated with c...,Lung Cancer in Never Smokers. Lung cancer is p...,Lung
...,...,...,...,...,...
99995,1942705,[A clinico-pathological study of 69 cases afte...,A clinico-pathological study was conducted on ...,[A clinico-pathological study of 69 cases afte...,Bladder
99996,16325414,Continent urinary diversion.,During the last decade continent urinary diver...,Continent urinary diversion. During the last d...,Bladder
99997,28039452,Preoperative neutrophil-lymphocyte ratio can s...,The prognostic role of systemic inflammatory r...,Preoperative neutrophil-lymphocyte ratio can s...,Bladder
99998,13687923,[Indication and results of partial bladder res...,The prognostic role of systemic inflammatory r...,[Indication and results of partial bladder res...,Bladder


In [13]:
# Map each cancer-type name to an integer class id.
# Ids are assigned in the order the names are listed, starting at 1.
ClassIDMap = {
    cancer_type: class_id
    for class_id, cancer_type in enumerate(
        (
            'Lung',
            'Breast',
            'Colorectal',
            'Prostate',
            'Stomach',
            'Liver',
            'Oesophagus',
            'Cervix Uteri',
            'Thyroid',
            'Bladder',
        ),
        start=1,
    )
}
ClassIDMap

{'Lung': 1,
 'Breast': 2,
 'Colorectal': 3,
 'Prostate': 4,
 'Stomach': 5,
 'Liver': 6,
 'Oesophagus': 7,
 'Cervix Uteri': 8,
 'Thyroid': 9,
 'Bladder': 10}

In [14]:
# Article text and class names come straight from PubMed_Articles; the numeric
# label of each row is looked up in ClassIDMap (an unknown name raises KeyError).
corpus = PubMed_Articles['Content']
target_names = PubMed_Articles['CancerType']
target_labels = [ClassIDMap[name] for name in target_names]

# Assemble the modelling frame: one row per article with its label id and name.
data_df = pd.DataFrame(
    {
        'Article': corpus,
        'Target Label': target_labels,
        'Target Name': target_names,
    }
)
print(data_df.shape)
data_df.head(10)

(100000, 3)


Unnamed: 0,Article,Target Label,Target Name
0,Multiple primary lung cancer: A literature rev...,1,Lung
1,Epidemiology of Lung Cancer. Lung cancer has b...,1,Lung
2,Heterogeneity in Lung Cancer. Lung cancer diag...,1,Lung
3,Lung Cancer. Lung cancer is the world's leadin...,1,Lung
4,Lung Cancer in Never Smokers. Lung cancer is p...,1,Lung
5,Lung cancer in persons with HIV. PURPOSE OF RE...,1,Lung
6,Epidemiology of Lung Cancer. Lung cancer conti...,1,Lung
7,"Lung cancer screening: advantages, controversi...",1,Lung
8,Lung cancer: diagnosis and management. Lung ca...,1,Lung
9,Diagnosis and Molecular Classification of Lung...,1,Lung


# Text Preprocessing

In [16]:
import text_normalizer as tn
import nltk

# Start from NLTK's English stopword list, but take the negations out of it
# so that they are kept in the text (useful for bi-grams).
stopword_list = nltk.corpus.stopwords.words('english')
for negation in ('no', 'not'):
    stopword_list.remove(negation)
# (Disabled) a domain word such as 'cancer' could be added to stopword_list here.

# Run the project's normalizer over every article and store the result
# alongside the raw text.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# run was stopped before completing — presumably it is slow on the full corpus.
norm_corpus = tn.normalize_corpus(
    corpus=data_df['Article'],
    html_stripping=True,
    contraction_expansion=True,
    accented_char_removal=True,
    text_lower_case=True,
    text_lemmatization=True,
    text_stemming=False,
    special_char_removal=True,
    remove_digits=True,
    stopword_removal=True,
    stopwords=stopword_list,
)
data_df['Clean Article'] = norm_corpus

KeyboardInterrupt: 

In [None]:
# Reorder the columns so the cleaned text sits right after the raw text,
# then preview the first ten rows.
column_order = ['Article', 'Clean Article', 'Target Label', 'Target Name']
data_df = data_df.loc[:, column_order]
data_df.head(10)

In [None]:
# Turn empty / whitespace-only cells into NaN, drop every row that has a
# missing value in any column, and renumber the index from 0.
data_df = (
    data_df
    .replace(r'^(\s?)+$', np.nan, regex=True)
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Write the cleaned dataset to disk as UTF-8 CSV, without the index column.
data_df.to_csv(
    path_or_buf='../Output/PubMed_CleanArticles_Top1-10_cancerTypes.csv',
    index=False,
    encoding='utf-8',
)