# Load PubMed

In [1]:
import pandas as pd
PubMed = pd.read_excel('../Src/(Cleaned)pubmed-CancerType_Top1-10-set_10000-data.xlsx')
PubMed

Unnamed: 0,PUMID,Title,Abstract,CancerType,Description,Clean_Description
0,17078348,Understanding the symptoms experienced by indi...,The purpose of this study was to gain a better...,Lung,Understanding the symptoms experienced by indi...,understand symptom experience individual lung ...
1,30206083,Do statins improve outcomes for patients with ...,INTRODUCTION: Lung cancer is the most common n...,Lung,Do statins improve outcomes for patients with ...,statin improve outcome patient non small cell ...
2,22974775,"Lung cancer epidemiology, risk factors, and pr...",The greatest risk by far for developing lung c...,Lung,"Lung cancer epidemiology, risk factors, and pr...",lung cancer epidemiology risk factor preventio...
3,26299737,[Modern Nanomedicine in Treatment of Lung Carc...,BACKGROUNDS: Despite the fast development of n...,Lung,[Modern Nanomedicine in Treatment of Lung Carc...,modern nanomedicine treatment lung carcinoma b...
4,8815254,[Nineteen multiple primary cancer cases of 100...,"In our department, half of 100 consecutive lun...",Lung,[Nineteen multiple primary cancer cases of 100...,nineteen multiple primary cancer case patient ...
...,...,...,...,...,...,...
9995,24122724,High morbidity and mortality found for high-ri...,OBJECTIVES: To give an updated review concerni...,Bladder,High morbidity and mortality found for high-ri...,high morbidity mortality find high risk non mu...
9996,10447660,Case-referent study on occupational risk facto...,OBJECTIVE: To evaluate the possible associatio...,Bladder,Case-referent study on occupational risk facto...,case referent study occupational risk factor b...
9997,3582456,Intravesical irrigation with distilled water d...,"In a retrospective study, the influence of dis...",Bladder,Intravesical irrigation with distilled water d...,intravesical irrigation distill water immediat...
9998,21897260,Ileal neobladder in women with bladder cancer:...,PURPOSE OF REVIEW: Radical cystectomy and urin...,Bladder,Ileal neobladder in women with bladder cancer:...,ileal neobladder woman bladder cancer cancer c...


# Extract TF-IDF Features

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(use_idf=True, ngram_range=(1, 2), min_df=10, max_df=0.8)
tv_matrix = tv.fit_transform(PubMed['Clean_Description'])
tv_matrix.shape

(10000, 15814)

# Cluster Articles Using Affinity Propagation

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim_features = cosine_similarity(tv_matrix)

In [4]:
from sklearn.cluster import AffinityPropagation
from collections import Counter

ap = AffinityPropagation(max_iter=10)
ap.fit(cosine_sim_features)
res = Counter(ap.labels_)
res.most_common(10)

[(247, 87),
 (8, 83),
 (193, 83),
 (60, 80),
 (306, 75),
 (137, 75),
 (157, 67),
 (196, 66),
 (53, 65),
 (237, 64)]

In [5]:
PubMed['affprop_cluster'] = ap.labels_
filtered_clusters = [item[0] for item in res.most_common(10)]
filtered_pubmed = PubMed[PubMed['affprop_cluster'].isin(filtered_clusters)]
article_clusters = (filtered_pubmed[['Title', 'PUMID', 'affprop_cluster']]
                  .sort_values(by=['affprop_cluster', 'PUMID'], 
                               ascending=False))
article_clusters

Unnamed: 0,Title,PUMID,affprop_cluster
7252,Using amide proton transfer to identify cervic...,31071471,306
8754,Alterations in the gut microbiota and metaboli...,30565661,306
9878,Haematuria in ADPKD: not always benign. Be aware!,28993351,306
6419,A comparative analysis of whole genome sequenc...,28465312,306
7306,Predicting tumor recurrence in patients with c...,27445314,306
...,...,...,...
375,[Correlation study of selenium levels in the h...,3595331,8
404,"Treatment of stage I lung cancer (T1N0M0, T2N0...",2820071,8
124,[Lung cancer in Internal Medicine].,1623098,8
237,[Lung cancer: comparative study of a public an...,1062208,8


In [6]:
article_clusters.to_excel('../Output/Affinity_Propagation_Cluster_Output.xlsx', index=False) 

In [7]:
article_clusters = (filtered_pubmed[['Title', 'PUMID', 'affprop_cluster']]
                  .sort_values(by=['affprop_cluster', 'PUMID'], 
                               ascending=False)
                  .groupby('affprop_cluster').head(20))

In [8]:
# get key features for each cluster
# get articles belonging to each cluster
for cluster_num in range(len(filtered_clusters)):
    articles = article_clusters[article_clusters['affprop_cluster'] == filtered_clusters[cluster_num]]['Title'].values.tolist()
    print('CLUSTER #'+str(filtered_clusters[cluster_num]))
    print()
    print('PubMed Titles:', articles)
    print('-'*80)

CLUSTER #247

PubMed Titles: ['Oncological Safety of Ultrasonically Activated Surgical Devices During Gastric Cancer Surgery.', 'Transplacental arsenic exposure produced 5-methylcytosine methylation changes and aberrant microRNA expressions in livers of male fetal mice.', 'A novel knowledge-derived data potentizing method revealed unique liver cancer-associated genetic variants.', 'IL‑17A promotes CXCR2‑dependent angiogenesis in a mouse model of liver cancer.', 'Systemic doxorubicin and hepatocellular carcinoma: the end of an era never risen up.', 'Efficacy of Surface-Modified PLGA Nanoparticles as a Function of Cervical Cancer Type.', 'Clinical significance and biological role of cancer-derived Type I collagen in lung and esophageal cancers.', 'Molecular pathways in the development and treatment of oesophageal cancer.', 'Biomarker-Driven and Molecular Targeted Therapies for Hepatobiliary Cancers.', 'SNHG6 Acts as a Genome-Wide Hypomethylation Trigger via Coupling of miR-1297-Mediated 

## Select one article (using "PUMID"), from each cluster and extract top 5 similiarest articles (instances)

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

doc_sim = cosine_similarity(tv_matrix)
doc_sim_df = pd.DataFrame(doc_sim)
doc_sim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,1.000000,0.061515,0.088812,0.074249,0.097627,0.030607,0.113703,0.062316,0.054583,0.055684,...,0.007435,0.016421,0.029709,0.041221,0.012108,0.012048,0.041185,0.027924,0.011591,0.020964
1,0.061515,1.000000,0.074174,0.088862,0.068625,0.037924,0.171106,0.115116,0.069546,0.135795,...,0.017958,0.044825,0.028957,0.027395,0.018010,0.155452,0.023099,0.028555,0.025803,0.028207
2,0.088812,0.074174,1.000000,0.137289,0.110647,0.058015,0.248487,0.111255,0.080277,0.081217,...,0.009129,0.036299,0.009844,0.015768,0.020715,0.012955,0.225336,0.009698,0.005016,0.007722
3,0.074249,0.088862,0.137289,1.000000,0.109521,0.071664,0.247494,0.093393,0.080238,0.056884,...,0.049775,0.023406,0.009157,0.027696,0.034338,0.012724,0.011421,0.016919,0.010655,0.011032
4,0.097627,0.068625,0.110647,0.109521,1.000000,0.044762,0.157064,0.069573,0.061845,0.044650,...,0.004479,0.018916,0.030967,0.008876,0.036243,0.021370,0.042591,0.033074,0.008619,0.003658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.012048,0.155452,0.012955,0.012724,0.021370,0.012357,0.018038,0.033188,0.015994,0.021941,...,0.054799,0.146896,0.055373,0.103847,0.063855,1.000000,0.067035,0.083979,0.077338,0.112880
9996,0.041185,0.023099,0.225336,0.011421,0.042591,0.007293,0.033225,0.008350,0.024374,0.048804,...,0.030455,0.122245,0.045901,0.038382,0.065740,0.067035,1.000000,0.094757,0.057887,0.060383
9997,0.027924,0.028555,0.009698,0.016919,0.033074,0.014134,0.010902,0.015690,0.012249,0.041560,...,0.096783,0.156038,0.059505,0.044596,0.101673,0.083979,0.094757,1.000000,0.059989,0.067853
9998,0.011591,0.025803,0.005016,0.010655,0.008619,0.010631,0.014759,0.002436,0.007899,0.016626,...,0.030624,0.142499,0.102864,0.058796,0.212628,0.077338,0.057887,0.059989,1.000000,0.039349


In [10]:
pubmed_list = PubMed['Title'].values
pubmed_list, pubmed_list.shape

(array(['Understanding the symptoms experienced by individuals with lung cancer.',
        'Do statins improve outcomes for patients with non-small cell lung cancer? A systematic review and meta-analysis protocol.',
        'Lung cancer epidemiology, risk factors, and prevention.', ...,
        'Intravesical irrigation with distilled water during and immediately after transurethral resection and later for superficial bladder cancer.',
        'Ileal neobladder in women with bladder cancer: cancer control and functional aspects.',
        'Molecular Subtype Profiling of Urothelial Carcinoma Using a Subtype-Specific Immunohistochemistry Panel.'],
       dtype=object), (10000,))

In [11]:
import numpy as np

def similar_articles(pubmed_title, article=pubmed_list, doc_sims=doc_sim_df):
    # find pubmed index
    pubmed_idx = np.where(article == pubmed_title)[0][0]
    # get article similarities
    article_similarities = doc_sims.iloc[pubmed_idx].values
    # get top 5 similar pubmed article IDs
    similar_pubmed_idxs = np.argsort(-article_similarities)[1:6]
    # get top 5 article
    similar_article = article[similar_pubmed_idxs]
    # return the top 5 similar articles
    return similar_article

In [12]:
articles= []
articles_cluster = []
for i in filtered_clusters:
    pubmed_articles = article_clusters[article_clusters['affprop_cluster'] == i]['Title'].values.tolist()
    articles.append(pubmed_articles[0])
    articles_cluster.append(i)
articles

['Oncological Safety of Ultrasonically Activated Surgical Devices During Gastric Cancer Surgery.',
 '[Role of Circular RNA in Diagnosis, Development and Durg Resistance of Lung Cancer].',
 '[Multiple gastric adenocarcinoma of fundic gland type after H. pylori eradication: a case report].',
 'Heart failure in breast cancer survivors: implications of miR126?',
 'Using amide proton transfer to identify cervical squamous carcinoma/adenocarcinoma and evaluate its differentiation grade.',
 'Genomics of Prostate Cancer: What Nurses Need to Know.',
 'Predictors of efficacy of androgen-receptor-axis-targeted therapies in patients with metastatic castration-sensitive prostate cancer: A systematic review and',
 'Functions of circular RNAs and their potential applications in gastric cancer.',
 'The role of three-dimensional printing in the surgical management of breast cancer.',
 'Regorafenib Combined With Sirolimus Achieves Successful Treatment of Diffuse Double Lung Metastasis After Liver Transp

In [13]:
data = {'Title': [], 'Affprop_cluster': [], 'Similar_Title': [], 'Similar_cluster': []}
df_similar = pd.DataFrame(data)
df_similar

Unnamed: 0,Title,Affprop_cluster,Similar_Title,Similar_cluster


In [14]:
i = 0
for article in articles:
    print('Article Title from Cluster', articles_cluster[i], ':', article)
    print()
    print('Top 5 similar article:')
    five_similar_articles = similar_articles(pubmed_title=article)
    for one_article in five_similar_articles:
        article_details = PubMed.loc[PubMed['Title'] == one_article]
        print('From Cluster', article_details['affprop_cluster'].tolist()[0], 'Title :', one_article)
        new_row = {'Title': articles[i], 'Affprop_cluster': articles_cluster[i], 'Similar_Title': one_article, 'Similar_cluster': article_details['affprop_cluster'].tolist()[0]}
        df_similar = df_similar.append(new_row, ignore_index=True)
    print('-'*80)
    print()
    i+=1

Article Title from Cluster 247 : Oncological Safety of Ultrasonically Activated Surgical Devices During Gastric Cancer Surgery.

Top 5 similar article:
From Cluster 245 Title : Establishment and evaluation of cancer-specific human monoclonal antibody GAH for targeting chemotherapy using immunoliposomes.
From Cluster 213 Title : Establishment of Hepatocellular Cancer Induced Pluripotent Stem Cells Using a Reprogramming Technique.
From Cluster 245 Title : [Serum isoferritin assay in patients with hepatitis, cirrhosis and primary liver cancer].
From Cluster 199 Title : [Early gastric cancer].
From Cluster 411 Title : Adriamycin-mediated potentiation of cytotoxicity against freshly isolated bladder cancer cells by autologous non-activated peripheral blood lymphocytes and tumor infiltrating lymphocytes.
--------------------------------------------------------------------------------

Article Title from Cluster 8 : [Role of Circular RNA in Diagnosis, Development and Durg Resistance of Lung C

In [15]:
df_similar

Unnamed: 0,Title,Affprop_cluster,Similar_Title,Similar_cluster
0,Oncological Safety of Ultrasonically Activated...,247.0,Establishment and evaluation of cancer-specifi...,245.0
1,Oncological Safety of Ultrasonically Activated...,247.0,Establishment of Hepatocellular Cancer Induced...,213.0
2,Oncological Safety of Ultrasonically Activated...,247.0,[Serum isoferritin assay in patients with hepa...,245.0
3,Oncological Safety of Ultrasonically Activated...,247.0,[Early gastric cancer].,199.0
4,Oncological Safety of Ultrasonically Activated...,247.0,Adriamycin-mediated potentiation of cytotoxici...,411.0
5,"[Role of Circular RNA in Diagnosis, Developmen...",8.0,The problem of cancer: lung cancer as a paradi...,7.0
6,"[Role of Circular RNA in Diagnosis, Developmen...",8.0,Is a nihilist approach to lung cancer still ju...,7.0
7,"[Role of Circular RNA in Diagnosis, Developmen...",8.0,Surgery in locally advanced non-small cell lun...,34.0
8,"[Role of Circular RNA in Diagnosis, Developmen...",8.0,The P2X7 purinergic receptor: a potential ther...,23.0
9,"[Role of Circular RNA in Diagnosis, Developmen...",8.0,Combined modality therapy for lung cancer.,34.0
