#### Clustering Asthma among COVID-19 papers

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
import nltk
import string
from nltk.stem import PorterStemmer
from sklearn.cluster import KMeans


In [3]:
import plotly.express as px

Only those papers where the word "asthma" appears at least once in their abstract were selected. The papers were downloaded in order to be more easily handled, locally. 

In [4]:
asthma_df = pd.read_csv("asthma_data.csv")
asthma_df.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,abstract_lower,title_lower
0,qva0jt86,4ba79e54ecf81b30b56461a6aec2094eaf7b7f06,PMC,Relevance of human metapneumovirus in exacerba...,10.1186/1465-9921-6-150,PMC1334186,16371156.0,cc-by,BACKGROUND AND METHODS: Human metapneumovirus ...,2005-12-21,...,Respir Res,,,,document_parses/pdf_json/4ba79e54ecf81b30b5646...,document_parses/pmc_json/PMC1334186.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,,background and methods: human metapneumovirus ...,relevance of human metapneumovirus in exacerba...
1,chz8luni,d68d71553d3a31381c0c3851351f912a9a7be1c9,PMC,Surfactant therapy for acute respiratory failu...,10.1186/cc5944,PMC2206432,17573963.0,cc-by,INTRODUCTION: Exogenous surfactant is used to ...,2007-06-15,...,Crit Care,,,,document_parses/pdf_json/d68d71553d3a31381c0c3...,document_parses/pmc_json/PMC2206432.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,,introduction: exogenous surfactant is used to ...,surfactant therapy for acute respiratory failu...
2,3zh8jmc2,fe2000f280297c40bc53ce95d703a9ca6aac19fd,PMC,Differential Regulation of Type I Interferon a...,10.1371/journal.ppat.1000587,PMC2736567,19806178.0,cc-by,A number of paramyxoviruses are responsible fo...,2009-09-18,...,PLoS Pathog,,,,document_parses/pdf_json/fe2000f280297c40bc53c...,document_parses/pmc_json/PMC2736567.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,,a number of paramyxoviruses are responsible fo...,differential regulation of type i interferon a...
3,7p3b6tyf,3ccbd07ee1865e4f2afffdb6cc8b6039ab605ee7,PMC,The Tennessee Children's Respiratory Initiativ...,10.1111/j.1440-1843.2010.01743.x,PMC2992986,20409023.0,no-cc,Background and objective: The ‘attack rate’ of...,2010-04-08,...,Respirology,,,,document_parses/pdf_json/3ccbd07ee1865e4f2afff...,document_parses/pmc_json/PMC2992986.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,,background and objective: the ‘attack rate’ of...,the tennessee children's respiratory initiativ...
4,xrsyj1tc,fa88fbb8716e5fca7d513bcb5a0a608456a59205,PMC,Analysing the eosinophil cationic protein - a ...,10.1186/1465-9921-12-10,PMC3030543,21235798.0,cc-by,Eosinophil granulocytes reside in respiratory ...,2011-01-14,...,Respir Res,,,,document_parses/pdf_json/fa88fbb8716e5fca7d513...,document_parses/pmc_json/PMC3030543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,eosinophil granulocytes reside in respiratory ...,analysing the eosinophil cationic protein - a ...


There are 2567 papers containing the word "asthma", among the coronavirus-related publications 

In [5]:
asthma_df.shape

(2567, 21)

We divide the papers between those published before the new coronavirus SARS-CoV-2 has appeared (December 2019) and those published after the COVID-19 outbreak. We pick December 2019 as the cut-off date.

In [6]:
asthma_before_covid = asthma_df.loc[asthma_df['publish_time']<"2019-12-01"].reset_index(drop=True)
asthma_after_covid = asthma_df.loc[asthma_df['publish_time']>="2019-12-01"].reset_index(drop=True)

Below, for both these two groups of papers, we follow the same data processing. The steps followed are:

* Tokenization of the papers' abstracts text
* Stemming of the tokens 
* Use of the TfidfVectorizer 

#### Papers before COVID-19

In [7]:
texts_before = asthma_before_covid["abstract_lower"].tolist()

In [8]:
def custom_tokenizer(str_input):
    
    stemmer = PorterStemmer()
    words = nltk.word_tokenize(str_input)
    
    words = [word.replace('â¡', '') for word in words]
    words = [word.replace('â¢', '') for word in words]
    words = [word.replace('â£', '') for word in words]
    
    words = [''.join(c for c in word if c not in string.punctuation+'©±×≤≥●＜--“”→„') for word in words]
    words = [word for word in words if word not in ['‘', '’', '„']]
    
    
    words = [word for word in words if word]
    words = [word for word in words if not any(char.isdigit() for char in word)]
    
    #remove stop words before stemming - nltk list of stop words - read file and tolist
    words = [stemmer.stem(word) for word in words]
    return words

In [9]:
vec_before = TfidfVectorizer(tokenizer=custom_tokenizer,
                             max_features=2000,
                      stop_words='english')
matrix_before = vec_before.fit_transform(texts_before)
df = pd.DataFrame(matrix_before.toarray(), columns=vec_before.get_feature_names())
df.head()

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,aa,abdelbaset,abil,abl,abnorm,abov,absenc,absent,absolut,abstract,...,zhang,zu,à,étude,été,α,β,–,•,−
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The Kmeans is explained here:
https://towardsdatascience.com/understanding-k-means-clustering-in-machine-learning-6a6e67336aa1

In [10]:
number_of_clusters=20
km_before = KMeans(n_clusters=number_of_clusters)
km_before.fit(matrix_before)

KMeans(n_clusters=20)

Let's have an over view of our cluster's centers (centroids) anf labels. After that, we get the top terms for every cluster. In other wrods, we see which are the most frequently mentioned words per cluster. Note: since we have used Stemming, we only have the root of the words now.

In [11]:
centroids, labels = km_before.cluster_centers_, km_before.labels_
print(centroids)
print(labels)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.00704329 ... 0.00377553 0.         0.        ]
 [0.         0.         0.00166836 ... 0.01091459 0.         0.        ]
 ...
 [0.         0.         0.00534464 ... 0.00140264 0.         0.        ]
 [0.         0.         0.         ... 0.01674118 0.         0.        ]
 [0.00421719 0.         0.         ... 0.         0.         0.        ]]
[ 7 14 19 ...  2  2  9]


In [12]:
print("Top terms per cluster:")
order_centroids = km_before.cluster_centers_.argsort()[:, ::-1]
terms = vec_before.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :20]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: pneumonia m asthma children exacerb chlamydia atyp mycoplasma infect patient c group wa posit pcr test role studi symptom communityacquir
Cluster 1: rv rvc infect children rsv wa cell detect wheez rhinoviru rva viral asthma speci respiratori exacerb viru human bronchiol type
Cluster 2: asthma health diseas effect use thi air ha studi manag patient children pulmonari indoor care medic review includ treatment public
Cluster 3: influenza vaccin pregnanc pneumonia women patient asthma respiratori pregnant complic infect viru adult wa case condit risk pandem dure diseas
Cluster 4: la le l en d une et el y que lo à enfant par asthm est chez respiratoir été étude
Cluster 5: respiratori infect virus rhinoviru cold human ill lower diseas tract elderli acut caus antivir otiti host coronavirus new exacerb common
Cluster 6: infect asthma wheez respiratori develop viral infant ill children earli rsv life childhood factor risk associ bronchiol caus sever viru
Cluste

In [13]:
order_centroids
#word with index 1049 has highest Tfidf value in the first array

array([[1367, 1054,  154, ..., 1214, 1215,    0],
       [1599, 1601,  885, ..., 1167, 1168,    0],
       [ 154,  760,  513, ...,  964,  962,    0],
       ...,
       [1052,  653,  275, ..., 1106, 1108,    0],
       [1184, 1317, 1937, ..., 1183, 1185,    0],
       [  74,  275, 1566, ..., 1089, 1096,  999]], dtype=int64)

PCA and its implementation in Python is explained here: https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html

In [14]:
T = preprocessing.Normalizer().fit_transform(df)
# Dimesionality reduction to 2
pca_model = PCA(n_components=2)
pca_model.fit(T) 
T = pca_model.transform(T) 

# transform the 'centroids of KMean'

centroid_pca = pca_model.transform(centroids)
# print(centroid_pca)

In [15]:
asthma_before_covid['labels'] = km_before.labels_
asthma_before_covid['pca_1'] = T[:, 0]
asthma_before_covid['pca_2'] = T[:, 1]

In [16]:
asthma_before_covid['labels'] = asthma_before_covid['labels'].astype(str)

In [17]:
fig = px.scatter(asthma_before_covid, 
                 x="pca_1", 
                 y="pca_2", 
                 color="labels",
                 hover_data=['title'])

In [19]:
fig.show()