In [1]:
# Import Python Packages required for analysis
import pandas as pd
import numpy as np
import plotly.express as px
import warnings 
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("metadata.csv", encoding='latin-1')
data.head(3)

Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [4]:
df_csv = data
# Select relevant columns that are needed for analysis 
df_csv = df_csv[['cord_uid','title','doi','abstract','publish_time','authors','journal','doi','pmcid','pubmed_id']]
df_csv.head()

Unnamed: 0,cord_uid,title,doi,abstract,publish_time,authors,journal,doi.1,pmcid,pubmed_id
0,ug7v899j,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,10.1186/1471-2334-1-6,PMC35282,11472636
1,02tnwd4m,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,10.1186/rr14,PMC59543,11667967
2,ejv2xln0,Surfactant protein-D and pulmonary host defense,10.1186/rr19,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,10.1186/rr19,PMC59549,11667972
3,2b73a28n,Role of endothelin-1 in lung disease,10.1186/rr44,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,10.1186/rr44,PMC59574,11686871
4,9785vg6d,Gene expression in epithelial cells in respons...,10.1186/rr61,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,10.1186/rr61,PMC59580,11686888


In [5]:
# Removing articles without abstracts
print(len(df_csv))
df_csv = df_csv[~df_csv['abstract'].isnull()]
print(len(df_csv))

1121433
820893


In [6]:
# Remvoning the section keywords from the abstracts
df_csv['abstract'] = df_csv['abstract'].apply(lambda x: 
                                          x.replace('BACKGROUND:','').replace('BACKGROUNDS:','').replace('OBJECTIVES:','')
                                          .replace('OBJECTIVE:','').replace('METHODS:','').replace('METHOD:','')
                                          .replace('RESULTS:','').replace('RESULT:','')
                                          .replace('CONCLUSION:','').replace('CONCLUSIONS:',''))

In [7]:
# Converting abstract to lower case
df_csv['abstract'] = df_csv['abstract'].apply(lambda x: x.lower())
# This removes the lines that contains text "this article is protected by copyright. all rights reserved"
df_csv['abstract'] = df_csv['abstract'].apply(lambda x: x.replace('this article is protected by copyright. all rights reserved',''))

In [9]:
# Saving the file
df_csv.to_csv('/abstract_final.csv', index=None)

In [12]:
# Removing articles without abstracts
print(len(df_csv))
df_csv = df_csv[~df_csv['publish_time'].isnull()]
print(len(df_csv))

820893
819168


In [14]:
df = df_csv
# Converting the timestamp string to date format, which Python can process
df['publish_time_new'] = pd.to_datetime(df['publish_time'], format='%Y-%m-%d',errors='coerce')

In [15]:
print(len(df))
df = df[df.publish_time_new != "NaT"]
print(len(df))

819168
819168


In [16]:
# Removing articles that are published before 1 Jan 2020. The remaining articles are 40665
import datetime
df= df[df['publish_time_new']>'2020-01-01']
len(df)

594929

In [18]:
# Detecting and removing articles with abstracts written in other than English
# The left-over articles are 40078
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
def langdet (x):
    try:
        return detect(x)
    except:
        return "NA"
df['lang'] = df['abstract'].apply(lambda x: langdet(x))
df = df[df['lang'].str.contains('en')]
df.to_csv('articles_clean_text_eng.csv', index=None)
len(df)

588430

In [37]:
import pandas as pd
df = pd.read_csv('articles_clean_text_eng.csv')

In [38]:
df1 = df[df.abstract.str.contains("covid|sars-cov-2|omicron|novel coronavirus|coronavirus 2019|novel corona virus|corona virus 2019")]
len(df1)

392932

In [39]:
unk = """'Unknown'
'Not Known'
'Little is known'
'Unrevealed'
'Uncertain' 
'Undetermined'
'Understudied'
'Unexplored'
'Not fully understood'
'Literature gap'
'Research gap'
'Knowledge gap'
'Future studies'
'Future research'
'Research problem'
'More studies'
'More research'
'Further studies'
'Further research'""" 
unk = unk.replace("\n",",").replace("'","").lower().split(",")
unk= "|".join(unk)
unk

'unknown|not known|little is known|unrevealed|uncertain |undetermined|understudied|unexplored|not fully understood|literature gap|research gap|knowledge gap|future studies|future research|research problem|more studies|more research|further studies|further research'

In [40]:
df2 = df1[df1.abstract.str.contains(unk)]
len(df2)

33206

In [5]:
# This code cleans the data for further analysis
# It removes wired characters, and punctuations
# It also lemmatize and finds word stems
# Finally it converts the clean abstract to individual words (tokens) and finds, unigrams, bigrams and trigrams
import re
import nltk
import string
from textblob import TextBlob
# nltk.download('stopwords')
# nltk.download('wordnet')
stopword = nltk.corpus.stopwords.words('english')
#my_file = open("stopwords.txt", "r")
#content = my_file.read().split('\n')
#stopword.extend(content)
stopword = list(set(stopword))
stopword = [w.strip() for w in stopword]
stopword = set(stopword)
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
from nltk import bigrams, trigrams

def removeWeirdChars(text):
    weridPatterns = re.compile("["u"\U0001F600-\U0001F64F"u"\U0001F300-\U0001F5FF"u"\U0001F680-\U0001F6FF"u"\U0001F1E0-\U0001F1FF"u"\U00002702-\U000027B0"u"\U000024C2-\U0001F251"u"\U0001f926-\U0001f937"u'\U00010000-\U0010ffff'
                               u"\u200d"u"\u2640-\u2642"u"\u2600-\u2B55"u"\u23cf"u"\u23e9"u"\u231a"u"\u3030"u"\ufe0f"u"\u2069"u"\u2066"u"\u200c"u"\u2068"u"\u2067""]+", flags=re.UNICODE)
    return weridPatterns.sub(r'', text)
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text
def tokenization(text):
    text = text.split()#re.split('\W+', text)
    text = ','.join(set(text))
    return text
def remove_stopwords(text):
    text = [word.strip() for word in text.split() if word not in stopword]
    text = ' '.join(text)
    return text
def stemming(text):
    text = [ps.stem(word) for word in text.split()]
    text = ' '.join(text)
    return text

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text.split()]
    text = ' '.join(text)
    return text
def clean_text(text):
    text_lc = " ".join([word.lower() for word in text.split() if word not in string.punctuation]) # remove puntuation
    text_rc = re.sub('[0-9]+', '', text_lc)
    tokens = re.split('\W+', text_rc)    # tokenization
    text = [word for word in tokens if word not in stopword]  # remove stopwords and stemming
    text = ' '.join(text)
    return text


df2['clean_text'] = df2['abstract'].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
wl = unk.split("|")
def sentence(x):
    s, w = [], []
    sent = x.split(".")
    if len(sent)<=1:
        s, w =["-1"], ["-1"]
    else:
        for c, i in enumerate(sent):
            for j in wl:
                if j in i:
                    w.append(j)
                    if c==0 or c==len(sent)-1:
                        s.append(i) 
                    else:
                        s.append(sent[c-1])
                        s.append(sent[c])
                        s.append(sent[c+1])
    s = list(dict.fromkeys(s))
    w = list(dict.fromkeys(w))
    C = ".".join(s)
    D = ",".join(w)
    return pd.Series([C, D])

In [42]:
df2[['sentences', 'words']] = df2['abstract'].apply(sentence)

In [14]:
df2.head()

Unnamed: 0,cord_uid,title,doi,abstract,publish_time,authors,journal,doi.1,pmcid,pubmed_id,publish_time_new,lang,clean_text,sentences,words
51,51tkew79,Molecular characterization of Streptococcus ag...,10.1186/s12879-020-4776-7,streptococcus agalctiae (group b streptococcu...,2020-01-13,"Gizachew, Mucheye; Tiruneh, Moges; Moges, Fele...",BMC Infect Dis,10.1186/s12879-020-4776-7,PMC6958622,31931732,2020-01-13,en,streptococcus agalctiae group b streptococcus ...,tetracycline resistant determinant genes such...,further studies
78,7r9v3frr,An implantable system for long-term assessment...,10.1038/s41598-020-57528-3,atrial fibrillation (af) is a progressive arrh...,2020-01-17,"Klapper-Goldstein, Hadar; Murninkas, Michael; ...",Sci Rep,10.1038/s41598-020-57528-3,PMC6969190,31953473,2020-01-17,en,atrial fibrillation af progressive arrhythmia ...,"unexpectedly, shams also developed progressiv...",further studies
122,ukcv4t4f,The expression patterns of immune response gen...,10.1371/journal.pone.0228068,hepatitis e is an enteric disease highly preva...,2020-02-03,"Ramdasi, Ashwini Y.; Arankalle, Vidya A.",PLoS One,10.1371/journal.pone.0228068,PMC6996850,32012176,2020-02-03,en,hepatitis e enteric disease highly prevalent d...,the data obtained here could be correlated wi...,future studies
448,mldwn4sj,Acute respiratory distress syndrome subphenoty...,10.1186/s12931-020-01337-9,subphenotypes were recently reported within c...,2020-04-07,"Carla, Adrien; Pereira, Bruno; Boukail, Hanifa...",Respir Res,10.1186/s12931-020-01337-9,PMC7137453,32264897,2020-04-07,en,subphenotypes recently reported within clinica...,subphenotypes were recently reported within c...,unknown
658,osk0uzw5,The structureâactivity relationship review o...,10.1007/s11418-019-01383-8,abstract: morus genus plants are mainly distri...,2020-01-02,"Yan, Jiejing; Ruan, Jingya; Huang, Peijian; Su...",J Nat Med,10.1007/s11418-019-01383-8,PMC7205851,31897975,2020-01-02,en,abstract morus genus plants mainly distributed...,in the light of the references published over...,further research


In [16]:
len(df2),len(df2.pubmed_id==None)

(35082, 35082)

In [43]:
df2.to_csv("articles_with_sentences and words1.csv", index=None)

In [3]:
import pandas as pd
df2 = pd.read_csv("articles_with_sentences and words1.csv")

# Analysis after Colab

In [1]:
import pandas as pd
df2 = pd.read_csv("articles_with_topics.csv")

In [2]:
df2.columns

Index(['cord_uid', 'title', 'doi', 'abstract', 'publish_time', 'authors',
       'journal', 'doi.1', 'pmcid', 'pubmed_id', 'publish_time_new', 'lang',
       'sentences', 'words', 'topics', 'new_topics'],
      dtype='object')

In [5]:
len(set(df2.topics))

191

In [38]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def ubt(x):
    word_data = x
    #word_data = "lime soda is the best selling item in fast food stores"

    # load nltk's stop word list
    stop_words = list(stopwords.words('english'))
    # extend the stop words list
    #stop_words.extend(["best", "selling", "item", "fast"])

    # tokenise the string and remove stop words
    word_tokens = word_tokenize(word_data)
    clean_word_data = [w for w in word_tokens if not w.lower() in stop_words]

    # get bigrams
    bigrams_list = [" ".join(item) for item in nltk.bigrams(clean_word_data)]
    #print(bigrams_list)

    # get trigrams 
    trigrams_list = [" ".join(item) for item in nltk.trigrams(clean_word_data)]
    #print(trigrams_list)
    lst = []
    lst.extend(clean_word_data)
    lst.extend(bigrams_list)
    lst.extend(trigrams_list)
    from collections import Counter
    cnt = Counter(lst)
    del cnt["."]
    return cnt.most_common(50)

In [55]:
import re
dst =""
for i in range(191):
    st = " ".join(df2[df2.topics==i]["sentences"].tolist())
    st = re.sub(r'[^a-zA-Z0-9. ]', '', st)
    dst+="topic-"+str(i) 
    dst +="\n"
    dst += str(ubt(st))
    dst +="\n"

In [56]:
import re
dst1 =""
for i in range(50):
    st = " ".join(df2[df2.new_topics==i]["sentences"].tolist())
    st = re.sub(r'[^a-zA-Z0-9. ]', '', st)
    dst1+="topic-"+str(i) 
    dst1 +="\n"
    dst1 += str(ubt(st))
    dst1 +="\n"

In [57]:
dst += "\n\n\n *********************Reduced Topics*********************\n\n\n"
dst +=dst1
print(dst)

topic-0
[('telehealth', 184), ('care', 181), ('telemedicine', 148), ('patients', 147), ('pandemic', 135), ('covid19', 121), ('research', 117), ('use', 94), ('future', 90), ('covid19 pandemic', 83), ('health', 79), ('services', 67), ('patient', 67), ('access', 55), ('needed', 54), ('studies', 52), ('known', 46), ('however', 45), ('future research', 45), ('little', 44), ('little known', 43), ('study', 42), ('unknown', 40), ('may', 40), ('satisfaction', 40), ('inperson', 37), ('. future', 37), ('. however', 34), ('among', 33), ('using', 32), ('healthcare', 32), ('. research', 32), ('technology', 31), ('clinical', 30), ('visits', 30), ('impact', 29), ('pandemic .', 29), ('delivery', 28), ('outcomes', 28), ('health care', 28), ('care .', 28), ('medical', 25), ('barriers', 25), ('implementation', 25), ('practice', 25), ('interventions', 24), ('improve', 23), ('review', 22), ('research needed', 22), ('pediatric', 21)]
topic-1
[('covid19', 301), ('patients', 231), ('risk', 90), ('unknown', 85)

In [58]:
with open('Topics Keywords with frequencies.txt', 'w') as f:
    f.write(dst)

# After Manually Labelled

In [3]:
import pandas as pd
df = pd.read_csv("articles_with_topics.csv")
df.head(3)

Unnamed: 0,cord_uid,title,doi,abstract,publish_time,authors,journal,doi.1,pmcid,pubmed_id,publish_time_new,lang,sentences,words,topics,new_topics
0,azdtbnqj,Combined Hyperglycemic Hyperosmolar Syndrome a...,10.1155/2021/6429710,although most children with coronavirus diseas...,2021-11-27,"Tseng, Yu Shan; Tilford, Bradley; Sethuraman, ...",Case Rep Crit Care,10.1155/2021/6429710,PMC8627355,34791286.0,2021-11-27,en,"as the pandemic continues, clinicians should ...",further studies,116,-1
1,k8nbgxsf,Evidences suggesting a possible role of Vitami...,10.4103/ijp.ijp_654_20,the severe acute respiratory syndrome coronavi...,2021-11-24,"Singh, Shruti; Singh, C. M.; Ranjan, Alok; Kum...",Indian J Pharmacol,10.4103/ijp.ijp_654_20,PMC8641745,34854410.0,2021-11-24,en,while some risk factors such as the presence ...,further research,-1,-1
2,t3tlj38t,Predictors to Use Mobile Apps for Monitoring C...,10.2196/28416,ehealth apps have been recognized as a valuab...,2021-12-20,"Jansen-Kosterink, Stephanie; Hurmuz, Marian; d...",JMIR Form Res,10.2196/28416,PMC8691407,34818210.0,2021-12-20,en,ehealth apps have been recognized as a valuab...,unknown,-1,-1


In [23]:
len(df)

12853

In [20]:
tp = {
    -1:"Unlabelled",
    0:"Unlabelled",
    1:"TeleHealth",
    2: "Thrombosis",
    3:"Maternal & Neonatal (Pregnancy)",
    4:"Cardiovascular Complications",
    5: "Neurological Complications",
    6: "Surgical Considerations",
    7:"Respiratory (Pulmonary) Complications",
    8 :"Prediction Models",
    9: "Vaccination Efficacy & Safety",
    10:"Mental Health",
    11:"Education",
    12:"Healthcare workers’ Mental Health",
    13:"Vaccination Hesitancy",
    14:"Vaccination Hesitancy",
    15:"Respiratory (Pulmonary) Complications",
    16:"Origin of COVID-19",
    17:"Angiotensin Converting Enzyme (ACE)",
    18:"Media & Communication",
    19:"Origin of COVID-19",
    20:"Diabetes",
    21:"Origin of COVID-19",
    22:"General",
    23:"Organ Transplantation",
    24:"Treatment of COVID-19",
    25:"Mortality",
    26: "Dietary Supplementation",
    27:"General",
    28:"General",
    29: "Obesity",
    30:"Maternal & Neonatal (Pregnancy)",
    31: "Pediatrics",
    32: "Transmission",
    33: "Hepatic Complications",
    34: "Herbal Medicine",
    35: "Public Policies & Precaution Measures",
    36: "Ethnicity",
    37: "Cancer",
    38:"General",
    39: "Food Security",
    40: "Wastewater Surveillance",
    41:"Vaccination Hesitancy",
    42:"General",
    43: "Dental & Oral Health",
    44: "Face Mask",
    45:"TeleHealth",
    46: "Olfactory (Smell & Taste)",
    47: "Emerging Variants",
    48: "Support System", 
    49: "Immune System"
}

In [21]:
df["Topic Labels"]=df["new_topics"].apply(lambda x: tp[x])

In [22]:
df.groupby(by=["Topic Labels"])["title"].count()

Topic Labels
Angiotensin Converting Enzyme (ACE)       113
Cancer                                     64
Cardiovascular Complications              216
Dental & Oral Health                       59
Diabetes                                   95
Dietary Supplementation                    80
Education                                 136
Emerging Variants                          56
Ethnicity                                  65
Face Mask                                  58
Food Security                              62
General                                   370
Healthcare workers’ Mental Health         136
Hepatic Complications                      70
Herbal Medicine                            66
Immune System                              52
Maternal & Neonatal (Pregnancy)           294
Media & Communication                     109
Mental Health                             138
Mortality                                  87
Neurological Complications                177
Obesity              