In [1]:
import os
import glob
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Input directory: an "xml" folder under the current working directory.
# os.path.join inserts the separator of the host OS, so this resolves
# correctly on Windows and on POSIX systems alike.
FILE_PATH = os.path.join(os.getcwd(), 'xml')
# glob returns matches in arbitrary order; sort so that files are always
# processed in the same sequence.
files = sorted(glob.glob(os.path.join(FILE_PATH, '*.xml')))

In [3]:
def get_soup(filename):
    """Read the UTF-8 text file at ``filename`` and parse it with BeautifulSoup.

    The file is handed to BeautifulSoup using the built-in "html.parser"
    backend and the resulting soup object is returned.

    NOTE(review): the rest of the notebook looks tags up by lowercase names;
    that presumably relies on this parser's handling of tag case -- confirm
    before switching to a different parser.
    """
    with open(filename, mode='r', encoding='utf-8') as handle:
        parsed = BeautifulSoup(handle, "html.parser")
        return parsed

In [4]:
# Inclusive range of publication years that are tallied.
FIRST_YEAR = 1991
LAST_YEAR = 2020

# Maps each descriptor name to a list of per-year counts; index 0 holds the
# count for FIRST_YEAR and the last index the count for LAST_YEAR.
mesh_terms_dic = {}


def _publication_year(citation):
    """Return the publication year of one citation element, or None if unreadable.

    The <year> element under the journal issue's publication date is preferred.
    When it is absent, the first four characters of the <medlinedate> text are
    used instead. None is returned when neither element exists or when the
    text is not an integer.
    """
    node = citation.select_one('article > journal > journalissue > pubdate > year')
    if node is not None:
        text = node.text
    else:
        node = citation.select_one('article > journal > journalissue > pubdate > medlinedate')
        if node is None:
            return None
        text = node.text[0:4]
    try:
        return int(text)
    except ValueError:
        # Only a non-numeric date counts as "no year"; any other error is
        # unexpected and is allowed to propagate.
        return None


def collect_mesh_terms(soup):
    """Tally descriptor names per publication year for every citation in ``soup``.

    For each 'medlinecitation' element whose publication year lies between
    FIRST_YEAR and LAST_YEAR (inclusive):
      * the number of its 'descriptorname' elements is appended to the
        module-level list ``meshterm_length``, and
      * the per-year counter in ``mesh_terms_dic`` is incremented once for
        each descriptor name.

    Citations without a readable year, or with a year outside the range, are
    skipped entirely.

    Args:
        soup: parsed document exposing ``find_all``; each citation element
            must support ``select_one`` and ``find_all``.

    Returns:
        None. Results accumulate in ``mesh_terms_dic`` and ``meshterm_length``;
        the latter must already exist when this function is called.
    """
    n_years = LAST_YEAR - FIRST_YEAR + 1

    for item in tqdm(soup.find_all('medlinecitation')):
        year = _publication_year(item)
        if year is None or not (FIRST_YEAR <= year <= LAST_YEAR):
            continue
        year_index = year - FIRST_YEAR

        meshterms = item.find_all('descriptorname')
        meshterm_length.append(len(meshterms))
        for term in meshterms:
            # setdefault builds the zeroed counter only the first time a term is seen.
            year_freq_list = mesh_terms_dic.setdefault(term.text, [0] * n_years)
            year_freq_list[year_index] += 1


In [5]:
# Number of descriptor names per counted citation: collect_mesh_terms appends
# one entry for every citation whose publication year is within range.
meshterm_length=[]

In [6]:
# Empty both accumulators first so that re-running this cell recomputes the
# totals instead of adding the same files' counts on top of the previous run.
# They are cleared in place (not rebound) so existing references stay valid.
mesh_terms_dic.clear()
meshterm_length.clear()

# Parse every input file and fold its citations into the accumulators.
for file in tqdm(files):
    soup = get_soup(file)
    collect_mesh_terms(soup)

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/9713 [00:00<?, ?it/s]

  0%|          | 0/9994 [00:00<?, ?it/s]

  0%|          | 0/9255 [00:00<?, ?it/s]

  0%|          | 0/9738 [00:00<?, ?it/s]

  0%|          | 0/9078 [00:00<?, ?it/s]

  0%|          | 0/8825 [00:00<?, ?it/s]

  0%|          | 0/9712 [00:00<?, ?it/s]

  0%|          | 0/9025 [00:00<?, ?it/s]

In [7]:
# Number of distinct descriptor names collected across all processed files.
len(mesh_terms_dic)

12898

In [8]:
# Pair every term with its total count over all years. The total comes first
# in each tuple so that ordering the tuples ranks terms by total.
freq_list = [
    (sum(yearly_counts), mesh_term)
    for mesh_term, yearly_counts in mesh_terms_dic.items()
]

In [9]:
# Descending sort of the (total, term) tuples: largest total first. Tuples
# compare element by element, so equal totals are ordered by term string,
# also descending.
freq_list.sort(reverse=True)

In [10]:
# One output row for each of the first 1000 entries of freq_list, laid out as
# [rank, term, <one count per year ...>, total]. Ranks start at 1.
out_form_list = []
for rank, (freq, term) in enumerate(freq_list[:1000], start=1):
    out_form_list.append([rank, term, *mesh_terms_dic[term], freq])

In [11]:
# Column headers: rank, term name, one integer column per year 1991-2020,
# and finally the total over all years.
columns_list = ['rank', 'Mesh Term', *range(1991, 2021, 1), 'total appearance']

In [12]:
# Sanity check: number of rows prepared for the output table.
len(out_form_list)

1000

In [13]:
# Build the result table: one row per entry of out_form_list, labelled with columns_list.
df = pd.DataFrame(out_form_list, columns=columns_list)

In [14]:
# Preview the first rows of the table.
df.head()

Unnamed: 0,rank,Mesh Term,1991,1992,1993,1994,1995,1996,1997,1998,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,total appearance
0,1,"Carcinoma, Hepatocellular",775,793,890,920,929,988,975,1074,...,3081,3768,4384,4191,4712,4109,4828,4771,5733,71296
1,2,Humans,727,751,826,844,866,935,895,988,...,2930,3611,4212,4028,4517,3973,4656,4614,5522,67671
2,3,Liver Neoplasms,729,753,829,844,829,869,831,894,...,2925,3608,4245,4059,4556,3970,4703,4649,5633,66587
3,4,Male,336,378,435,491,418,475,502,538,...,1794,2134,2497,2414,2633,2198,2636,2617,2713,37544
4,5,Female,272,324,368,408,384,403,430,431,...,1561,1882,2166,2194,2348,1944,2350,2335,2406,32784


In [15]:
# Write the table to an Excel workbook at this relative path;
# index=False leaves the DataFrame's row index out of the sheet.
df.to_excel('MeSH_terms_Top1000(0526).xlsx', index=False)