In [1]:
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
import sys
import time

from flashgeotext.geotext import GeoText
from geopy.geocoders import Nominatim

In [2]:
articles = pd.read_csv('data/characteristics_labelled.csv', index_col=0)

In [3]:
articles.authors = articles.authors.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.author_affils = articles.author_affils.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.keywords = articles.keywords.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.references_pmids = articles.references_pmids.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.mesh_terms = articles.mesh_terms.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)

In [4]:
articles['affil_countries'] = np.nan
articles['affil_countries'] = articles['affil_countries'].astype(object)
articles['affil_countries_unique'] = np.nan
articles['affil_countries_unique'] = articles['affil_countries'].astype(object)
articles['affil_first_country'] = np.nan
articles['affil_last_country'] = np.nan

In [5]:
def find_affil_countries(affils: list, retry_count = 5):
    
    geolocator = Nominatim(user_agent='health_ai_scraper')
    geotext = GeoText()
    
    if affils == affils: # Check to make sure not NaN
        country_list = []
        location = None
        
        last_affil = None
        last_country = None
        
        try_count = 0
    
        for affil in affils:
            if affil == last_affil: # Check to see if we've seen this before and take a shortcut if we have
                country_list = country_list + last_country

            else:
                while try_count < retry_count:
                    try:
                        last_affil = affil # Set that we've examined this affil
                
                        countries = [*geotext.extract(input_text=affil, span_info=True)['countries'].keys()] # Look for countries
                
                        if (len(countries) == 0): # If we dont find a country look harder
                            cities = [*geotext.extract(input_text=affil, span_info=True)['cities'].keys()]
                            if len(cities) > 0:
                                location = geolocator.geocode(cities[-1])
                            else:
                                location = geolocator.geocode(' '.join(affil.split(" ")[-2:]))
                                if location == None:
                                    location = geolocator.geocode(affil)
                                    if location == None:
                                        tqdm.write("Can't find a country for:")
                                        tqdm.write(affil)
                                        country_list = country_list + [np.nan]
                                        last_country = [np.nan]
                            
                        else: # If we do find a country then att it to the list and set the last_country variable
                            country_list = country_list + countries
                            last_country = countries
        
                        if location != None: # If we found an address using the other search techniques
                            countries = [*geotext.extract(input_text=location.address, span_info=True)['countries'].keys()]
                            country_list = country_list + countries
                            last_country = countries
                            
                        break
                            
                    except:
                        try_count += 1
                        tqdm.write(f"Error parsing {affil}, trying again for a maximum of 5 times.")
                        
    else: # If the affil is NaN then make the country list NaN
        country_list = [np.nan]
        
    unique_countries = list(set(country_list))
    first_affil_country = country_list[0]
    last_affil_country = country_list[-1]    
        
    return country_list, unique_countries, first_affil_country, last_affil_country

In [6]:
def parse_affil_countries(df, max_consecutive_failures = 5, filter_column = 'include'):
    
    consecutive_failures = 0
    
    country_df = df.copy()
    
    country_df['affil_countries'] = np.nan
    country_df['affil_countries_unique'] = np.nan
    country_df['affil_countries_first'] = np.nan
    country_df['affil_countries_last'] = np.nan
    
    with tqdm(total=country_df.shape[0], file=sys.stdout) as pbar:
        for row in country_df[country_df[filter_column] == 1].itertuples():

            try:
                affils = row.author_affils
    
                country_list, unique_countries, first_affil_country, last_affil_country = find_affil_countries(affils)
    
                country_df.loc[row.Index, 'affil_countries'] = str(country_list)
                country_df.loc[row.Index, 'affil_countries_unique'] = str(list(set(country_list)))
                country_df.loc[row.Index, 'affil_countries_first'] = country_list[0]
                country_df.loc[row.Index, 'affil_countries_last'] = country_list[-1]
            
                consecutive_failures = 0
        
            except Exception as e:
                tqdm.write(e)
                consecutive_failures += 1
                if consecutive_failures >=  max_consecutive_failures:
                    tqdm.write("Failed too many in a row, something is broken, stopping and returning possibly partially labelled DF...")
                    break
                
            pbar.update(1)
            
    country_df.replace("[nan]", np.nan, inplace=True)
            
    return country_df

In [7]:
consecutive_failures = 0

with tqdm(total=articles.shape[0], file=sys.stdout) as pbar:
    for row in articles.itertuples():

        try:
            affils = row.author_affils
    
            country_list, unique_countries, first_affil_country, last_affil_country = find_affil_countries(affils)
    
            articles.loc[row.Index, 'affil_countries'] = str(country_list)
            articles.loc[row.Index, 'affil_countries_unique'] = str(list(set(country_list)))
            articles.loc[row.Index, 'affil_first_country'] = country_list[0]
            articles.loc[row.Index, 'affil_last_country'] = country_list[-1]
            
            consecutive_failures = 0
        
        except:
            consecutive_failures += 1
            if consecutive_failures > 5:
                print("Failed too many in a row, something is broken, stopping...")
                break
            time.sleep(1)
                
        pbar.update(1)

Can't find a country for:                                                                                              
Cosmo Artificial Intelligence-AI Ltd.                                                                                  
Can't find a country for:                                                                                              
Ming Hsieh Department of Electrical and Computer Engineering (R.M.L., A.A.J.).                                         
Can't find a country for:                                                                                              
Departments of Radiology (P.E.K.).                                                                                     
Can't find a country for:                                                                                              
Ming Hsieh Department of Electrical and Computer Engineering (R.M.L., A.A.J.) ajoshi@usc.edu.                          
Can't find a country for:               

Can't find a country for:                                                                                              
Department of Biomedical EngineeringCase Western Reserve UniversityClevelandOH44106USA.                                
Can't find a country for:                                                                                              
The Tony and Leona Campane Center for Excellence in Image-Guided Surgery and Advancing Imaging ResearchCleveland Clinic Cole Eye InstituteClevelandOH44106USA.
Can't find a country for:                                                                                              
Department of Biomedical EngineeringCase Western Reserve UniversityClevelandOH44106USA.                                
Can't find a country for:                                                                                              
The Tony and Leona Campane Center for Excellence in Image-Guided Surgery and Advancing Imaging ResearchCleveland Clinic C

Can't find a country for:                                                                                              
Department of Epidemiology and Biostatistics.                                                                          
Can't find a country for:                                                                                              
Data used in preparation of this article were obtained from the Alzheimer's Disease Neuroimaging Initiative (ADNI) database (adni.loni.usc.edu). As such, the investigators within the ADNI contributed to the design and implementation of ADNI and/or provided data but did not participate in analysis or writing of this report. A complete listing of ADNI investigators can be found at: http://adni.loni.usc.edu/wp-content/uploads/how_to_apply/ADNI_Acknowledgement_List.pdf.
Can't find a country for:                                                                                              
Data used in the preparation of this article was 

Supernus Pharmaceuticals, Inc.. Electronic address: anasser@supernus.com.                                              
Can't find a country for:                                                                                              
Research and Practical Clinical Center for Diagnostics and Telemedicine Technologies.                                  
Can't find a country for:                                                                                              
Research and Practical Clinical Center for Diagnostics and Telemedicine Technologies.                                  
Can't find a country for:                                                                                              
Skolkovo Institute of Science and Technology; Kharkevich Institute for Information Transmission Problems.              
Can't find a country for:                                                                                              
Research and Practical Clinical Center f

4Renishaw plc, Spectroscopy Products Division, Gloucestershire.                                                        
Can't find a country for:                                                                                              
Department of Statistics and Actuarial Sciences.                                                                       
Can't find a country for:                                                                                              
3Neurosurgery.                                                                                                         
Can't find a country for:                                                                                              
3Neurosurgery.                                                                                                         
Can't find a country for:                                                                                              
Department of Radiology (W.B., J.S., M.P

Can't find a country for:                                                                                              
6School of Biomedical EngineeringUniversity of Western OntarioLondonONN6A 3K7Canada.                                   
Can't find a country for:                                                                                              
1Department of Electrical and Computer EngineeringUniversity of Western OntarioLondonONN6A 3K7Canada.                  
Can't find a country for:                                                                                              
1Department of EngineeringFerrara University44122FerraraItaly.                                                         
Can't find a country for:                                                                                              
2MIST E-R40129BolognaItaly.                                                                                            
Can't find a country for:               

Can't find a country for:                                                                                              
3College of ComputingMichigan Technological UniversityHoughtonMI49931USA.                                              
Can't find a country for:                                                                                              
1Department of Computer Science and EngineeringShanghai Jiao Tong UniversityShanghai200240China.                       
Can't find a country for:                                                                                              
2Department of Computer ScienceUniversity of SheffieldSheffieldS1 4DPU.K.                                              
Can't find a country for:                                                                                              
1Department of Computer Science and EngineeringShanghai Jiao Tong UniversityShanghai200240China.                       
Can't find a country for:               

Can't find a country for:                                                                                              
3Department of NeurosciencesUniversity of CaliforniaSan DiegoCA92106USA.                                               
Can't find a country for:                                                                                              
2Department of Electrical and Computer EngineeringUniversity of CaliforniaSan DiegoCA92106USA.                         
Error parsing Neurology (D.S.)., trying again for a maximum of 5 times.                                                
Error parsing Neurology (D.S.)., trying again for a maximum of 5 times.                                                
Error parsing Neurology (D.S.)., trying again for a maximum of 5 times.                                                
Error parsing Neurology (D.S.)., trying again for a maximum of 5 times.                                                
Error parsing Neurology (D.S.)., trying 

Precision Neurotherapeutics Lab (A.J.H.-D., K.W.S., P.R.J, B.R.B., K.R.S.).                                            
Can't find a country for:                                                                                              
Medical Informatics & Clinical Epidemiology, and.                                                                      
Can't find a country for:                                                                                              
Medical Informatics & Clinical Epidemiology, and.                                                                      
Can't find a country for:                                                                                              
Division of Gastroenterology, Department of Medicine, Olive View-UCLA Medical Center, Sylmar, California. Electronic address: jtabibian@dhs.lacounty.gov.
Can't find a country for:                                                                                              
3Neuro

Neurosurgery (G.C.).                                                                                                   
Can't find a country for:                                                                                              
Departments of Radiology (M.B., M.K., M.-Y.S., D.C.).                                                                  
Can't find a country for:                                                                                              
Neuro-Oncology (D.B.).                                                                                                 
Can't find a country for:                                                                                              
Departments of Radiology (M.B., M.K., M.-Y.S., D.C.) chowd3@uci.edu.                                                   
Can't find a country for:                                                                                              
4 Department of Biostatistics, and.     

Division of Vascular Surgery, Stanford Health Care, Stanford, Calif. Electronic address: nleeper@stanford.edu.         
Can't find a country for:                                                                                              
Psychology (M.L.C.).                                                                                                   
Can't find a country for:                                                                                              
MIT Computer Science And Artificial Intelligence Laboratory, Tel.: +1-617-800-3033, souillardmandar@csail.mit.edu.     
Can't find a country for:                                                                                              
MIT Computer Science And Artificial Intelligence Laboratory, Tel.: +1-617-253-5879, davis@csail.mit.edu.               
Can't find a country for:                                                                                              
MIT Computer Science And Artificial Inte

In [10]:
articles.sample(25)

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,...,subspec_arrhyt,spec_dm,subspec_retina,spec_haem,spec_obs,spec_renal,affil_countries,affil_countries_unique,affil_first_country,affil_last_country
52691,33191075,10.1016/j.euroneuro.2020.11.009,Higher baseline interleukin-1β and TNF-α hampe...,Raised pro-inflammatory immune/inflammatory se...,2020-11-13,2020-11-17,Journal Article,eng,European neuropsychopharmacology : the journal...,Eur Neuropsychopharmacol,...,0.0,0.0,0.0,0.0,0.0,0.0,"['Italy', 'Italy', 'Italy', 'Italy', 'Italy', ...",['Italy'],Italy,Italy
25961,34364092,10.1016/j.sleep.2021.07.023,Digital phenotyping of sleep patterns among he...,This study aimed to identify sleep disturbance...,2021-07-19,2021-08-08,Journal Article,eng,Sleep medicine,Sleep Med,...,0.0,0.0,0.0,0.0,0.0,0.0,"['United States', 'United States', 'United Sta...",['United States'],United States,United States
166735,18819544,,Assessment of gastric cancer survival: using a...,This study is designed to assess the applicati...,,2008-09-30,Journal Article,eng,Pakistan journal of biological sciences : PJBS,Pak J Biol Sci,...,0.0,0.0,0.0,0.0,0.0,0.0,[nan],[nan],,
106874,29643160,10.1136/bmjopen-2017-020124,Using machine learning techniques to develop f...,Mortality and morbidity following surgery are ...,2018-04-10,2018-04-13,Journal Article,eng,BMJ open,BMJ Open,...,0.0,0.0,0.0,0.0,0.0,1.0,"['United States', 'United States', 'United Sta...",['United States'],United States,United States
47395,33432267,10.1007/s11760-020-01820-2,Evaluation of deep learning-based approaches f...,"The COVID-19, novel coronavirus or SARS-Cov-2,...",2021-01-07,2021-01-13,Journal Article,eng,"Signal, image and video processing",Signal Image Video Process,...,0.0,0.0,0.0,0.0,0.0,0.0,"['China', 'China', 'China', 'China']",['China'],China,China
58278,32941514,10.1371/journal.pone.0239071,Association between metabolic risk factors and...,This study aims to investigate correlation bet...,2020-09-17,2020-09-18,Journal Article,eng,PloS one,PLoS One,...,0.0,1.0,1.0,0.0,0.0,0.0,"['South Korea', 'South Korea', 'South Korea', ...",['South Korea'],South Korea,South Korea
73789,32078557,10.1109/TCBB.2020.2973978,Cross-Domain Classification Model With Knowled...,Conventional classification models for epilept...,2021-02-03,2020-02-23,Journal Article,eng,IEEE/ACM transactions on computational biology...,IEEE/ACM Trans Comput Biol Bioinform,...,0.0,0.0,0.0,0.0,0.0,0.0,[nan],[nan],,
72138,32178296,10.3390/s20061579,Automatic Detection of Arrhythmia Based on Mul...,Automatic detection of arrhythmia is of great ...,2020-03-12,2020-03-18,Journal Article,eng,"Sensors (Basel, Switzerland)",Sensors (Basel),...,1.0,0.0,0.0,0.0,0.0,0.0,"['China', 'China', 'China', 'China', 'China']",['China'],China,China
26186,34354361,10.2147/JMDH.S322431,Early Prediction of COVID-19 Ventilation Requi...,"Coronavirus disease 2019 (COVID-19), caused by...",2021-07-30,2021-08-07,Journal Article,eng,Journal of multidisciplinary healthcare,J Multidiscip Healthc,...,0.0,0.0,0.0,0.0,0.0,0.0,"['Saudi Arabia', 'Saudi Arabia', 'Saudi Arabia...",['Saudi Arabia'],Saudi Arabia,Saudi Arabia
57095,33001400,10.1007/s10877-020-00598-5,A mathematical model for predicting intracrani...,To develop and validate a mathematical model f...,2020-10-01,2020-10-02,Journal Article,eng,Journal of clinical monitoring and computing,J Clin Monit Comput,...,0.0,0.0,0.0,0.0,0.0,0.0,"['China', 'China', 'China', 'China', 'United S...","['United Kingdom', 'United States', 'China']",China,United Kingdom


In [8]:
#articles.to_csv('data/final_raw.csv')

In [11]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 1 to 192947
Data columns (total 80 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   pmid                    42307 non-null  int64  
 1   doi                     37409 non-null  object 
 2   title                   42306 non-null  object 
 3   abstract                42307 non-null  object 
 4   article_date            32669 non-null  object 
 5   pubmed_date             42307 non-null  object 
 6   article_type            42307 non-null  object 
 7   lang                    42307 non-null  object 
 8   journal                 42307 non-null  object 
 9   journal_short           42307 non-null  object 
 10  journal_country         42307 non-null  object 
 11  authors                 41281 non-null  object 
 12  author_affils           31156 non-null  object 
 13  keywords                24364 non-null  object 
 14  mesh_terms              32535 non-nul

## CLEAN

In [12]:
articles['affil_fill_country'] = articles['affil_first_country']

In [13]:
## using first affiliation as primary country
## fill first with data from last author, then with pubmed country metadata
articles['affil_fill_country'] = articles['affil_fill_country'].fillna(articles['affil_last_country'])
articles['affil_fill_country'] = articles['affil_fill_country'].fillna(articles['journal_country'])

articles['affil_fill_country'] = articles['affil_fill_country'].astype('string')

In [14]:
##clean countries
articles["affil_fill_country"].replace({"England": "United Kingdom", 
                                             "Wales": "United Kingdom", 
                                             "Scotland": "United Kingdom", 
                                             "China (Republic : 1949- )" : "Taiwan"}, inplace=True)

articles['affil_fill_country'].value_counts()

United States          14664
China                   7107
United Kingdom          2903
South Korea             1651
Germany                 1567
                       ...  
Trinidad and Tobago        1
Belarus                    1
Cameroon                   1
Kazakhstan                 1
Palestine                  1
Name: affil_fill_country, Length: 97, dtype: int64

In [15]:
articles['affil_first_country'] = articles['affil_first_country'].fillna(articles['affil_last_country'])

articles["affil_first_country"].replace({"England": "United Kingdom", 
                                             "Wales": "United Kingdom", 
                                             "Scotland": "United Kingdom", 
                                             "China (Republic : 1949- )" : "Taiwan"}, inplace=True)

In [16]:

articles['affil_first_country'].value_counts()

United States          6983
China                  6911
South Korea            1651
United Kingdom         1262
India                  1207
                       ... 
Belarus                   1
Trinidad and Tobago       1
Kazakhstan                1
Cameroon                  1
Palestine                 1
Name: affil_first_country, Length: 93, dtype: int64

In [17]:
##lowercasing list of ANY author
articles['countries_lc'] = articles['affil_countries_unique'].str.lower().fillna(np.nan)

#articles['countries_lc'] = articles['countries_lc'].fillna(articles['affil_fill_country']).str.lower().astype('string')

In [18]:
articles['countries_lc'].replace(np.nan, '', inplace=True)

In [29]:
lmic_list = ["afghanistan", "burundi", "burkina faso", "central african republic", "congo", "eritrea", 
             "ethiopia", "guinea", "gambia", "guinea-bissau", "liberia", "madagascar", "mali", "mozambique", "malawi", 
             "niger", "north korea", "democratic republic of korea", "rwanda", "sudan", "sierra leone", "somalia", "south sudan", "syrian arab republic", 
             "chad", "togo", "uganda", "yemen", "angola", "benin", "bangladesh", "belize", "bolivia", "bhutan", 
             "cote d'ivoire", "ivory coast", "cameroon", "congo", "comoros", "cabo verde", "djibouti", "algeria", "egypt", 
             "micronesia", "ghana", "honduras", "haiti", "indonesia", "india", "iran", "kenya", 
             "kyrgyz republic", "cambodia", "kiribati", "lao", "sri lanka", "lesotho", "morocco", "myanmar", "mongolia", 
             "mauritania", "nigeria", "nicaragua", "nepal", "pakistan", "philippines", "papua new guinea", 
             "west bank and gaza", "palestinbe", "senegal", "solomon islands", "el salvador", "sao tome", "eswatini", 
             "tajikistan", "timor-leste", "tunisia", "tanzania", "ukraine", "uzbekistan", "vietnam", "vanuatu", "samoa", 
             "zambia", "zimbabwe", "albania", "argentina", "armenia", "american samoa", "azerbaijan", "bulgaria", 
             "bosnia", "belarus", "brazil", "botswana", "china", "colombia", "costa rica", "cuba", 
             "dominica", "dominican republic", "ecuador", "fiji", "gabon", "georgia", "equatorial guinea", "grenada", 
             "guatemala", "guyana", "iraq", "jamaica", "jordan", "kazakhstan", "lebanon", "libya", "lucia", "moldova", 
             "maldives", "mexico", "marshall islands", "north macedonia", "montenegro", "mauritius", "malaysia", "namibia", 
             "panama", "peru", "paraguay", "romania", "russian federation", "russia", "serbia", "suriname", "thailand", "turkmenistan", 
             "tonga", "turkey", "tuvalu", "st. vincent", "grenadines", "kosovo", "south africa", "venezuela"]

In [30]:
lmic_lower_list = ["afghanistan", "burundi", "burkina faso", "central african republic", "congo", "eritrea", 
             "ethiopia", "guinea", "gambia", "guinea-bissau", "liberia", "madagascar", "mali", "mozambique", "malawi", 
             "niger", "north korea", "democratic republic of korea", "rwanda", "sudan", 'iran', "sierra leone", "somalia", "south sudan", "syrian arab republic", 
             "chad", "togo", "uganda", "yemen", "angola", "benin", "bangladesh", "belize", "bolivia", "bhutan", 
             "cote d'ivoire", "ivory coast", "cameroon", "congo", "comoros", "cabo verde", "djibouti", "algeria", "egypt", 
             "micronesia", "ghana", "honduras", "haiti", "indonesia", "india", "kenya", 
             "kyrgyz republic", "cambodia", "kiribati", "lao", "sri lanka", "lesotho", "morocco", "myanmar", "mongolia", 
             "mauritania", "nigeria", "nicaragua", "nepal", "pakistan", "philippines", "papua new guinea", 
             "west bank and gaza", "palestinbe", "senegal", "solomon islands", "el salvador", "sao tome", "eswatini", 
             "tajikistan", "timor-leste", "tunisia", "tanzania", "ukraine", "uzbekistan", "vietnam", "vanuatu", "samoa", 
             "zambia", "zimbabwe"]

In [31]:
##flags for ANY author

#initiate
articles['lmic_author_flag'] = np.where(articles['countries_lc'].str.contains('iran'), "1", "0")
articles['lmic_author_lower_flag'] = np.where(articles['countries_lc'].str.contains('iran'), "1", "0")
articles['lmic_china_flag'] = np.where(articles['countries_lc'].str.contains('china'), "1", "0")

In [32]:
#use lists
for x in lmic_list:
    articles['lmic_author_flag'] = np.where(articles['countries_lc'].str.contains(x), "1", articles['lmic_author_flag'])
    
for y in lmic_lower_list:
    articles['lmic_author_lower_flag'] = np.where(articles['countries_lc'].str.contains(x), "1", articles['lmic_author_lower_flag'])

In [33]:
articles['pubmed_date'] = pd.to_datetime(articles['pubmed_date'])

In [34]:
## new column for year, and year+month
articles['year'] = articles['pubmed_date'].dt.year

In [37]:
articles['lmic_author_lower_flag'].value_counts()

0    41739
1      568
Name: lmic_author_lower_flag, dtype: int64

In [38]:
articles.to_csv('data/final_raw.csv')

In [39]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42307 entries, 1 to 192947
Data columns (total 85 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   pmid                    42307 non-null  int64         
 1   doi                     37409 non-null  object        
 2   title                   42306 non-null  object        
 3   abstract                42307 non-null  object        
 4   article_date            32669 non-null  object        
 5   pubmed_date             42307 non-null  datetime64[ns]
 6   article_type            42307 non-null  object        
 7   lang                    42307 non-null  object        
 8   journal                 42307 non-null  object        
 9   journal_short           42307 non-null  object        
 10  journal_country         42307 non-null  object        
 11  authors                 41281 non-null  object        
 12  author_affils           31156 non-null  objec

In [40]:
imported = pd.read_csv('data/included_abstracts_21032022.csv', index_col=0)
imported.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38912 entries, 0 to 38911
Data columns (total 84 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   pmid                    38912 non-null  int64  
 1   doi                     34248 non-null  object 
 2   title                   38911 non-null  object 
 3   abstract                38912 non-null  object 
 4   article_date            30193 non-null  object 
 5   pubmed_date             38912 non-null  object 
 6   article_type            38912 non-null  object 
 7   lang                    38912 non-null  object 
 8   journal                 38912 non-null  object 
 9   journal_short           38912 non-null  object 
 10  journal_country         38912 non-null  object 
 11  authors                 38016 non-null  object 
 12  author_affils           28554 non-null  object 
 13  keywords                22292 non-null  object 
 14  mesh_terms              25649 non-null