In [2]:
import pandas as pd
import networkx as nx

In [3]:
df = pd.read_csv('../data_light/authors_journals.csv') \
       .drop(['Unnamed: 0','aff_name', 'aff_id','aff_city_id','author_name', 'aff_country_code'], axis=1) \
       .rename(columns={'journal.title': 'journal'}) 

# merge with pub_info to get the year
df = df.merge(pd.read_csv('../data_light/pubs_info.csv'), on='pub_id')

# merge with author_info to get the gender
df = df.merge(pd.read_csv('../data_light/authors_info.csv').drop(['current_organization_id'], axis=1), on='researcher_id')

# just a pub_id - research_id pair, no duplicated, groupby
df = df.groupby(['pub_id', 'researcher_id']).agg({'journal': 'first',
                                                  'year': 'first',
                                                  'gender': 'first',
                                                  'LMIC': 'max'}).reset_index()

# drop PLOS Medicine
df = df[df['journal'] != 'PLOS Medicine']

# exclude papers with only one author
df = df.groupby('pub_id').filter(lambda x: len(x) > 1) 

# exclude papers with more than 100 authors
df = df.groupby('pub_id').filter(lambda x: len(x) <= 100)

df.head()

Unnamed: 0,pub_id,researcher_id,journal,year,gender,LMIC
0,pub.1000001707,ur.01006172666.33,JAMA,2015,female,0
1,pub.1000001707,ur.01012736025.78,JAMA,2015,male,0
2,pub.1000001707,ur.010132635727.81,JAMA,2015,male,0
3,pub.1000001707,ur.01023477132.25,JAMA,2015,female,0
4,pub.1000001707,ur.01043100547.54,JAMA,2015,female,0


In [4]:
# count the number of authors in total
print(f"Total number of authors: {len(df.researcher_id.unique())}")
print(f"Total number of publications: {len(df.pub_id.unique())}")

Total number of authors: 161872
Total number of publications: 50378


In [5]:
# count the number of authors per publication, sort desc
af = df.groupby('pub_id') \
       .agg({'researcher_id': 'count'}) \
       .reset_index() \
       .sort_values('researcher_id', ascending=False)

print(af.head(10))


               pub_id  researcher_id
47340  pub.1145514438             99
43377  pub.1134737671             95
40243  pub.1126154306             94
43740  pub.1135447669             91
33828  pub.1106829208             91
49389  pub.1151220554             90
32341  pub.1103196468             90
45848  pub.1141186137             88
43857  pub.1135839002             88
32647  pub.1103931047             86


In [6]:
# print the number of publications per year
print(df.groupby('year').agg({'pub_id': 'count'}).sort_values('year'))


      pub_id
year        
2007   10790
2008   11073
2009   12332
2010   12099
2011   13125
2012   14955
2013   17562
2014   17272
2015   19086
2016   23743
2017   23215
2018   24260
2019   25037
2020   27132
2021   27460
2022   24647
2023     671


In [7]:
# print the number of publications per journal
print(df.groupby('journal').agg({'pub_id': 'count'}).sort_values('pub_id', ascending=False))

                                 pub_id
journal                                
The Lancet                        97153
JAMA                              57373
The BMJ                           52718
Nature Medicine                   49566
New England Journal of Medicine   47649


In [8]:
# print number of LMIC authors
print(f"Number of LMIC authors: {len(df[df['LMIC'] == 1].researcher_id.unique())}")
# relative now
print(df.LMIC.value_counts(normalize=True))

Number of LMIC authors: 17467
0    0.90472
1    0.09528
Name: LMIC, dtype: float64


In [9]:
# print number of female authors with value counts
print(df.gender.value_counts())
print(df.gender.value_counts(normalize=True))

male      187065
female     93557
Name: gender, dtype: int64
male      0.666608
female    0.333392
Name: gender, dtype: float64


In [10]:
df.to_csv('../data_light/data_clean.csv', index=False)