In [1]:
import pandas as pd
import networkx as nx

In [11]:
df = pd.read_csv('../data_light/authors_journals.csv') \
       .drop(['Unnamed: 0','aff_name', 'aff_id','aff_city_id','author_name', 'aff_country_code'], axis=1) \
       .rename(columns={'journal.title': 'journal'}) 

# merge with pub_info to get the year
df = df.merge(pd.read_csv('../data_light/pubs_info.csv'), on='pub_id')

# merge with author_info to get the gender
df = df.merge(pd.read_csv('../data_light/authors_info.csv').drop(['current_organization_id'], axis=1), on='researcher_id')

# just a pub_id - research_id pair, no duplicated, groupby
df = df.groupby(['pub_id', 'researcher_id']).agg({'journal': 'first',
                                                  'year': 'first',
                                                  'gender': 'first',
                                                  'LMIC': 'max'}).reset_index()

# drop PLOS Medicine
df = df[df['journal'] != 'PLOS Medicine']

# exclude papers with only one author
df1 = df.groupby('pub_id').filter(lambda x: len(x) > 1) 
print(f"{len(df.researcher_id.unique())} authors (beginning)")
print(f"{len(df.researcher_id.unique()) - len(df1.researcher_id.unique())} authors dropped with only one paper")
print(f"{len(df1.researcher_id.unique())} authors")


# exclude papers with more than 100 authors
df2 = df1.groupby('pub_id').filter(lambda x: len(x) <= 100)
print(f"{len(df1.researcher_id.unique()) - len(df2.researcher_id.unique())} authors dropped with more than 100 papers")
print(f"{len(df2.researcher_id.unique())} authors")


# drop if gender is unknown
df3 = df2[df2.gender.isna() == False]
print(f"{len(df2.researcher_id.unique()) - len(df3.researcher_id.unique())} authors dropped with no gender info")
print(f"{len(df3.researcher_id.unique())} authors (final)")

# transform female into 1, male into 0
df3['gender'] = df3['gender'].apply(lambda x: 1 if x == "female" else 0)

df = df3.copy()

df.head()

172359 authors (beginning)
7722 authors dropped with only one paper
164637 authors
2765 authors dropped with more than 100 papers
161872 authors
15739 authors dropped with no gender info
146133 authors (final)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['gender'] = df3['gender'].apply(lambda x: 1 if x == "female" else 0)


Unnamed: 0,pub_id,researcher_id,journal,year,gender,LMIC
0,pub.1000001707,ur.01006172666.33,JAMA,2015,1,0
1,pub.1000001707,ur.01012736025.78,JAMA,2015,0,0
2,pub.1000001707,ur.010132635727.81,JAMA,2015,0,0
3,pub.1000001707,ur.01023477132.25,JAMA,2015,1,0
4,pub.1000001707,ur.01043100547.54,JAMA,2015,1,0


In [12]:
# count na 
df3.isna().sum()

pub_id           0
researcher_id    0
journal          0
year             0
gender           0
LMIC             0
dtype: int64

In [13]:
# count the number of authors in total
print(f"Total number of authors: {len(df.researcher_id.unique())}")
print(f"Total number of publications: {len(df.pub_id.unique())}")

Total number of authors: 146133
Total number of publications: 50149


In [14]:
# count the number of authors per publication, sort desc
af = df.groupby('pub_id') \
       .agg({'researcher_id': 'count'}) \
       .reset_index() \
       .sort_values('researcher_id', ascending=False)

print(af.head(10))


               pub_id  researcher_id
40077  pub.1126154306             94
43195  pub.1134737671             87
47138  pub.1145514438             87
43674  pub.1135839002             84
32216  pub.1103196468             83
49170  pub.1151220554             83
32172  pub.1103149630             82
43557  pub.1135447669             81
10853  pub.1022462306             78
46223  pub.1142611890             78


In [15]:
# print the number of publications per year
print(df.groupby('year').agg({'pub_id': 'count'}).sort_values('year'))


      pub_id
year        
2007   10054
2008   10323
2009   11583
2010   11281
2011   12222
2012   13902
2013   16219
2014   16065
2015   17586
2016   21990
2017   21369
2018   22228
2019   22862
2020   24860
2021   25074
2022   22396
2023     608


In [16]:
# print the number of publications per journal
print(df.groupby('journal').agg({'pub_id': 'count'}).sort_values('pub_id', ascending=False))

                                 pub_id
journal                                
The Lancet                        88940
JAMA                              53246
The BMJ                           49386
Nature Medicine                   45234
New England Journal of Medicine   43816


In [19]:
# print number of LMIC authors
print(df.gender.value_counts())
print(df.LMIC.value_counts(normalize=True))

0    187065
1     93557
Name: gender, dtype: int64
0    0.915392
1    0.084608
Name: LMIC, dtype: float64


In [18]:
# print number of female authors with value counts
print(df.gender.value_counts())
print(df.gender.value_counts(normalize=True))

0    187065
1     93557
Name: gender, dtype: int64
0    0.666608
1    0.333392
Name: gender, dtype: float64


In [20]:
df.to_csv('../data_light/data_clean.csv', index=False)

In [21]:
import tableone

In [22]:
df.head()

Unnamed: 0,pub_id,researcher_id,journal,year,gender,LMIC
0,pub.1000001707,ur.01006172666.33,JAMA,2015,1,0
1,pub.1000001707,ur.01012736025.78,JAMA,2015,0,0
2,pub.1000001707,ur.010132635727.81,JAMA,2015,0,0
3,pub.1000001707,ur.01023477132.25,JAMA,2015,1,0
4,pub.1000001707,ur.01043100547.54,JAMA,2015,1,0


In [None]:
# create tableone
