# descriptive analysis of the dataset

## defines and imports

In [1]:
  !pip install tableone

Collecting tableone
  Downloading tableone-0.8.0-py3-none-any.whl (33 kB)
Installing collected packages: tableone
Successfully installed tableone-0.8.0


In [2]:
  !pip install pycountry

Collecting pycountry
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m47.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: pycountry
  Building wheel for pycountry (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681832 sha256=2798d3b1bfc4f54e83f9770d7fa2a1a2f86bcef474f3c9330e82319f2bfca851
  Stored in directory: /root/.cache/pip/wheels/03/57/cc/290c5252ec97a6d78d36479a3c5e5ecc76318afcb241ad9dbe
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-22.3.5


In [3]:
import pandas as pd
import numpy as np
import tableone
from tqdm import tqdm
import matplotlib.pyplot as plt

In [4]:
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

  and should_run_async(code)


In [5]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Change the current working directory to the folder "scientometrics" in your Google Drive
import os
os.chdir('/content/drive/My Drive/scientometrics')

# Verify that you've changed the directory correctly
print("Current Working Directory:", os.getcwd())

Mounted at /content/drive
Current Working Directory: /content/drive/My Drive/scientometrics


## load data

In [6]:
df = pd.read_csv('clean/max_30_authors.csv')
df['gender'].replace({'female': 1, 'male': 0}, inplace=True)
df = df[df.journal != "PLOS Medicine"]


In [7]:
df.head()

Unnamed: 0,pub_id,researcher_id,journal,year,gender,aff_country_code,aff_id,LMIC
0,pub.1000001707,ur.01006172666.33,JAMA,2015,1.0,US,grid.411024.2,0.0
1,pub.1000001707,ur.01012736025.78,JAMA,2015,0.0,US,,0.0
2,pub.1000001707,ur.010132635727.81,JAMA,2015,0.0,US,grid.266102.1,0.0
3,pub.1000001707,ur.01023477132.25,JAMA,2015,1.0,US,grid.62560.37,0.0
4,pub.1000001707,ur.01043100547.54,JAMA,2015,1.0,US,grid.267308.8,0.0


In [8]:
bins = [2006, 2010, 2014, 2018, 2023]

# Define the labels for the bins
labels = ['2007-2010', '2011-2014', '2015-2018', '2019-2022']

# Create a new column 'year_group' with the 4-year bins
df['year_group'] = pd.cut(df['year'], bins=bins, labels=labels, right=False)

## pooled
same authors may show more than once

In [9]:
df

Unnamed: 0,pub_id,researcher_id,journal,year,gender,aff_country_code,aff_id,LMIC,year_group
0,pub.1000001707,ur.01006172666.33,JAMA,2015,1.0,US,grid.411024.2,0.0,2015-2018
1,pub.1000001707,ur.01012736025.78,JAMA,2015,0.0,US,,0.0,2015-2018
2,pub.1000001707,ur.010132635727.81,JAMA,2015,0.0,US,grid.266102.1,0.0,2015-2018
3,pub.1000001707,ur.01023477132.25,JAMA,2015,1.0,US,grid.62560.37,0.0,2015-2018
4,pub.1000001707,ur.01043100547.54,JAMA,2015,1.0,US,grid.267308.8,0.0,2015-2018
...,...,...,...,...,...,...,...,...,...
282367,pub.1154120876,ur.01266011275.23,New England Journal of Medicine,2022,0.0,ES,grid.411066.4,0.0,2019-2022
282368,pub.1154141323,ur.01014111373.24,The BMJ,2022,0.0,GB,grid.431398.4,0.0,2019-2022
282369,pub.1154141323,ur.012230464276.43,The BMJ,2022,,GB,grid.431398.4,0.0,2019-2022
282370,pub.1154141323,ur.014110205705.32,The BMJ,2022,0.0,GB,grid.431398.4,0.0,2019-2022


In [10]:
cols = ['gender', 'LMIC','journal', 'year_group']
limit = {'gender': 2, 'LMIC': 2}
order = {'gender': [1,0], 'LMIC': [1,0]}

table0 = tableone.TableOne(df, columns = cols, categorical = cols, limit=limit, order=order)


In [11]:
table0

Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,282372
"gender, n (%)",0.0,21724.0,174397 (66.9)
"gender, n (%)",1.0,,86251 (33.1)
"LMIC, n (%)",0.0,2812.0,252725 (90.4)
"LMIC, n (%)",1.0,,26835 (9.6)
"journal, n (%)",JAMA,0.0,52280 (18.5)
"journal, n (%)",Nature Medicine,,41384 (14.7)
"journal, n (%)",New England Journal of Medicine,,45933 (16.3)
"journal, n (%)",The BMJ,,50866 (18.0)
"journal, n (%)",The Lancet,,91909 (32.5)


## by paper
grouping by paper

### at least one

In [12]:
grouped_gender = df.groupby('pub_id')['gender'].apply(lambda x: (x == 1).any())
grouped_LMIC = df.groupby('pub_id')['LMIC'].apply(lambda x: (x == 1).any())

# Count the percentage of papers with at least one female author
percent_female = (grouped_gender.sum() / len(grouped_gender)) * 100

# Count the percentage of papers with at least one LMIC author
percent_LMIC = (grouped_LMIC.sum() / len(grouped_LMIC)) * 100

print(f"The percentage of papers with at least one female author is {percent_female:.2f}%")
print(f"The percentage of papers with at least one LMIC author is {percent_LMIC:.2f}%")

The percentage of papers with at least one female author is 66.57%
The percentage of papers with at least one LMIC author is 15.34%


### team composition -- lmic

In [13]:
# Step 1 & 2: Group by 'pub_id' and aggregate
agg_df = df.groupby('pub_id').agg(
    total_authors=pd.NamedAgg(column='researcher_id', aggfunc='count'),
    lmic_authors=pd.NamedAgg(column='LMIC', aggfunc='sum')
).reset_index()

# Step 3: Classify papers
agg_df['classification'] = 'HIC Only'
agg_df.loc[agg_df['lmic_authors'] == agg_df['total_authors'], 'classification'] = 'LMIC Only'
agg_df.loc[(agg_df['lmic_authors'] > 0) & (agg_df['lmic_authors'] < agg_df['total_authors']), 'classification'] = 'Mixed'

# Step 4: Calculate Percentages
total_papers = len(agg_df)
lmic_only_papers = len(agg_df[agg_df['classification'] == 'LMIC Only'])
mixed_papers = len(agg_df[agg_df['classification'] == 'Mixed'])

lmic_only_percentage = (lmic_only_papers / total_papers) * 100
mixed_percentage = (mixed_papers / total_papers) * 100

print(f"Only {lmic_only_percentage:.2f}% of papers had their entire author team from LMICs.")
print(f"Only {mixed_percentage:.2f}% of papers had a mix of authors from LMICs and HICs.")


Only 4.48% of papers had their entire author team from LMICs.
Only 10.86% of papers had a mix of authors from LMICs and HICs.


### team composition -- gender

In [14]:

# Step 1 & 2: Group by 'pub_id' and aggregate
agg_df = df.groupby('pub_id').agg(
    total_authors=pd.NamedAgg(column='researcher_id', aggfunc='count'),
    women_authors=pd.NamedAgg(column='gender', aggfunc='sum')
).reset_index()

# Step 3: Classify papers
agg_df['classification'] = 'Men Only'
agg_df.loc[agg_df['women_authors'] == agg_df['total_authors'], 'classification'] = 'Women Only'
agg_df.loc[(agg_df['women_authors'] > 0) & (agg_df['women_authors'] < agg_df['total_authors']), 'classification'] = 'Mixed'

# Step 4: Calculate Percentages
total_papers = len(agg_df)
women_only_papers = len(agg_df[agg_df['classification'] == 'Women Only'])
mixed_papers = len(agg_df[agg_df['classification'] == 'Mixed'])

women_only_percentage = (women_only_papers / total_papers) * 100
mixed_percentage = (mixed_papers / total_papers) * 100

print(f"Only {women_only_percentage:.2f}% of papers had their entire author team composed of women.")
print(f"Only {mixed_percentage:.2f}% of papers had a mix of both men and women.")


Only 4.11% of papers had their entire author team composed of women.
Only 62.47% of papers had a mix of both men and women.


In [15]:
grouped_df = df.groupby('pub_id').first()

cols = ['journal', 'year_group']

table1 = tableone.TableOne(grouped_df, columns = cols, categorical = cols)
table1

Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,49764
"journal, n (%)",JAMA,0.0,9886 (19.9)
"journal, n (%)",Nature Medicine,,3694 (7.4)
"journal, n (%)",New England Journal of Medicine,,8303 (16.7)
"journal, n (%)",The BMJ,,11481 (23.1)
"journal, n (%)",The Lancet,,16400 (33.0)
"year_group, n (%)",2007-2010,0.0,6590 (13.2)
"year_group, n (%)",2011-2014,,10673 (21.4)
"year_group, n (%)",2015-2018,,13959 (28.1)
"year_group, n (%)",2019-2022,,18542 (37.3)


## by author
grouping by author

In [16]:
def get_df_authors(sce, m):
  df = pd.read_csv(f'clean/{sce}max_{m}_authors.csv')

  # groupby paper_id and count the number of authors
  df['num_authors'] = df.groupby('pub_id')['researcher_id'].transform('count')
  # create a column for the author position
  df['author_position'] = df.groupby('pub_id')['researcher_id'].transform(lambda x: x.rank(method='first'))
  # create a column for the author is first author
  df['is_first_author'] = df['author_position'] == 1
  # create a column for the author is last author
  df['is_last_author'] = df['author_position'] == df['num_authors']

  df_authors = df.groupby('researcher_id').agg({
      'pub_id': 'count',
      'is_first_author': 'sum',
      'is_last_author': 'sum',
      'journal': 'nunique',
      'gender': lambda x: x.mode()[0] if len(x.mode()) > 0 else np.nan, # the mode
      'LMIC': 'max',
      'aff_country_code': lambda x: x.mode()[0] if len(x.mode()) > 0 else np.nan,
  }).reset_index()

  df_authors.columns = ['researcher_id', 'num_pubs', 'num_first_author', 'num_last_author', 'num_journals',
                        'gender', 'LMIC', 'aff_country_code']

  # encode num_first_author and num_last_author as binary, 1 = 1, >1 if 2,3,4,5,....
  df_authors['first_author_more_than_once'] = df_authors['num_first_author'].apply(lambda x: 1 if x > 1 else 0)
  df_authors['last_author_more_than_once'] = df_authors['num_last_author'].apply(lambda x: 1 if x > 1 else 0)
  df_authors['more_than_one_journal'] = df_authors['num_journals'].apply(lambda x: 1 if x > 1 else 0)
  df_authors['more_than_one_pub'] = df_authors['num_pubs'].apply(lambda x: 1 if x > 1 else 0)
  df_authors['first_author_once'] = df_authors['num_first_author'].apply(lambda x: 1 if x == 1 else 0)
  df_authors['last_author_once'] = df_authors['num_last_author'].apply(lambda x: 1 if x == 1 else 0)

  return df_authors

### overall

In [17]:
tbls = {}

for s in tqdm(["", "gender_opti/", "gender_pessi/", "lmic_opti/", "lmic_pessi/"]):

  tbls[s] = {}

  for m in [30]:
    tbls[s][m] = get_df_authors(s, m)

100%|██████████| 5/5 [11:08<00:00, 133.73s/it]


#### normal

In [18]:
cols = ['gender', 'LMIC', 'first_author_once', 'last_author_once', 'first_author_more_than_once', 'last_author_more_than_once', 'more_than_one_journal', 'more_than_one_pub', 'aff_country_code']
limit = {'first_author_once': 1, 'last_author_once': 1, 'first_author_more_than_once': 1, 'last_author_more_than_once': 1, 'more_than_one_journal': 1, 'more_than_one_pub': 1, 'aff_country_code': 10}
order = {'first_author_once': [1,0], 'last_author_once': [1,0],'first_author_more_than_once': [1,0], 'last_author_more_than_once': [1,0], 'more_than_one_journal': [1,0], 'more_than_one_pub': [1,0]}

tableone.TableOne(tbls[""][30], columns = cols, categorical = cols, limit=limit, order=order)


Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,151536
"gender, n (%)",female,14419.0,51150 (37.3)
"gender, n (%)",male,,85967 (62.7)
"LMIC, n (%)",0.0,2615.0,132599 (89.0)
"LMIC, n (%)",1.0,,16322 (11.0)
"first_author_once, n (%)",1,0.0,23194 (15.3)
"last_author_once, n (%)",1,0.0,23435 (15.5)
"first_author_more_than_once, n (%)",1,0.0,7853 (5.2)
"last_author_more_than_once, n (%)",1,0.0,7742 (5.1)
"more_than_one_journal, n (%)",1,0.0,24909 (16.4)


In [19]:
# # Count the intersection of 1s in both the 'gender' and 'LMIC' columns
# count_intersection = len(tbls[""][30][(tbls[""][30]['gender'] == 1) & (tbls[""][30]['LMIC'] == 1)])

# print(f"1s in both 'gender' and 'LMIC': {count_intersection/len(tbls[""][30])*100:.2f}%")

### by lmic

#### normal

In [20]:
# create tableone object
cols = ['first_author_once', 'last_author_once','first_author_more_than_once', 'last_author_more_than_once', 'more_than_one_journal', 'more_than_one_pub']
limit = {'first_author_once': 1, 'last_author_once': 1,'first_author_more_than_once': 1, 'last_author_more_than_once': 1, 'more_than_one_journal': 1, 'more_than_one_pub': 1}
order = {'first_author_once': [1,0], 'last_author_once':[1,0],'first_author_more_than_once': [1,0], 'last_author_more_than_once': [1,0], 'more_than_one_journal': [1,0], 'more_than_one_pub': [1,0]}

table3 = tableone.TableOne(tbls[""][30], columns = cols, categorical = cols, limit=limit, order=order,
                           groupby=['LMIC'], pval=True, overall=False, missing=False, smd=False, htest_name=True)

table3

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by LMIC,Grouped by LMIC,Grouped by LMIC,Grouped by LMIC
Unnamed: 0_level_1,Unnamed: 1_level_1,0.0,1.0,P-Value,Test
n,,132599,16322,,
"first_author_once, n (%)",1.0,20336 (15.3),2419 (14.8),0.086,Chi-squared
"last_author_once, n (%)",1.0,20477 (15.4),2499 (15.3),0.667,Chi-squared
"first_author_more_than_once, n (%)",1.0,7220 (5.4),616 (3.8),<0.001,Chi-squared
"last_author_more_than_once, n (%)",1.0,7112 (5.4),604 (3.7),<0.001,Chi-squared
"more_than_one_journal, n (%)",1.0,22975 (17.3),1921 (11.8),<0.001,Chi-squared
"more_than_one_pub, n (%)",1.0,42528 (32.1),4374 (26.8),<0.001,Chi-squared


#### optimistic

In [21]:
# create tableone object
cols = ['first_author_once', 'last_author_once','first_author_more_than_once', 'last_author_more_than_once', 'more_than_one_journal', 'more_than_one_pub']
limit = {'first_author_once': 1, 'last_author_once': 1,'first_author_more_than_once': 1, 'last_author_more_than_once': 1, 'more_than_one_journal': 1, 'more_than_one_pub': 1}
order = {'first_author_once': [1,0], 'last_author_once':[1,0],'first_author_more_than_once': [1,0], 'last_author_more_than_once': [1,0], 'more_than_one_journal': [1,0], 'more_than_one_pub': [1,0]}

table3 = tableone.TableOne(tbls["lmic_opti/"][30], columns = cols, categorical = cols, limit=limit, order=order,
                           groupby=['LMIC'], pval=True, overall=False, missing=False, smd=False)

table3

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by LMIC,Grouped by LMIC,Grouped by LMIC
Unnamed: 0_level_1,Unnamed: 1_level_1,0.0,1.0,P-Value
n,,140674,21458,
"first_author_once, n (%)",1.0,21142 (15.0),3074 (14.3),0.007
"last_author_once, n (%)",1.0,21245 (15.1),3167 (14.8),0.194
"first_author_more_than_once, n (%)",1.0,7605 (5.4),734 (3.4),<0.001
"last_author_more_than_once, n (%)",1.0,7513 (5.3),713 (3.3),<0.001
"more_than_one_journal, n (%)",1.0,26690 (19.0),2806 (13.1),<0.001
"more_than_one_pub, n (%)",1.0,45638 (32.4),5391 (25.1),<0.001


#### pessimistic

In [22]:
# create tableone object
cols = ['first_author_once', 'last_author_once','first_author_more_than_once', 'last_author_more_than_once', 'more_than_one_journal', 'more_than_one_pub']
limit = {'first_author_once': 1, 'last_author_once': 1,'first_author_more_than_once': 1, 'last_author_more_than_once': 1, 'more_than_one_journal': 1, 'more_than_one_pub': 1}
order = {'first_author_once': [1,0], 'last_author_once':[1,0],'first_author_more_than_once': [1,0], 'last_author_more_than_once': [1,0], 'more_than_one_journal': [1,0], 'more_than_one_pub': [1,0]}

table3 = tableone.TableOne(tbls["lmic_pessi/"][30], columns = cols, categorical = cols, limit=limit, order=order,
                           groupby=['LMIC'], pval=True, overall=False, missing=False, smd=False)

table3

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by LMIC,Grouped by LMIC,Grouped by LMIC
Unnamed: 0_level_1,Unnamed: 1_level_1,0.0,1.0,P-Value
n,,143298,18834,
"first_author_once, n (%)",1.0,21583 (15.1),2633 (14.0),<0.001
"last_author_once, n (%)",1.0,21704 (15.1),2708 (14.4),0.006
"first_author_more_than_once, n (%)",1.0,7622 (5.3),717 (3.8),<0.001
"last_author_more_than_once, n (%)",1.0,7540 (5.3),686 (3.6),<0.001
"more_than_one_journal, n (%)",1.0,26704 (18.6),2792 (14.8),<0.001
"more_than_one_pub, n (%)",1.0,45785 (32.0),5244 (27.8),<0.001


### by gender

#### normal

In [23]:
# create tableone object
cols = ['first_author_once', 'last_author_once','first_author_more_than_once', 'last_author_more_than_once', 'more_than_one_journal', 'more_than_one_pub']
limit = {'first_author_once': 1, 'last_author_once': 1,'first_author_more_than_once': 1, 'last_author_more_than_once': 1, 'more_than_one_journal': 1, 'more_than_one_pub': 1}
order = {'first_author_once': [1,0], 'last_author_once':[1,0],'first_author_more_than_once': [1,0], 'last_author_more_than_once': [1,0], 'more_than_one_journal': [1,0], 'more_than_one_pub': [1,0]}

table4 = tableone.TableOne(tbls[""][30], columns = cols, categorical = cols, limit=limit, order=order,
                           groupby=['gender'], pval=True, overall=False, missing=False, smd=False)

table4


Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by gender,Grouped by gender,Grouped by gender
Unnamed: 0_level_1,Unnamed: 1_level_1,female,male,P-Value
n,,51150,85967,
"first_author_once, n (%)",1.0,7443 (14.6),13617 (15.8),<0.001
"last_author_once, n (%)",1.0,7527 (14.7),13829 (16.1),<0.001
"first_author_more_than_once, n (%)",1.0,2213 (4.3),5164 (6.0),<0.001
"last_author_more_than_once, n (%)",1.0,2187 (4.3),5071 (5.9),<0.001
"more_than_one_journal, n (%)",1.0,6930 (13.5),16512 (19.2),<0.001
"more_than_one_pub, n (%)",1.0,14383 (28.1),29189 (34.0),<0.001


#### optimistic

In [24]:
# create tableone object
cols = ['first_author_once', 'last_author_once','first_author_more_than_once', 'last_author_more_than_once', 'more_than_one_journal', 'more_than_one_pub']
limit = {'first_author_once': 1, 'last_author_once': 1,'first_author_more_than_once': 1, 'last_author_more_than_once': 1, 'more_than_one_journal': 1, 'more_than_one_pub': 1}
order = {'first_author_once': [1,0], 'last_author_once':[1,0],'first_author_more_than_once': [1,0], 'last_author_more_than_once': [1,0], 'more_than_one_journal': [1,0], 'more_than_one_pub': [1,0]}

table4 = tableone.TableOne(tbls["gender_opti/"][30], columns = cols, categorical = cols, limit=limit, order=order,
                           groupby=['gender'], pval=True, overall=False, missing=False, smd=False)

table4


Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by gender,Grouped by gender,Grouped by gender
Unnamed: 0_level_1,Unnamed: 1_level_1,female,male,P-Value
n,,71212,90920,
"first_author_once, n (%)",1.0,10129 (14.2),14087 (15.5),<0.001
"last_author_once, n (%)",1.0,10155 (14.3),14257 (15.7),<0.001
"first_author_more_than_once, n (%)",1.0,2921 (4.1),5418 (6.0),<0.001
"last_author_more_than_once, n (%)",1.0,2893 (4.1),5333 (5.9),<0.001
"more_than_one_journal, n (%)",1.0,10532 (14.8),18964 (20.9),<0.001
"more_than_one_pub, n (%)",1.0,19816 (27.8),31213 (34.3),<0.001


#### pessimistic

In [25]:
# create tableone object
cols = ['first_author_once', 'last_author_once','first_author_more_than_once', 'last_author_more_than_once', 'more_than_one_journal', 'more_than_one_pub']
limit = {'first_author_once': 1, 'last_author_once': 1,'first_author_more_than_once': 1, 'last_author_more_than_once': 1, 'more_than_one_journal': 1, 'more_than_one_pub': 1}
order = {'first_author_once': [1,0], 'last_author_once':[1,0],'first_author_more_than_once': [1,0], 'last_author_more_than_once': [1,0], 'more_than_one_journal': [1,0], 'more_than_one_pub': [1,0]}

table4 = tableone.TableOne(tbls["gender_pessi/"][30], columns = cols, categorical = cols, limit=limit, order=order,
                           groupby=['gender'], pval=True, overall=False, missing=False, smd=False)

table4


Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by gender,Grouped by gender,Grouped by gender
Unnamed: 0_level_1,Unnamed: 1_level_1,female,male,P-Value
n,,55364,106768,
"first_author_once, n (%)",1.0,7850 (14.2),16366 (15.3),<0.001
"last_author_once, n (%)",1.0,7936 (14.3),16476 (15.4),<0.001
"first_author_more_than_once, n (%)",1.0,2405 (4.3),5934 (5.6),<0.001
"last_author_more_than_once, n (%)",1.0,2372 (4.3),5854 (5.5),<0.001
"more_than_one_journal, n (%)",1.0,8677 (15.7),20819 (19.5),<0.001
"more_than_one_pub, n (%)",1.0,15948 (28.8),35081 (32.9),<0.001


## by journal

In [26]:
df = pd.read_csv('clean/max_30_authors.csv')
df = df[df.journal != "PLOS Medicine"]

In [27]:
grouped_journal = df.groupby('journal').agg({
    'pub_id': 'count',
    'researcher_id': 'nunique',
}).reset_index()

total_unique_papers = df['pub_id'].nunique()

grouped_journal['perc'] = (grouped_journal['pub_id'] / grouped_journal['pub_id'].sum()) * 100

grouped_journal.reset_index(drop=True, inplace=True)

grouped_journal

Unnamed: 0,journal,pub_id,researcher_id,perc
0,JAMA,52280,32361,18.514584
1,Nature Medicine,41384,32509,14.655844
2,New England Journal of Medicine,45933,32268,16.266839
3,The BMJ,50866,30996,18.013826
4,The Lancet,91909,55903,32.548907


## by continent

In [28]:
import pycountry

# read country to continent mapping
df_country_continent = pd.read_csv('data_light/country_to_continent.csv')

# assuming df is your DataFrame and 'country' is the column with 2 letter country codes
df_country_continent['iso3'] = df_country_continent['country'].apply(lambda x: pycountry.countries.get(alpha_2=x).alpha_3)

df_country_continent.to_csv('data_light/country_to_continent.csv', index=False)

# merge country to continent mapping with df_authors
df_authors = tbls[""][30].merge(df_country_continent, left_on='aff_country_code', right_on='country', how='left')

In [29]:
  import matplotlib.pyplot as plt
  import matplotlib.ticker as ticker

  plt.figure(figsize=(8, 4))
  ax = continent_counts.plot(kind='bar')

  # Rotate the tick labels
  ax.set_xticklabels(ax.get_xticklabels(), ha="right")

  # Add percentage text labels
  total = len(df_authors)
  for p in ax.patches:
      percentage = '{:.1f}%'.format(100 * p.get_height() / total)
      ax.annotate(percentage, (p.get_x() + p.get_width() / 2., p.get_height()),
                  ha='center', va='bottom')

  # Add x-axis and y-axis labels
  plt.xlabel("Continent")
  plt.ylabel("No. of Authors (in thousands)")
  plt.title("Distribution of authors per continent")
  plt.ylim([0, 70000])

  # Add a custom y-axis label formatter
  ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, p: '{:.0f}k'.format(x/1000)))

  plt.show()


NameError: ignored

<Figure size 800x400 with 0 Axes>

## by country

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Count the number of authors by country and take the top 20
country_counts = df_authors['aff_country_code'].value_counts().head(20)

# Set up the figure and axis
plt.figure(figsize=(10, 5))
ax = country_counts.plot(kind='bar')

# Rotate the tick labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

# Add percentage text labels
total = len(df_authors)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height() / total)
    ax.annotate(percentage, (p.get_x() + p.get_width() / 2. + .1, p.get_height()),
                ha='center', va='bottom', fontsize=7)

# Add x-axis and y-axis labels
plt.xlabel("Country")
plt.ylabel("No. of Authors (in thousands)")

# Add a title
plt.title("Distribution of authors by country (top 20)")

# Add a custom y-axis label formatter
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, p: '{:.0f}k'.format(x/1000)))

plt.show()


## drop the map

In [None]:
import geopandas as gpd

# Load the geometries
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# Use ISO-ALPHA-2 code as the index (the 2-digit country code)
world.set_index('iso_a3', inplace=True)

author_counts = df_authors.groupby('iso3')['researcher_id'].nunique()
world = world.join(author_counts)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1, figsize=(15, 5))
world.plot(column='researcher_id', ax=ax, legend=True, cmap='YlOrRd')
ax.set_title('Number of authors by country\n', fontsize=15)
ax.set_axis_off()
plt.show()