# Imports

In [1]:
import joblib
import pandas as pd

# Final dataframe

In [2]:
df = pd.read_parquet("data/final_cleaning_dataset.parquet")

In [3]:
df.shape, df.orcid.nunique()

((16430063, 19), 7071222)

In [4]:
clean_role = df.role.apply(lambda x : x.strip().replace(".","").lower() if x and type(x) is str else None)
df = df.assign(clean_role=clean_role)

In [5]:
role_df = pd.read_parquet("data/roles.parquet")
stem_classifications = pd.read_parquet("data/stem_and_med_classifications.parquet")

### Finalize merge

In [6]:
df.shape

(16430063, 20)

In [7]:
df = df.merge(role_df[~role_df.clean_role.duplicated()],on="clean_role",how = "left")

In [8]:
df.shape

(16430063, 21)

In [9]:
stem_classifications.columns = ['clean_affiliation','n_affil_tot','stem_prob','med_clf']

In [10]:
df_fin = pd.merge(df, stem_classifications.drop_duplicates("clean_affiliation"), on ='clean_affiliation')
df.shape, df_fin.shape

((16430063, 21), (16430063, 24))

## Apply Med classifier to roles

In [11]:
md_pattern = r'''(?ix)                    # Case insensitive, verbose mode
    \b(?:
        m\.?d\.?                           # md, m.d., m.d, MD, M.D., etc.
        |medical\s+doctor                  # "medical doctor"
        |doctor\s+of\s+medicine           # "doctor of medicine"
        |physician                         # physician
        |m\.?d\.?,?\s+ph\.?d\.?           # MD PhD combinations
        |ph\.?d\.?,?\s+m\.?d\.?           # PhD MD combinations
        |dr\.?\s+\(m\.?d\.?\)             # Dr. (MD) format
        |m\.?b\.?b\.?s\.?                 # MBBS (international)
        |doctor\s+of\s+osteopathic\s+medicine
               # Spanish
        |médico
        |doctor(?:a)?\s+en\s+medicina
        |licenciado\s+en\s+medicina
        |medico                              # without accent
        
        # Portuguese
        |médico(?:a)?
        |medicina
        |medicina\s+(?:geral)?
        
        # French
        |médecin
        |docteur\s+en\s+médecine
        |dr\.?\s+en\s+médecine
        
        # German
        |arzt|ärztin
        |doktor\s+der\s+medizin
        |dr\.?\s+med\.?
        |facharzt|fachärztin
        
        # Italian
        |medico
        |dottore\s+in\s+medicina
        |dott\.?\s+med\.?
        
        # Russian (Cyrillic)
        |врач
        |доктор\s+медицин
        |медик
        
        # Chinese (Simplified & Traditional)
        |医生|醫生
        |医师|醫師
        |大夫
        
        # Japanese
        |医者|医師
        
        # Korean
        |의사
        
        # Arabic
        |طبيب
        |دكتور\s+طب
        
        # Hindi
        |चिकित्सक
        
        # Polish
        |lekarz
        |doktor\s+medycyny
        
        # Swedish/Norwegian/Danish
        |läkare|lege|læge
        
        # Turkish
        |hekim
        |tıp\s+doktoru
        
        # Greek
        |γιατρός
        |ιατρός
        
        # Hebrew
        |רופא
        
        # Indonesian/Malay
        |dokter
        
        # Vietnamese
        |bác\s+sĩ
        
        # Thai
        |แพทย์
        |หมอ
        
        # Czech
        |lékař
        
        # Hungarian
        |orvos
        
        # Romanian
        |medic
        |doctor\s+în\s+medicină
        
        # Finnish
        |lääkäri
        
        # Ukrainian
        |лікар
        
        # Bulgarian
        |лекар
    )\b
'''

df_fin['is_md_role'] = df_fin['role'].str.contains(md_pattern, na=False)


In [12]:
df_fin['med_clf'] = df_fin.med_clf | df_fin.is_md_role

In [None]:
#pd.set_option('display.max_columns', None)
df_fin.to_csv("data/full_affiliations_data.csv", index=False)

In [None]:
df_fin.sample(50000).to_csv("data/full_affiliations_data_sample_50k.csv",index=False)

In [13]:
pd.set_option('display.max_columns', None)

df_fin.head()

Unnamed: 0,orcid,name,type,org_name,role,country,department_name,start_year,end_year,affiliation,affiliation_type,clean_name,given,p(gf),to_field,source_lang_translate,from_field,translated_affiliation,clean_affiliation,clean_role,role_category,n_affil_tot,stem_prob,med_clf,is_md_role
0,0000-0003-4352-5571,Tijana Parezanović,employment,Alfa BK Univerzitet,Full Professor,RS,Faculty of Foreign Languages,2025.0,,faculty of foreign languages,department_name,Tijana Parezanović,Tijana Parezanović,0.989,,,,faculty of foreign languages,foreign languages,full professor,prof,11158,0.001328,False,False
1,0000-0003-4732-3571,"P.V.R.; Souza PVR; Souza, PVR Souza",employment,Instituto de Tecnologia em Fármacos,Laboratory of Natural Products for Public Health,BR,Natural Products,2011.0,,natural products,department_name,"P.V.R.; Souza PVR; Souza, PVR Souza","P.V.R.; Souza PVR; Souza, PVR Souza",0.581104,,,,natural products,natural products,laboratory of natural products for public health,,185,0.85822,False,False
2,0009-0006-9787-7571,Rui Zhang,employment,Northeastern University,,CN,School of Metallurgy,1997.0,,school of metallurgy,department_name,Rui Zhang,Rui Zhang,0.25,,,,school of metallurgy,metallurgy,,,1423,0.947013,False,False
3,0000-0002-2797-0571,Rongbo Shen,employment,Guangzhou National Laboratory,Vice investigator,CN,,2023.0,,guangzhou national laboratory,org_name,Rongbo Shen,Rongbo Shen,0.250126,,,,guangzhou national laboratory,guangzhou national laboratory,vice investigator,,15,0.882782,False,False
4,0000-0002-2797-0571,Rongbo Shen,employment,Tencent (China),Researcher,CN,Tencent AI Lab.,2020.0,2023.0,tencent ai lab.,department_name,Rongbo Shen,Rongbo Shen,0.250126,,,,tencent ai lab.,tencent ai lab.,researcher,research,1,0.636722,False,False


# Generate Country/Year files

In [2]:
af = pd.read_csv("data/full_affiliations_data.csv")[['p(gf)','country','stem_prob','start_year','end_year',
             'orcid','clean_role',"role_category","clean_affiliation",'med_clf']]

In [3]:
af.end_year.isnull().sum(), len(af), af.end_year.isnull().sum()/ len(af)

(np.int64(7151725), 16430063, np.float64(0.43528287140469274))

In [4]:
def get_role_based(role_cat, default=10):
    if role_cat == 'bachelors':
        return 4, 'BS'
    if role_cat == "masters/postgrad":
        return 2, 'MS'
    if role_cat == "postdoc":
        return 3, 'POSTDOC'
    if role_cat == "phd":
        return 5, 'PHD'
    if role_cat == 'prof' or role_cat=='head':
        return 10, 'PROF'
    return default, 'DEFAULT'
    

def get_by_oid(user_affil,out_dir,default_role):
    user_affil = user_affil.sort_values("start_year")
    total_affils = len(user_affil)
    #get current endyear list
    endyears = user_affil.end_year.values.tolist()
    startyears = user_affil.start_year.values.tolist()
    #keep track of how we fill
    fill_type = ['ENDYEAR']*total_affils
    
    for affil_iter in range(total_affils):
        # if we don't have an endyear
        if pd.isnull(user_affil.iloc[affil_iter].end_year):
            # get role-based end year (min len of affiliation)
            year_add, role_fill = get_role_based(user_affil.iloc[affil_iter].role_category,
                                                 default=default_role)
            end_year = startyears[affil_iter] + year_add
            fill_type[affil_iter] = role_fill
            if affil_iter < (total_affils-1) and startyears[affil_iter+1] - 1 >= end_year:
                # if there's a later startyear in the subsequent iteration, and it is > role-based, use it
                end_year = startyears[affil_iter+1] - 1
                fill_type[affil_iter] = "NEXT_START"
            endyears[affil_iter] = end_year
    user_affil['end_year'] = endyears
    
    out_fil = os.path.join(out_dir,str(user_affil.orcid.iloc[0])+".csv")
    all_rows = []
    for i,row in user_affil.iterrows():
        all_rows += [{"country":row['country'],
                      "oid":row['orcid'],
                      'stem' : row['stem_prob'],
                      'gender' : row['p(gf)'],
                      'is_med' : row['med_clf'],
                      "year":y
                } for y in range(int(row['start_year']),int(row['end_year']+1))
               ]
    pd.DataFrame(all_rows).to_csv(out_fil,index=False,header=False)
    return fill_type
        
    

In [None]:
import os
os.mkdir("orcid_res_10")
os.mkdir("orcid_res_5")

In [10]:
from multiprocessing import Pool
import numpy as np


# Split dataframe into chunks
n_cpus = 12
chunks = np.array_split(af, n_cpus)

def process_chunk(chunk):
    chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()

# Process chunks in parallel
with Pool(n_cpus) as pool:
    chunk_results = pool.map(process_chunk, chunks)


  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_10",default_role=10).reset_index()
  chunk.groupby("orcid").apply

In [11]:
def process_chunk(chunk):
    chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()

# Process chunks in parallel
with Pool(n_cpus) as pool:
    chunk_results = pool.map(process_chunk, chunks)


  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir="orcid_res_5",default_role=5).reset_index()
  chunk.groupby("orcid").apply(get_by_oid, out_dir

In [None]:
# To concatenate into a single file
#find . -maxdepth 1 -type f -name '*.csv' -print0 | xargs -0 cat > ../all_default10.csv