## Section 2.1 - Preprocessing and academic field assignments

Run this notebook after completing the procedure in "1_postdoc_career_trajectories.ipynb".

In [5]:
import pandas as pd
from collections import defaultdict
from tqdm import tqdm
import numpy as np
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

### Import data

In [6]:
# Load the gzip-compressed postdoc trajectory table and report its raw size.
dat = pd.read_csv('_dat_orcid_2022_postdoc_trajectories.csv.gz', compression='gzip')
print(dat.shape)

# Remove exact duplicate rows.
dat = dat.drop_duplicates()

# Cast the free-text columns to str (missing values become the string 'nan').
for text_col in ('department', 'title'):
    dat[text_col] = dat[text_col].astype(str)

# Size after de-duplication and the number of distinct ORCID iDs.
print(dat.shape, len(set(dat['orcid'])))

(445869, 15)
(445869, 15) 125364


In [7]:
# Sanity check: display the author's own trajectory. It is correct.
dat.loc[dat['orcid'] == '0000-0002-4863-4729']

Unnamed: 0,orcid,firstname,lastname,inst,country,department,title,start_year,start_month,end_year,end_month,doctor,postdoc,end_date,start_date
221652,0000-0002-4863-4729,Hyunuk,Kim,Pohang University of Science and Technology,KR,Industrial and Management Engineering,phd,2016.0,9.0,2019.0,8.0,True,False,2019-08-01,
221653,0000-0002-4863-4729,Hyunuk,Kim,Boston University,US,"Information Systems, Questrom School of Business",postdoctoral associate,2019.0,8.0,2021.0,6.0,False,True,,2019-08-01
221654,0000-0002-4863-4729,Hyunuk,Kim,Boston University,US,"Administrative Sciences, Metropolitan College",assistant professor,2021.0,7.0,,,False,False,,2021-07-01


### Focus on researchers whose first postdoc position started in or after 2011

In [8]:
# Keep only the rows flagged as postdoc positions.
postdocs = dat.loc[dat['postdoc']]

# First postdoc row per ORCID, in the order rows appear in `dat`.
# NOTE(review): this assumes rows are already in chronological order within
# each ORCID - confirm against the upstream notebook.
first_postdocs = postdocs.groupby('orcid').head(1)
unique_postdocs = first_postdocs['orcid'].unique().tolist()
print(len(unique_postdocs))

# Restrict to researchers whose first postdoc row starts in 2011 or later.
recent_postdocs = first_postdocs[first_postdocs['start_year'].ge(2011)]
unique_recent_postdocs = recent_postdocs['orcid'].unique().tolist()
print(len(unique_recent_postdocs))

125364
98692


### Dimensions API

In [None]:
import requests
import json

import os

# Read the private Dimensions API key from the DIMENSIONS_API_KEY environment
# variable so the secret is never typed into (or committed with) the notebook.
# If the variable is unset, KEY is the empty string, exactly like the original
# placeholder.
KEY = os.environ.get('DIMENSIONS_API_KEY', '') # your private API key

import dimcli
# Authenticate against the Dimensions endpoint with the key above.
dimcli.login(key=KEY,
             endpoint="https://app.dimensions.ai")

# DSL client used by the download cells below.
dsl = dimcli.Dsl()

### Collect publications

In [None]:
import glob, os
# Directory that holds one CSV per downloaded batch; create it on first use.
dimensions_dir = '/data02/orcid/dimensions_082123_recent'
if not os.path.exists(dimensions_dir):
    os.makedirs(dimensions_dir)

# Batch files already on disk, used to skip batches that were fetched earlier.
downloaded_files = glob.glob(f'{dimensions_dir}/*.csv')

In [None]:
import time

# Number of ORCID iDs sent to Dimensions in a single query.
size=400

# Fetch publications batch by batch, writing one CSV per batch so that an
# interrupted run can resume without repeating finished batches.
for start in range(0, len(unique_recent_postdocs), size):
    end = start+size
    fname = f"/data02/orcid/dimensions_082123_recent/{start}_{end}.csv"
    if fname in downloaded_files:
        # This batch is already on disk.
        continue

    query = f"""search publications where researchers.orcid_id in {json.dumps(unique_recent_postdocs[start:end])} return publications[category_for_2020+id+title+authors+times_cited+date+journal_title_raw+proceedings_title+concepts_scores]"""
    res = dsl.query_iterative(query, limit=1000)
    df = res.as_dataframe()
    df.to_csv(fname)

    # Register the new file so later cells that iterate over
    # `downloaded_files` also see batches fetched during this run; the list
    # was built before the loop and would otherwise miss them.
    downloaded_files.append(fname)

    # Pause between batches - presumably to stay within the API rate limit.
    time.sleep(15)

### Reshape the collected metadata into a CSV file keyed by postdoc ORCID

In [None]:
# This step takes about an hour.

import ast  # needed for ast.literal_eval below; not imported anywhere else in the visible cells
from collections import defaultdict

# Map each ORCID to the list of publication records on which it appears.
orcid_pubs = defaultdict(list)
for file in tqdm(downloaded_files):
    tmp = pd.read_csv(file, index_col=0)
    # The authors column was written to CSV as the string form of a list of
    # dicts; parse it back into Python objects.
    tmp.authors = tmp.authors.apply(ast.literal_eval)
    # Keep only authors with an ORCID, represented by their first listed ORCID.
    tmp.authors = tmp.authors.apply(lambda x: [t['orcid'][0] for t in x if t['orcid']])
    for _, row in tmp.iterrows():
        # `position` is the author's rank among the ORCID-bearing authors
        # only, because authors without an ORCID were dropped above.
        for position, a in enumerate(row.authors):
            orcid_pubs[a].append([a, row['id'], row['date'], row['title'], row['journal_title_raw'],
                                  row['category_for_2020'], row['times_cited'], position])

# Keep only ORCIDs from the postdoc list; a set makes each lookup O(1)
# instead of scanning the whole list for every author.
postdoc_orcids = set(unique_postdocs)
orcid_pubs = {a: recs for a, recs in orcid_pubs.items() if a in postdoc_orcids}

# One de-duplicated frame per ORCID (all values cast to str so that
# duplicates compare equal), then stacked into a single table.
complete_df = []
for x in tqdm(orcid_pubs):
    target_orcid_recs = pd.DataFrame(orcid_pubs[x])
    target_orcid_recs = target_orcid_recs.astype(str).drop_duplicates().reset_index(drop=True)
    complete_df.append(target_orcid_recs)

complete_df = pd.concat(complete_df)
complete_df = complete_df.reset_index(drop=True)

In [None]:
# Write the reshaped publication table to disk without the row index.
output_csv = '_dat_orcid_2022_recent_postdocs_dimensions_publications_082123.csv'
complete_df.to_csv(output_csv, index=False)