In [None]:
import tarfile, xmltodict, pickle
import pandas as pd
from itertools import chain
from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

### Extracting only required ORCID information for the paper

In [None]:
def get_rel_info(x):
    """
    Extract the name, ORCID iD, employments, and educations from one parsed
    ORCID public-data record.

    Parameters
    ----------
    x : dict
        A single record summary already parsed into nested dictionaries
        (e.g. the result of ``xmltodict.parse``).

    Returns
    -------
    tuple
        ``(name, orcid_id, employments, educations)``. ``name``,
        ``employments`` and ``educations`` are ``None`` when the respective
        element is absent from an otherwise well-formed record. When the
        record does not have the expected structure at all, the result is
        ``(None, None, None, None)``.
    """
    try:
        # Read from the argument, not from a notebook-level global, so the
        # function works no matter what the caller names its variable.
        record = x['record:record']
        activities = record['activities:activities-summary']

        name = record['person:person'].get('person:name', None)
        orcid_id = record['common:orcid-identifier']['common:path']
        employments = activities['activities:employments'].get('activities:affiliation-group', None)
        educations = activities['activities:educations'].get('activities:affiliation-group', None)
        return (name, orcid_id, employments, educations)
    except (KeyError, TypeError, AttributeError):
        # KeyError: an expected element is missing.
        # TypeError / AttributeError: an element is present but is not a
        # mapping (for example None), so it cannot be indexed or `.get`-ed.
        return (None, None, None, None)

ORCID's public data can be found at https://support.orcid.org/hc/en-us/articles/360006897394-How-do-I-get-the-public-data-file-

In [None]:
%%time 
# ~7.5 hours on the author's machine; execution time will vary by system.

# Stream every member of the ORCID summaries archive, parse its XML, and keep
# only the fields returned by get_rel_info.
rec = []
idx = 0

# The `with` block guarantees the archive handle is released even if parsing
# one of the members raises.
with tarfile.open("/data02/orcid/ORCID_2022_10_summaries.tar.gz") as tar:
    for member in tar:
        f = tar.extractfile(member)
        if f is None:
            # extractfile returns None for members without file content
            # (e.g. directories); there is nothing to parse.
            continue
        with f:
            content = f.read().decode()
        content = xmltodict.parse(content)
        p, a, b, c = get_rel_info(content)
        rec.append([p, a, b, c])
        idx += 1
        # Progress report: print the exact number of records processed so far.
        if idx % 1000000 == 0:
            print(idx)

# Save 
with open("../dat/orcid_2022_all_recs.pkl", "wb") as f:
    pickle.dump(rec, f)

### Load the extracted information

In [None]:
%%time 
# ~6 minutes on the author's machine

# Reload the list of extracted records written by the extraction step.
# NOTE: pickle can execute arbitrary code on load; only open files you created.
with open('../dat/orcid_2022_all_recs.pkl', 'rb') as pkl_file:
    rec = pickle.load(pkl_file)

### The number of unique ORCID iDs in the ORCID public data 2022

In [None]:
# Position 1 of every record holds the ORCID iD; a set keeps the distinct ones.
uniq_ids = {record[1] for record in rec}
print(len(uniq_ids)) # It returns 14,845,876

### Selecting ORCID iDs having both education and employment information

In [None]:
# Keep a record only when both its employment entry (position 2) and its
# education entry (position 3) are non-empty.
rec_with_info = [record for record in rec if record[2] and record[3]]

uniq_ids_with_info = {record[1] for record in rec_with_info}
print(len(uniq_ids_with_info)) # It returns 2,199,705

### Convert the reduced records into CSV format

In [None]:
def clean_rec(a):
    """
    Flatten one extracted ORCID record into rows suitable for a CSV file.

    Parameters
    ----------
    a : sequence
        ``(name, orcid, employments, educations)``. ``name`` is a mapping
        holding the given/family names (or a falsy value). ``employments``
        and ``educations`` are each either a single affiliation-group
        mapping, a list of such mappings, or ``None``/empty.

    Returns
    -------
    list of list
        One row per affiliation group, employment rows first, then education
        rows. Each row is ``[orcid, first_name, last_name, <13 affiliation
        fields>, kind]`` where ``kind`` is ``'emp'`` or ``'edu'``. Fields that
        are absent from the summary are filled with ``''``.
    """
    person = a[0]
    if person:
        first_name = person.get('personal-details:given-names', '')
        last_name = person.get('personal-details:family-name', '')
    else:
        first_name = ''
        last_name = ''
    orcid = a[1]

    # Flattened key names as produced by pd.json_normalize(..., sep='_').
    fields = [
        'common:organization_common:name',
        'common:organization_common:address_common:city',
        'common:organization_common:address_common:region',
        'common:organization_common:address_common:country',
        'common:organization_common:disambiguated-organization_common:disambiguated-organization-identifier',
        'common:department-name',
        'common:role-title',
        'common:start-date_common:year',
        'common:start-date_common:month',
        'common:start-date_common:day',
        'common:end-date_common:year',
        'common:end-date_common:month',
        'common:end-date_common:day'
    ]

    # (row label, affiliation groups, key of the summary inside each group)
    sections = (
        ('emp', a[2], 'employment:employment-summary'),
        ('edu', a[3], 'education:education-summary'),
    )

    v = []
    for label, groups, summary_key in sections:
        if not groups:
            # None or empty: this record has no affiliations of this kind.
            continue
        # A lone group arrives as a mapping rather than a list. isinstance
        # also accepts dict subclasses such as OrderedDict, which a strict
        # type comparison would miss.
        if isinstance(groups, dict):
            groups = [groups]
        for group in groups:
            # json_normalize flattens the nested summary into one level of
            # '_'-joined keys; only the first resulting record is used.
            flat = pd.json_normalize(group[summary_key], sep='_').to_dict(orient='records')[0]
            values = [flat.get(f, '') for f in fields]
            v.append([orcid, first_name, last_name] + values + [label])

    return v

In [None]:
%%time
# ~3.5 hours on the author's machine

# clean_rec yields a list of rows per record; chain.from_iterable concatenates
# those per-record lists into one flat list of rows.
cleaned_rec = list(chain.from_iterable(map(clean_rec, rec_with_info)))

# Total number of flattened rows (one per employment/education entry)
print(len(cleaned_rec))

In [None]:
# Build a table from the flattened rows. No column names are supplied, so the
# columns are labelled with their integer positions.
df = pd.DataFrame(data=cleaned_rec)

# Write the table without the row index, then report (rows, columns).
df.to_csv(path_or_buf='../dat/orcid_2022_cleaned_df.csv', index=False)
print(df.shape)