# Imports

In [1]:
#for model-building
import joblib
import pandas as pd
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
pd.set_option("max_colwidth", None)
from simcse import SimCSE
from tqdm import tqdm
def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text


def simcse_preproc(model,names,device_name="cpu"):
    return model.encode(names,
                          device=device_name,
                          batch_size=100, 
                          return_numpy=True)


  from .autonotebook import tqdm as notebook_tqdm


# Read in data and run STEM classifier

In [53]:
d = pd.read_parquet("data/final_cleaning_dataset.parquet")

In [54]:
clf = joblib.load("data/final_stem_classifier.joblib")

In [55]:
from glob import glob
all_stem_pred = pd.concat([pd.read_parquet(x) for x in glob("/data/orcid/stem_pred*.parquet")])

In [56]:
full_data_tst =  d.groupby("clean_affiliation").size().reset_index()
full_data_tst.columns = ['cmd','n']
full_data_tst = full_data_tst.sort_values('n', ascending=False)


In [57]:
remaining_to_classify = full_data_tst[~full_data_tst.cmd.isin(all_stem_pred.cmd)]

In [59]:
del d

In [58]:
len(remaining_to_classify)

0

In [51]:
from joblib import Parallel, delayed
from tqdm import tqdm
import numpy as np

def process_batch(batch_df, batch_id):
    """Process a single batch"""
    print(f"starting batch: {batch_id}")
    model = SimCSE("princeton-nlp/sup-simcse-roberta-large")
    embs = simcse_preproc(model,batch_df.cmd.values.tolist())
    predictions = clf.predict_proba(embs)[:, 1]
    batch_df['stem_pred'] = predictions
    print(f"done w/ batch: {batch_id}")
    batch_df.to_parquet(f"/data/orcid/stem_pred_remaining2_{batch_id}.parquet")

batch_size = 4000
n_jobs = 15

# Create batches
batches = [remaining_to_classify.iloc[i:i+batch_size] for i in range(0, len(remaining_to_classify), batch_size)]

# Process in parallel
results = Parallel(n_jobs=n_jobs)(
    delayed(process_batch)(batch,i) for i,batch in enumerate(batches)
)


starting batch: 8
starting batch: 0
starting batch: 1
starting batch: 4
starting batch: 3
starting batch: 7
starting batch: 5
starting batch: 2
starting batch: 6


  0%|          | 0/40 [00:00<?, ?it/s]

starting batch: 9


100%|██████████| 3/3 [00:20<00:00,  6.74s/it]]
100%|██████████| 40/40 [04:54<00:00,  7.36s/it]
100%|██████████| 40/40 [05:00<00:00,  7.52s/it]
100%|██████████| 40/40 [05:07<00:00,  7.69s/it]
100%|██████████| 40/40 [05:18<00:00,  7.97s/it]
100%|██████████| 40/40 [05:19<00:00,  7.98s/it]
 98%|█████████▊| 39/40 [05:20<00:07,  7.83s/it]

done w/ batch: 9


100%|██████████| 40/40 [05:26<00:00,  8.15s/it]
100%|██████████| 40/40 [05:26<00:00,  8.16s/it]
100%|██████████| 40/40 [05:30<00:00,  8.25s/it]
100%|██████████| 40/40 [06:10<00:00,  9.27s/it]


# Read in classifier results

In [60]:
from glob import glob
all_stem_pred = pd.concat([pd.read_parquet(x) for x in glob("/data/orcid/stem_pred*.parquet")])

# Medicine Classifier

In [80]:
import re
medroots = [
"intensive care",
"medical",
"neurology",
"psychiatry",
'anatomy',
'cancer',
'cardio',
'dentist',
'dermato',
'endocr',
'Epidemi',
'gastroent',
'genetic',
'geriatr',
'Immuno',
'medicine',
'oncol',
'Ophtha',
'Optome',
'pathol',
'patholog',
'pediatric',
'Pharma',
'physio',
'physici',
'Podiatr',
'radiolo',
'surgery',
'surgeon',
'Veterin',
'obstetric',
'gynecolog',
'urolog',
'anaesthes',
'nephrolog'
]
med_root_re = re.compile(r"("+r"|".join([x.strip().lower() for x in medroots])+")", re.U|re.I)

In [81]:
len(medroots)

36

## Eval med classifier

In [82]:
from sklearn.metrics import precision_recall_curve,classification_report

validation_data = pd.read_csv("data/stem_validation_data.csv")
res = []
for field in validation_data.cmd.values:
    res.append(med_root_re.search(field) is not None and (
        'engineer' not in field.lower() and 
        ' plant ' not in field.lower())
              )
validation_data['med_clf'] = res

print(classification_report(validation_data['medicine final'], validation_data.med_clf))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       213
           1       0.96      0.70      0.81        37

    accuracy                           0.95       250
   macro avg       0.96      0.85      0.89       250
weighted avg       0.95      0.95      0.95       250



In [64]:
medroots+=[    'hospital',
    'toxicology',
    'neonatal',
    'emergency',
    'disease',
    'orthodonti',
    'physiology',
    'kinesiology',
    'audiology',
    "neonatology",
    "immune",
    "immunology",
            
          ]
med_root_re = re.compile(r"("+r"|".join([x.strip().lower() for x in medroots])+")", re.U|re.I)

In [65]:
from sklearn.metrics import precision_recall_curve,classification_report

validation_data = pd.read_csv("data/stem_validation_data.csv")
res = []
for field in validation_data.cmd.values:
    res.append(med_root_re.search(field) is not None and (
        'engineer' not in field.lower() and 
        ' plant ' not in field.lower())
              )
validation_data['med_clf'] = res

print(classification_report(validation_data['medicine final'], validation_data.med_clf))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       213
           1       0.97      0.97      0.97        37

    accuracy                           0.99       250
   macro avg       0.98      0.98      0.98       250
weighted avg       0.99      0.99      0.99       250



In [66]:
validation_data[(validation_data['medicine final'] == 0) & (validation_data.med_clf == True)]

Unnamed: 0,cmd,stem final,medicine final,med_clf
201,pharmacognosy &amp; pharmaceutical botany,1,0,True


In [67]:
validation_data[(validation_data['medicine final'] == 1) & (validation_data.med_clf == False)]

Unnamed: 0,cmd,stem final,medicine final,med_clf
149,forensic and applied sciences,1,1,False


In [68]:
res = []
for field in tqdm(all_stem_pred.cmd.values):
    res.append(med_root_re.search(field) is not None and (
        'biomedical engineer' not in field.lower() and 
        'plant' not in field.lower())
              )
all_stem_pred['med_clf'] = res

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3368097/3368097 [01:04<00:00, 52281.15it/s]


In [69]:
all_stem_pred.to_parquet("data/stem_and_med_classifications.parquet")

# Generate Roles

In [70]:
d = pd.read_parquet("data/final_cleaning_dataset.parquet")

In [71]:
clean_role = d.role.apply(lambda x : x.strip().replace(".","").lower() if x and type(x) is str else None)
d = d.assign(clean_role=clean_role)
roles = d.clean_role.value_counts()

done w/ batch: 7
done w/ batch: 2
done w/ batch: 6


In [72]:
regexes = [
 ["research", re.compile("research|scientist|scholar|ricercatore")],
 ["bachelors", re.compile("(\b|^|\s)((b[ \.]?(a|s|e|se|sc|s\.e|sn)(\b|$|\s|\.))|bachelor|btech|underg|licenciatura|graduação)")],
 ["masters/postgrad", re.compile("(\b|^|\s)((m[\. ]?(a|s|p|sc|as)(\b|$|\s|\.))|master|^me$|mlis|mba|mbbs|postgrad|m[ \.]?tech|mphil|mph|mestrado|magister|mbchb|meng|mlis)")],
 ["phd",re.compile("^doctor(ate)?$|^d[\.]?r[\.]?$|^m[\.]?d[\.]?$|ph[ \.]?d(\b|$|\s|\.)|doctor of|d(\. )?phil|doctorado|mestre|graduate student|(\b|\s|^)graduate (research|teaching)? ?assistant|pharmd|^jd$|doctoral student|doutor|doctorat")],
 ["postdoc", re.compile(r"""(?ix)
        post[\s\-]?doc(?:toral)?(?:\s+(?:fellow|researcher|scholar|associate|scientist))?
        |post[\s\-]?doctoral
        |fellow(?:\s+(?:in|at))?(?!\s+(?:of|and))  # Fellow but not "Fellow of..." or "Fellow and Dean"
        |research\s+fellow(?!\s+and\s+(?:professor|director))
        |postdoctoral\s+(?:fellow|researcher|associate|trainee)
        |pdf(?:\b|$)
        |junior\s+(?:research\s+)?fellow
        |visiting\s+fellow
    """)],
 ["prof",re.compile(r"""(?ix)
        prof(?:essor|\.)?(?:\s+(?:emeritus|emerita))?
        |(?:\b|^|\s)(?:assist(?:ant)?|assoc(?:iate)?|adj(?:unct)?|clin(?:ical)?|visit(?:ing)?)\s+prof
        |lect(?:urer)?(?:\b|$)
        |faculty(?:\s+member)?
        |reader(?:\s+in)?
        |docent[ie]
        |instructor
        |доцент
        |privatdozent
        |teaching\s+(?:professor|staff|faculty|fellow)
        |senior\s+(?:lecturer|instructor)
        |assistant\s+professor
        |associate\s+professor
        |full\s+professor
        |emeritus|emerita
        |distinguished\s+professor
        |endowed\s+(?:professor|chair)
    """)],
 ['head', re.compile("dean|director|head")]
]


In [73]:
from collections import defaultdict
matches = defaultdict(list)
captured = 0
total = 0
for i, x in roles.reset_index().iterrows():
    matched = ""
    for name, reg in regexes:
        if reg.search(x[0]):
            matched = name
    if matched != "":
        captured += x[1]
    total += x[1]
    matches[matched].append(x[0])

  if reg.search(x[0]):
  captured += x[1]
  total += x[1]
  matches[matched].append(x[0])


done w/ batch: 1
done w/ batch: 4
done w/ batch: 8
done w/ batch: 3
done w/ batch: 5
done w/ batch: 0


In [74]:
print(captured, total )
print('Percentage of non-null affiliations we can identify a role for: ', float(captured)/total)

9805519 15021573
Percentage of non-null affiliations we can identify a role for:  0.6527624636913857


In [75]:
print(len(matches['']), len(roles))
print('Percent of all unique role fields we can identify: ', len(matches[''])/float(len(roles)))

1302607 2734392
Percent of all unique role fields we can identify:  0.47637902685496447


In [76]:
role_df=pd.DataFrame([x for k,v in matches.items() for x in zip(v,[k]*len(v))],
                     columns=['clean_role','role_category'])

In [77]:
role_df.head()

Unnamed: 0,clean_role,role_category
0,phd,phd
1,phd student,phd
2,md,phd
3,phd candidate,phd
4,doctor,phd


In [78]:
role_df.to_parquet("data/roles.parquet",index=False)