# Imports

In [1]:
import gzip
import pandas as pd
from collections import Counter
import string
from collections import defaultdict
from bs4 import BeautifulSoup
import requests
import langid
import sys
from glob import glob
import os
import pickle
from tqdm import tqdm
import numpy as np
import random
import re, string
from glob import glob


STOPWORDS = set(['and','not','of','the'])

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
#for word embedding
import joblib
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
pd.set_option("max_colwidth", None)
import spacy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from simcse import SimCSE


def preprocess(text):
    text = text.lower() 
    text=text.strip()  
    text=re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

# STOPWORD REMOVAL
def stopword(x):
    a= [i for i in x.split() if i not in stopwords.words('english')]
    return ' '.join(a)
#LEMMATIZATION
# Initialize the lemmatizer
wl = WordNetLemmatizer()
 
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
# Tokenize the sentence
def lemmatizer(x):
    word_pos_tags = nltk.pos_tag(word_tokenize(x)) # Get position tags
    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return " ".join(a)


def finalpreprocess(x):
    return lemmatizer(stopword(preprocess(x)))

def return_linear_clf(name,data,preproc_fun, target_field= "is_STEM"):
    print("Preprocessed the datasets...")

    df_dept_vector = preproc_fun(data['fieldname'].values.tolist())
    X_train = df_dept_vector
    y_train = data[target_field].astype(int)
    base_clf = LogisticRegression(random_state=0)
    clf = CalibratedClassifierCV(base_clf, cv=5).fit(X_train, y_train)
    print("Trained model...")

    return name,clf,-1,-1,-1


def avg_emb_preproc(names):
    return np.asarray([nlp(x).vector for x in names], dtype="object")


def simcse_preproc(names,device_name="cpu"):
    return model.encode(names,
                          device=device_name,
                          batch_size=100, 
                          return_numpy=True)


[nltk_data] Downloading package punkt to
[nltk_data]     /home/csefaculty/kjoseph/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/csefaculty/kjoseph/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/csefaculty/kjoseph/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/csefaculty/kjoseph/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


# Read in data

In [None]:
d = pd.read_parquet("data/final_cleaning_dataset.parquet")

In [None]:
d.clean_affiliation.nunique()

In [None]:
d.clean_affiliation.sample(5)

In [None]:
len(d)

## Model Training

In [9]:
nlp = spacy.load("en_core_web_md")
model = SimCSE("princeton-nlp/sup-simcse-roberta-large")

### Load in Data from Larremore study

In [10]:
df_dept = pd.read_csv('data/larremore_field_data.csv')
df_dept = df_dept.dropna(subset=['DepartmentName', 'is_STEM'])

print(len(df_dept))

df_dept['orig_cleaned'] = df_dept['DepartmentName'].apply(lambda x: finalpreprocess(x))
df_dept['simple_cleaned'] = df_dept['DepartmentName'].str.replace(", (Graduate )?(Department|School|Institute|Division|College) of$","")

6119


In [11]:
name_cols = ['orig_cleaned','simple_cleaned','Field','Area','Umbrella']

In [12]:
dat = []
for x in name_cols:
    named_df = df_dept[[x,'is_STEM']].rename(columns={x:"fieldname"})
    named_df['ty'] = x
    dat.append(named_df)
fld = pd.concat(dat, ignore_index=True).drop_duplicates()
k = fld.groupby("fieldname").size()
k[k >1].index[:5]

Index(['Accounting and Information Systems, Department of',
       'Accounting, Finance and Information Systems, Department of',
       'Advanced Structures, School of',
       'Agricultural Education, Department of',
       'Agricultural Sciences and Forestry, School of'],
      dtype='object', name='fieldname')

In [13]:
# Toss things not labeled STEM/Not-STEM or things labeled both STEM/Not STEM
fld = fld[~fld.fieldname.isin(k[k>1].index.values)] 
fld = fld[pd.notnull(fld.fieldname)]

In [14]:
fld.sample(3)

Unnamed: 0,fieldname,is_STEM,ty
3271,hospitality business school,False,orig_cleaned
11657,"Agricultural Education, Communication and Marketing Division",True,simple_cleaned
9232,Philosophy Department,False,simple_cleaned


### Load in ORCID labeled training data

In [15]:
orcid_training = pd.read_csv("data/stem_training_data.csv",sep="\t")
print(orcid_training.groupby(['error in translation?']).n.sum())
orcid_training = orcid_training.loc[orcid_training['error in translation?'] == 0,:]
orcid_training.loc[:,'is_STEM'] = orcid_training['stem?'] == 1

m = orcid_training[['cmd','is_STEM']]
m.columns = ['fieldname','is_STEM']
m.loc[:,'ty'] = 'manual'
m.loc[:,'fieldname'] = m['fieldname'].str.replace("(Graduate )?(Department|School|Institute|Division|College) of ","",regex=True,flags=re.I)
fld = pd.concat([fld,m],axis=0)

error in translation?
0    1259313
1      64860
Name: n, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  m.loc[:,'ty'] = 'manual'


### Build models 

In [16]:
res = []
for preproc_fun in [simcse_preproc]:#avg_emb_preproc,
    for fieldlist in [ ['orig_cleaned'],
                       ['simple_cleaned'],
                       ['simple_cleaned','manual'],
                       ['orig_cleaned','Field','Area','Umbrella'],
                       ['simple_cleaned','Field','Area','Umbrella']]:
        name= "_".join(fieldlist)+"_"+preproc_fun.__name__
        print(name)
        res.append(return_linear_clf(name,fld[fld.ty.isin(fieldlist)],preproc_fun))

orig_cleaned_simcse_preproc
Preprocessed the datasets...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:29<00:00,  1.28it/s]


Trained model...
simple_cleaned_simcse_preproc
Preprocessed the datasets...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:38<00:00,  1.03it/s]


Trained model...
simple_cleaned_manual_simcse_preproc
Preprocessed the datasets...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 42/42 [00:40<00:00,  1.04it/s]


Trained model...
orig_cleaned_Field_Area_Umbrella_simcse_preproc
Preprocessed the datasets...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:26<00:00,  1.48it/s]


Trained model...
simple_cleaned_Field_Area_Umbrella_simcse_preproc
Preprocessed the datasets...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [00:41<00:00,  1.01s/it]


Trained model...


## Validate and Test

In [17]:
validation_data = pd.read_csv("data/stem_validation_data.csv")
validation_data.shape

(250, 3)

In [18]:
validation_data.head()

Unnamed: 0,cmd,stem final,medicine final
0,chemistry,1,0
1,physics,1,0
2,psychology,0,0
3,medicine,1,1
4,mechanical engineering,1,0


In [19]:
from sklearn.metrics import precision_recall_curve,classification_report
pd.options.mode.chained_assignment = None 
for name, clf in [(v[0],v[1]) for v in res]:
    print(name)
    if 'avg_emb' in name:
        embs = avg_emb_preproc(validation_data.cmd.values.tolist())
    else:
        embs = simcse_preproc(validation_data.cmd.values.tolist())
    validation_data[name] =  clf.predict_proba(embs)[:, 1]
    print(classification_report(validation_data['stem final'],validation_data[name] > .5))

orig_cleaned_simcse_preproc


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.40it/s]


              precision    recall  f1-score   support

           0       0.87      0.88      0.88       113
           1       0.90      0.89      0.90       137

    accuracy                           0.89       250
   macro avg       0.89      0.89      0.89       250
weighted avg       0.89      0.89      0.89       250

simple_cleaned_simcse_preproc


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.40it/s]


              precision    recall  f1-score   support

           0       0.87      0.87      0.87       113
           1       0.89      0.89      0.89       137

    accuracy                           0.88       250
   macro avg       0.88      0.88      0.88       250
weighted avg       0.88      0.88      0.88       250

simple_cleaned_manual_simcse_preproc


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.20it/s]


              precision    recall  f1-score   support

           0       0.90      0.86      0.88       113
           1       0.89      0.92      0.90       137

    accuracy                           0.89       250
   macro avg       0.89      0.89      0.89       250
weighted avg       0.89      0.89      0.89       250

orig_cleaned_Field_Area_Umbrella_simcse_preproc


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.35it/s]


              precision    recall  f1-score   support

           0       0.88      0.88      0.88       113
           1       0.90      0.90      0.90       137

    accuracy                           0.89       250
   macro avg       0.89      0.89      0.89       250
weighted avg       0.89      0.89      0.89       250

simple_cleaned_Field_Area_Umbrella_simcse_preproc


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:02<00:00,  1.24it/s]

              precision    recall  f1-score   support

           0       0.87      0.87      0.87       113
           1       0.89      0.89      0.89       137

    accuracy                           0.88       250
   macro avg       0.88      0.88      0.88       250
weighted avg       0.88      0.88      0.88       250






In [20]:
name = 'simple_cleaned_manual_simcse_preproc'
validation_data[(validation_data['stem final'] == 0) & (validation_data[name] > .5)][['cmd',name]]

Unnamed: 0,cmd,simple_cleaned_manual_simcse_preproc
12,lime,0.798437
31,anthropology,0.665309
79,nutrition and dietetics,0.793249
118,school science education,0.86499
120,viticulture and enology,0.992879
127,applied human sciences,0.706979
129,conservation department,0.704696
157,aquatic animal health management,0.982278
164,transport planning and systems,0.559129
168,health care studies,0.510153


In [21]:
name = 'simple_cleaned_manual_simcse_preproc'
validation_data[(validation_data['stem final'] == 1) & (validation_data[name] < .5)][['cmd',name]]

Unnamed: 0,cmd,simple_cleaned_manual_simcse_preproc
45,dentistry,0.236027
65,it,0.282301
75,emergency department,0.255311
86,pediatric dentistry,0.41093
98,engenharia de alimentos,0.466185
111,head and neck surgery,0.226332
161,inibioma,0.034367
171,standing committee on medical education,0.20642
180,geography &amp; spatial planning,0.221268
203,"\""alexandra\"" hospital, 1st ob/gyn department",0.183869


# Save best model

In [None]:
import joblib

clf =[x[1] for x in res if x[0] == 'simple_cleaned_manual_simcse_preproc'][0]
# Save the model
joblib.dump(clf, 'data/final_stem_classifier.joblib')
