In [27]:
import sys
import pandas as pd
import numpy as np
import re
from io import StringIO
from html.parser import HTMLParser
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as multi_score
from collections import Counter
from bs4 import BeautifulSoup

In [6]:
CIT_AUTHOR = '../../data/processed/large/openalex_citation_author_df.csv'
REF_AUTHOR = '../../data/processed/openalex_reference_author_df_unique.csv'
# openalex author df for VIS papers:
OA_AUTHOR = '../../data/interim/openalex_author_df.csv'

In [7]:
def get_simple_df(fname):
	"""
		- remove nan, 
		- get only two target columns, i.e., raw string and aff type
		- drop duplicates
	"""
	raw_string = 'Raw Affiliation String'
	aff_type = 'First Institution Type'
	df = pd.read_csv(fname)
	df = df[(df[raw_string].notnull()) & (df[aff_type].notnull())]
	df = df[[raw_string, aff_type]]
	df = df.drop_duplicates()
	return df

In [8]:
def get_df(cit_author, ref_author, oa_author):
	"""concatenate, drop_duplicates, reset index, rename columns,
		factorize label_raw

	Returns:
		the df used for model training and testing. It contains five columns:
			1. aff, which is pre-processed strings of affiliations
			2. label_raw, which is country codes in strings,
			3. label: which is factorized version of country codes
			4. binary_label_raw
			5. binary_label
	"""

	df = pd.concat(
		[oa_author, ref_author, cit_author], ignore_index = True
		).drop_duplicates().reset_index(drop=True)
	df.columns = ['aff', 'label_raw']
	df = df.assign(label = pd.factorize(df['label_raw'])[0])
	df = df.assign(binary_label_raw = np.where(
		df.label_raw == 'education', 'education', 'non-education'))
	df = df.assign(binary_label = pd.factorize(df['binary_label_raw'])[0])
	return df 

In [9]:
def get_dicts(df):
	"""get four dicts; id <--> type, for both binary and multiclass
	"""
	multi_type_to_id = dict(zip(df.label_raw, df.label))
	id_to_multi_type = dict(zip(df.label, df.label_raw))
	binary_type_to_id = dict(zip(df.binary_label_raw, df.binary_label))
	id_to_binary_type = dict(zip(df.binary_label, df.binary_label_raw))
	return multi_type_to_id, id_to_multi_type, binary_type_to_id, id_to_binary_type

In [10]:
cit_author = get_simple_df(CIT_AUTHOR)
ref_author = get_simple_df(REF_AUTHOR)
oa_author = get_simple_df(OA_AUTHOR)

In [11]:
df = get_df(cit_author, ref_author, oa_author)

In [12]:
df.sample(10)

Unnamed: 0,aff,label_raw,label,binary_label_raw,binary_label
32493,"Western Carolina Univ., Cullowhee NC US",education,0,education,0
54116,"Fraunhofer IDM@NTU, Singapore, Singapore",education,0,education,0
74508,"Intelligent Media Laboratory, Department of So...",education,0,education,0
32069,"Center for Bioinformatics, Harvard Center for ...",education,0,education,0
5988,"Department of Mathematics, University Of South...",education,0,education,0
48783,"Department of Psychological Sciences, Purdue U...",education,0,education,0
13627,"Behavioral Medicine Research Group, School of ...",education,0,education,0
39968,"Department of Software Engineering, Faculty of...",education,0,education,0
27543,"INRIA Moais research team, CNRS LIG Laboratory...",education,0,education,0
26435,"Brown University, CS Dept., Box 1910, Providen...",education,0,education,0


In [13]:
multi_type_to_id, id_to_multi_type, binary_type_to_id, id_to_binary_type = get_dicts(df)

In [15]:
def clean_text(text):
    """
    Takes a string and returns a string
    """
    # remove html tags, lowercase, remove nonsense, remove non-letter
    aff = BeautifulSoup(text, "lxml").text 
    aff = aff.lower()
    aff = re.sub(r'xa0|#n#‡#n#|#tab#|#r#|\[|\]', "", aff)
    aff = re.sub(r'[^a-z]+', ' ', aff)
    return aff

In [16]:
df['aff'].apply(lambda x: len(x.split(' '))).sum()

787865

In [17]:
df['aff'] = df['aff'].apply(clean_text)

In [19]:
df['aff'].apply(lambda x: len(x.split(' '))).sum()

806619

## Binary

In [20]:
X = df.aff
y = df.binary_label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [21]:
logreg_binary = Pipeline([('vect', CountVectorizer(stop_words='english', min_df = 5)),
                ('clf', LogisticRegression(max_iter=600)),
               ])
logreg_binary.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(min_df=5, stop_words='english')),
                ('clf', LogisticRegression(max_iter=600))])

In [22]:
%%time

y_pred = logreg_binary.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=list(set(df.binary_label_raw))))

accuracy 0.9505436223813312
               precision    recall  f1-score   support

    education       0.96      0.98      0.97     11682
non-education       0.91      0.87      0.89      3402

     accuracy                           0.95     15084
    macro avg       0.94      0.92      0.93     15084
 weighted avg       0.95      0.95      0.95     15084

CPU times: user 261 ms, sys: 5.56 ms, total: 266 ms
Wall time: 270 ms


## Multi

In [23]:
X = df.aff
y = df.label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [24]:
logreg_multi = Pipeline([('vect', CountVectorizer(stop_words='english', min_df = 5)),
                ('clf', LogisticRegression(max_iter=600)),
               ])
logreg_multi.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(min_df=5, stop_words='english')),
                ('clf', LogisticRegression(max_iter=600))])

In [25]:
%%time

y_pred = logreg_multi.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=list(set(df.label_raw))))

accuracy 0.9231636170776982
              precision    recall  f1-score   support

       other       0.96      0.98      0.97     11682
   education       0.82      0.85      0.83       768
   nonprofit       0.79      0.70      0.74      1138
     company       0.74      0.70      0.72       558
  healthcare       0.81      0.64      0.71       318
    facility       0.87      0.33      0.47        80
  government       0.81      0.72      0.76       503
     archive       0.96      0.73      0.83        37

    accuracy                           0.92     15084
   macro avg       0.84      0.71      0.76     15084
weighted avg       0.92      0.92      0.92     15084

CPU times: user 285 ms, sys: 15.9 ms, total: 301 ms
Wall time: 313 ms


## Predict

In [156]:
MERGED_AUTHOR = '../../data/processed/merged_author_df.csv'

In [157]:
merged = pd.read_csv(MERGED_AUTHOR)

In [160]:
pred = logreg.predict(merged['IEEE Author Affiliation Filled'])
Counter(pred)

Counter({0: 9259, 1: 3169})

In [161]:
merged['IEEE Author Affiliation Filled'].apply(lambda x: len(x.split(' '))).sum()

84150

In [162]:
merged['IEEE Author Affiliation Filled'] = merged['IEEE Author Affiliation Filled'].apply(clean_text)

In [163]:
merged['IEEE Author Affiliation Filled'].apply(lambda x: len(x.split(' '))).sum()

86344

In [174]:
pred_binary = logreg_binary.predict(merged['IEEE Author Affiliation Filled'])
pred_binary_type = [id_to_binary_type[x] for x in pred_binary]
Counter(pred_binary_type)

Counter({'education': 9396, 'non-education': 3032})

In [176]:
pred_multi = logreg_multi.predict(merged['IEEE Author Affiliation Filled'])
pred_multi_type = [id_to_multi_type[x] for x in pred_multi]
Counter(pred_multi_type)

Counter({'education': 9534,
         'company': 1342,
         'facility': 1090,
         'government': 247,
         'healthcare': 140,
         'archive': 14,
         'other': 11,
         'nonprofit': 50})

In [177]:
good = 'label_raw'

In [178]:
df[good]

0        education
1        education
2          company
3        education
4         facility
           ...    
75412    education
75413    education
75414    education
75415    education
75416    education
Name: label_raw, Length: 75417, dtype: object