In [24]:
import sys
import pandas as pd
import numpy as np
import re
from io import StringIO
from html.parser import HTMLParser
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as multi_score
from collections import Counter
from bs4 import BeautifulSoup
# Relax pandas display limits: show up to 500 rows, every column, and full
# cell contents, so the wide author tables displayed below are not elided.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [11]:
# --- Training inputs -------------------------------------------------------
# Each of these three CSVs is read through get_simple_df(), which keeps only
# the raw affiliation string and the institution type.
# Presumably authors of citing papers and of referenced papers respectively
# (inferred from the file names -- TODO confirm).
CIT_AUTHOR = '../../data/processed/large/openalex_citation_author_df.csv'
REF_AUTHOR = '../../data/processed/openalex_reference_author_df_unique.csv'
# OpenAlex author table for the VIS papers themselves
OA_AUTHOR = '../../data/interim/openalex_author_df.csv'

# --- Prediction target -----------------------------------------------------
# Table whose 'IEEE Author Affiliation Filled' column gets classified.
MERGED_AUTHOR = '../../data/processed/merged_author_df.csv'

# --- Outputs (relative to the notebook's working directory) ----------------
# Not referenced in the cells visible here; presumably the destination of the
# predicted table -- TODO confirm.
MERGED_AFF_TYPE_PREDICTED = 'merged_aff_type_predicted.csv'
# Text report that logist_regression() opens in append mode.
TYPE_CLASSIFICATION_REPORT = 'aff_type_classification_report.txt'

In [12]:
def get_simple_df(fname):
    """Load an author CSV and reduce it to unique (affiliation, type) pairs.

    Args:
        fname: path (or file-like object) handed to ``pd.read_csv``.

    Returns:
        A DataFrame with exactly two columns, 'Raw Affiliation String' and
        'First Institution Type', containing only rows where both values are
        present and with duplicate pairs removed. The original row index of
        the surviving rows is kept.
    """
    wanted = ['Raw Affiliation String', 'First Institution Type']
    table = pd.read_csv(fname)
    # A row is usable only when both the text and its label are present.
    both_present = table[wanted[0]].notna() & table[wanted[1]].notna()
    return table.loc[both_present, wanted].drop_duplicates()

def get_df(cit_author, ref_author, oa_author):
    """Stack the three two-column frames into one labelled training frame.

    The inputs are concatenated in the order oa_author, ref_author,
    cit_author; exact duplicate rows are dropped and the index is rebuilt.

    Returns:
        A DataFrame with five columns:
            1. aff: the affiliation string (first input column)
            2. label_str: the institution type as a string (second input column)
            3. label: integer code for label_str, numbered in order of
               first appearance
            4. binary_label_str: 'education' when label_str is 'education',
               otherwise 'non-education'
            5. binary_label: integer code for binary_label_str, numbered in
               order of first appearance
    """
    stacked = pd.concat([oa_author, ref_author, cit_author], ignore_index=True)
    stacked = stacked.drop_duplicates().reset_index(drop=True)
    stacked.columns = ['aff', 'label_str']

    # Multiclass target: one integer per distinct institution type.
    multi_codes, _ = pd.factorize(stacked['label_str'])
    stacked['label'] = multi_codes

    # Binary target: education versus everything else.
    is_education = stacked['label_str'] == 'education'
    stacked['binary_label_str'] = np.where(
        is_education, 'education', 'non-education')
    binary_codes, _ = pd.factorize(stacked['binary_label_str'])
    stacked['binary_label'] = binary_codes
    return stacked

def get_dicts(df):
    """Build name <-> id lookup tables for both labelling schemes.

    Returns a 4-tuple, in this order:
        multi_type_to_id, id_to_multi_type, binary_type_to_id, id_to_binary_type

    Each table is filled row by row, so if a key occurs more than once the
    value from the last row wins.
    """
    def _two_way(names, ids):
        # One pass fills both directions of the mapping.
        name_to_id, id_to_name = {}, {}
        for name, idx in zip(names, ids):
            name_to_id[name] = idx
            id_to_name[idx] = name
        return name_to_id, id_to_name

    multi_type_to_id, id_to_multi_type = _two_way(df.label_str, df.label)
    binary_type_to_id, id_to_binary_type = _two_way(
        df.binary_label_str, df.binary_label)
    return multi_type_to_id, id_to_multi_type, binary_type_to_id, id_to_binary_type

def clean_text(text):
    """
    Normalise one affiliation string for the bag-of-words models.

    Takes a string and returns a string. Steps, in order:
        1. strip HTML markup (parsed with BeautifulSoup / lxml)
        2. lowercase
        3. delete known junk tokens and square brackets
        4. collapse every run of characters outside a-z into one space
    """
    lowered = BeautifulSoup(text, "lxml").get_text().lower()
    without_junk = re.sub(r'xa0|#n#‡#n#|#tab#|#r#|\[|\]', "", lowered)
    return re.sub(r'[^a-z]+', ' ', without_junk)

def logist_regression(df, LABEL):
    '''
    Train a bag-of-words logistic regression on affiliation strings and
    append its hold-out evaluation to the report file.

    Input: 
        df: frame with an `aff` text column, the integer id column named by
            LABEL, and the matching name column (`label_str` for 'label',
            `binary_label_str` for 'binary_label')
        LABEL: 'label' if multiclass and 'binary_label' if binary
    Returns:
        logreg: logistic regression classifier (model), a fitted Pipeline of
            CountVectorizer + LogisticRegression trained on the 80% split

    Side effects:
        Prints a progress line and appends the accuracy and per-class report
        for the 20% hold-out split to TYPE_CLASSIFICATION_REPORT.
    '''
    is_multiclass = LABEL == 'label'
    name_col = 'label_str' if is_multiclass else 'binary_label_str'
    logreg_type = ('multiclass classification' if is_multiclass
                   else 'binary classification')

    X = df.aff
    y = df[LABEL]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state = 42)
    logreg = Pipeline([('vect', CountVectorizer(stop_words='english', min_df = 5)),
                ('clf', LogisticRegression(max_iter=600)),
               ])
    print('model training now...')
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    # classification_report pairs target_names with labels positionally, so
    # the names must be listed in label-id order. (A set has arbitrary order
    # and would attach the wrong name to a row.) Passing `labels` explicitly
    # also keeps the report well-formed when a rare class is absent from the
    # hold-out split.
    id_names = (df[[LABEL, name_col]]
                .drop_duplicates(subset=LABEL)
                .sort_values(LABEL))
    labels = id_names[LABEL].tolist()
    target_names = id_names[name_col].tolist()

    # Append mode: the multiclass and binary runs share one report file.
    # The context manager guarantees the handle is flushed and closed.
    with open(TYPE_CLASSIFICATION_REPORT, 'a') as f:
        f.write('The following is the result for aff type' + ' : ' + logreg_type + '\n')
        f.write('accuracy %s' % accuracy_score(y_test, y_pred))
        f.write('\n')
        f.write(classification_report(
            y_test, y_pred, labels=labels, target_names=target_names))
        f.write('\n')
        f.write('\n')

    return logreg

def get_processed_merged_author(DF, LOGREG_MULTI, LOGREG_BINARY,
                                id_to_multi=None, id_to_binary=None):
    '''
    Classify the affiliation of every row with both fitted models.

    NOTE(review): this notebook defines this function in two cells; the
    later definition is the one in effect. Keep them in sync or remove one.

    Input: 
        - DF: merged
        - LOGREG_MULTI: fitted multiclass model
        - LOGREG_BINARY: fitted binary model
        - id_to_multi: optional {id: type name} mapping for the multiclass
          model; defaults to the notebook-level `id_to_multi_type`
        - id_to_binary: optional {id: type name} mapping for the binary
          model; defaults to the notebook-level `id_to_binary_type`
    Returns:
        - A copy of DF with binary and multiclass classification results
          in 'aff_type_results_binary' / 'aff_type_results_multiclass', and
          with 'IEEE Author Affiliation Filled' replaced by its cleaned
          text. The frame passed in is left unmodified.
    '''
    aff_col = 'IEEE Author Affiliation Filled'

    # Fall back to the lookup tables built by get_dicts() in the notebook
    # namespace when the caller does not pass them explicitly.
    if id_to_multi is None:
        id_to_multi = id_to_multi_type
    if id_to_binary is None:
        id_to_binary = id_to_binary_type

    # Work on a copy so the caller's raw affiliation strings survive and
    # re-running the cell does not re-clean already-cleaned text.
    result = DF.copy()

    # clean text for affs to be predicted
    result[aff_col] = result[aff_col].apply(clean_text)
    cleaned = result[aff_col]

    pred_binary = LOGREG_BINARY.predict(cleaned)
    pred_multi = LOGREG_MULTI.predict(cleaned)
    # Translate predicted integer ids back to readable type names.
    result['aff_type_results_binary'] = [id_to_binary[x] for x in pred_binary]
    result['aff_type_results_multiclass'] = [id_to_multi[x] for x in pred_multi]
    return result

In [13]:
# Load the three training sources, each reduced to unique
# (affiliation string, institution type) pairs, then the table to classify.
cit_author, ref_author, oa_author = [
    get_simple_df(path) for path in (CIT_AUTHOR, REF_AUTHOR, OA_AUTHOR)
]
merged = pd.read_csv(MERGED_AUTHOR)

In [14]:
# Combine the three sources into one labelled training frame.
df = get_df(cit_author=cit_author, ref_author=ref_author, oa_author=oa_author)

In [15]:
# Normalise the training text with the same cleaner used at prediction time.
df['aff'] = df['aff'].map(clean_text)

In [16]:
# Two-way lookups between type names and their integer ids
(multi_type_to_id, id_to_multi_type,
 binary_type_to_id, id_to_binary_type) = get_dicts(df)

# Fit the multiclass model first, then the binary one; each call appends
# its evaluation to the classification report file.
logreg_multi, logreg_binary = [
    logist_regression(df, label_col) for label_col in ('label', 'binary_label')
]

model training now...
model training now...


In [17]:
def get_processed_merged_author(DF, LOGREG_MULTI, LOGREG_BINARY,
                                id_to_multi=None, id_to_binary=None):
    '''
    Classify the affiliation of every row with both fitted models.

    NOTE(review): this notebook defines this function in two cells; the
    later definition is the one in effect. Keep them in sync or remove one.

    Input: 
        - DF: merged
        - LOGREG_MULTI: fitted multiclass model
        - LOGREG_BINARY: fitted binary model
        - id_to_multi: optional {id: type name} mapping for the multiclass
          model; defaults to the notebook-level `id_to_multi_type`
        - id_to_binary: optional {id: type name} mapping for the binary
          model; defaults to the notebook-level `id_to_binary_type`
    Returns:
        - A copy of DF with binary and multiclass classification results
          in 'aff_type_results_binary' / 'aff_type_results_multiclass', and
          with 'IEEE Author Affiliation Filled' replaced by its cleaned
          text. The frame passed in is left unmodified.
    '''
    aff_col = 'IEEE Author Affiliation Filled'

    # Fall back to the lookup tables built by get_dicts() in the notebook
    # namespace when the caller does not pass them explicitly.
    if id_to_multi is None:
        id_to_multi = id_to_multi_type
    if id_to_binary is None:
        id_to_binary = id_to_binary_type

    # Work on a copy so the caller's raw affiliation strings survive and
    # re-running the cell does not re-clean already-cleaned text.
    result = DF.copy()

    # clean text for affs to be predicted
    result[aff_col] = result[aff_col].apply(clean_text)
    cleaned = result[aff_col]

    pred_binary = LOGREG_BINARY.predict(cleaned)
    pred_multi = LOGREG_MULTI.predict(cleaned)
    # Translate predicted integer ids back to readable type names.
    result['aff_type_results_binary'] = [id_to_binary[x] for x in pred_binary]
    result['aff_type_results_multiclass'] = [id_to_multi[x] for x in pred_multi]
    return result

In [18]:
# Attach binary and multiclass type predictions to the merged author table.
merged_processed = get_processed_merged_author(
    DF=merged,
    LOGREG_MULTI=logreg_multi,
    LOGREG_BINARY=logreg_binary,
)

## Check

The cells below first let my hand-coded institution type override the classifier's prediction wherever a hand-coded value exists, and then list the rows where the hand-coded type and the type inferred by the classifier disagree.

In [21]:
# Final binary type: the hand-coded value wins; the classifier's prediction
# is used only where no hand-coded value exists.
merged_processed = merged_processed.assign(
    aff_type_results_binary_updated=lambda frame: np.where(
        frame['Binary Institution Type By Hand'].isna(),
        frame['aff_type_results_binary'],
        frame['Binary Institution Type By Hand'],
    )
)

In [28]:
# Final multiclass type: the hand-coded value wins; the classifier's
# prediction is used only where no hand-coded value exists.
merged_processed = merged_processed.assign(
    aff_type_results_multiclass_updated=lambda frame: np.where(
        frame['First Institution Type By Hand'].isna(),
        frame['aff_type_results_multiclass'],
        frame['First Institution Type By Hand'],
    )
)

In [29]:
# Rows where the classifier's binary prediction differs from the final
# (hand-corrected) binary type
merged_processed.loc[
    merged_processed['aff_type_results_binary'].ne(
        merged_processed['aff_type_results_binary_updated'])
]

Unnamed: 0,Year,DOI,Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,IEEE Author ID,IEEE Author Affiliation Updated,IEEE One Affiliation,Number of Authors,Author Name,Author Position,Author Position Type,OpenAlex Author ID,Author ORCID,Number of Affiliations,First Institution Name Updated,Raw Affiliation String Updated,First Institution ID,First Institution ROR,First Institution Type,First Institution Country Code,First Institution Country Code By Hand,First Institution Type By Hand,Binary Institution Type,Binary Institution Type By Hand,IEEE Author Affiliation Filled,aff_type_results_binary,aff_type_results_multiclass,aff_type_results_binary_updated,aff_type_results_multiclass_updated
1790,2004,10.1109/VISUAL.2004.88,Real-time motion estimation and visualization on graphics cards,2.0,2.0,C. Garbe,https://ieeexplore.ieee.org/author/37282542100,"Interdisciplinary Center for Scientific Computing, Heidelberg, Germany",,2.0,Christoph S. Garbe,2.0,last,https://openalex.org/A1974301229,,1.0,"Interdisciplinary Center for Scientific Computing, Heidelberg, Germany","Interdisciplinary Center for Scientific Computing, Heidelberg, Germany",https://openalex.org/I68265846,,,,DE,education,,education,interdisciplinary center for scientific computing heidelberg germany,non-education,facility,education,education
4100,2000,10.1109/VISUAL.2000.885722,Fairing of non-manifolds for visualization,2.0,2.0,M. Gross,,,,2.0,Markus Gross,2.0,last,https://openalex.org/A2289975239,https://orcid.org/0000-0002-6838-9775,1.0,ETH Zurich,"ETH Zentrum, CH - 8092 Switzerland",https://openalex.org/I35440088,https://ror.org/05a28rw58,education,CH,CH,education,education,education,eth zentrum ch switzerland,non-education,facility,education,education
6134,1996,10.1109/VISUAL.1996.568115,FEL: The Field Encapsulation Library,3.0,3.0,M. Gerald-Yamasaki,,NASA Ames Research Center,,3.0,Gerald-Yamasaki,3.0,last,https://openalex.org/A3142045872,,,NASA Ames Research Center,NASA Ames Research Center,,,,,US,education,,education,nasa ames research center,non-education,facility,education,education
7173,2006,10.1109/TVCG.2006.182,Techniques for the Visualization of Topological Defect Behavior in Nematic Liquid Crystals,5.0,1.0,Vadim Slavin,https://ieeexplore.ieee.org/author/38185192000,"Brown University, United States",,5.0,Vadim Slavin,1.0,first,https://openalex.org/A2544274926,,,"Brown University, United States","Brown University, United States",,,,,US,company,,non-education,brown university united states,education,education,non-education,company
7353,2000,10.1109/VISUAL.2000.885735,Visualization of time dependent confocal microscopy data,6.0,3.0,P.J. Verschure,https://ieeexplore.ieee.org/author/37340174000,"Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands",,6.0,Pernette J. Verschure,3.0,middle,https://openalex.org/A2109779337,https://orcid.org/0000-0003-2922-4836,,"Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands","Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands",,,,,NL,education,,education,swammerdam inst for life sciences biocentrum amsterdam amsterdam netherlands,non-education,facility,education,education
7354,2000,10.1109/VISUAL.2000.885735,Visualization of time dependent confocal microscopy data,6.0,4.0,A.E. Visser,https://ieeexplore.ieee.org/author/37340814600,"Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands",,6.0,A.E. Visser,4.0,middle,https://openalex.org/A2650925780,,,"Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands","Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands",,,,,NL,education,,education,swammerdam inst for life sciences biocentrum amsterdam amsterdam netherlands,non-education,facility,education,education
7355,2000,10.1109/VISUAL.2000.885735,Visualization of time dependent confocal microscopy data,6.0,5.0,E.M.M. Manders,https://ieeexplore.ieee.org/author/37327214100,"Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands",,6.0,Erik M. M. Manders,5.0,middle,https://openalex.org/A2556846578,,1.0,"Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands","Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands",,,,,NL,education,,education,swammerdam inst for life sciences biocentrum amsterdam amsterdam netherlands,non-education,facility,education,education
7356,2000,10.1109/VISUAL.2000.885735,Visualization of time dependent confocal microscopy data,6.0,6.0,R. Van Drielf,https://ieeexplore.ieee.org/author/37086980868,"Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands",,6.0,R. Van Drielf,6.0,last,https://openalex.org/A3160531289,,1.0,"Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands","Swammerdam Inst. for Life Sciences, BioCentrum Amsterdam, Amsterdam, Netherlands",,,,,NL,education,,education,swammerdam inst for life sciences biocentrum amsterdam amsterdam netherlands,non-education,facility,education,education


In [32]:
# Rows where the classifier's multiclass prediction differs from the final
# (hand-corrected) multiclass type
merged_processed.loc[
    merged_processed['aff_type_results_multiclass'].ne(
        merged_processed['aff_type_results_multiclass_updated'])
]

Unnamed: 0,Year,DOI,Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,IEEE Author ID,IEEE Author Affiliation Updated,IEEE One Affiliation,Number of Authors,Author Name,Author Position,Author Position Type,OpenAlex Author ID,Author ORCID,Number of Affiliations,First Institution Name Updated,Raw Affiliation String Updated,First Institution ID,First Institution ROR,First Institution Type,First Institution Country Code,First Institution Country Code By Hand,First Institution Type By Hand,Binary Institution Type,Binary Institution Type By Hand,IEEE Author Affiliation Filled,aff_type_results_binary,aff_type_results_multiclass,aff_type_results_binary_updated,aff_type_results_multiclass_updated
145,2004,10.1109/INFVIS.2004.1,A Comparison of the Readability of Graphs Using Node-Link and Matrix-Based Representations,3.0,2.0,J.-D. Fekete,https://ieeexplore.ieee.org/author/37407972900,INRIA,True,3.0,Jean-Daniel Fekete,2.0,middle,https://openalex.org/A2154968417,https://orcid.org/0000-0003-3770-8726,1.0,INRIA,INRIA,https://openalex.org/I1326498283,https://ror.org/02kvxyf05,government,FR,FR,nonprofit,non-education,non-education,inria,non-education,government,non-education,nonprofit
1218,1999,10.1109/VISUAL.1999.809907,Collapsing Flow Topology Using Area Metrics,2.0,1.0,W. De Leeuw,,"Center for Mathematics and Computer Science (CWI), Netherlands",True,2.0,W. de Leeuw,1.0,first,https://openalex.org/A2103119713,,1.0,"Center for Mathematics and Computer Science (CWI), Netherlands","Center for Mathematics and Computer Science (CWI), Netherlands",,,,,NL,government,,non-education,center for mathematics and computer science cwi netherlands,non-education,facility,non-education,government
1219,1999,10.1109/VISUAL.1999.809907,Collapsing Flow Topology Using Area Metrics,2.0,2.0,R. Van Liere,,"Center for Mathematics and Computer Science (CWI), Netherlands",,2.0,R. van Liere,2.0,last,https://openalex.org/A2125901948,,1.0,"Center for Mathematics and Computer Science (CWI), Netherlands","Center for Mathematics and Computer Science (CWI), Netherlands",,,,,NL,government,,non-education,center for mathematics and computer science cwi netherlands,non-education,facility,non-education,government
1789,2004,10.1109/VISUAL.2004.88,Real-time motion estimation and visualization on graphics cards,2.0,1.0,R. Strzodka,https://ieeexplore.ieee.org/author/37282542600,"Caesar Research Center, Bonn, Germany",True,2.0,Robert Strzodka,1.0,first,https://openalex.org/A2679871087,,1.0,"Caesar Research Center, Bonn, Germany","Caesar Research Center, Bonn, Germany",,,,,DE,nonprofit,,non-education,caesar research center bonn germany,non-education,facility,non-education,nonprofit
1790,2004,10.1109/VISUAL.2004.88,Real-time motion estimation and visualization on graphics cards,2.0,2.0,C. Garbe,https://ieeexplore.ieee.org/author/37282542100,"Interdisciplinary Center for Scientific Computing, Heidelberg, Germany",,2.0,Christoph S. Garbe,2.0,last,https://openalex.org/A1974301229,,1.0,"Interdisciplinary Center for Scientific Computing, Heidelberg, Germany","Interdisciplinary Center for Scientific Computing, Heidelberg, Germany",https://openalex.org/I68265846,,,,DE,education,,education,interdisciplinary center for scientific computing heidelberg germany,non-education,facility,education,education
2511,1990,10.1109/VISUAL.1990.146398,A system for three-dimensional acoustic 'visualization' in a virtual environment workstation,4.0,1.0,E.M. Wenzel,https://ieeexplore.ieee.org/author/37618567100,"NASA Ames Research Center, Moffett Field, CA, USA",True,4.0,Elizabeth M. Wenzel,1.0,first,https://openalex.org/A2066527351,,1.0,"NASA Ames Research Center, Moffett Field, CA, USA","NASA Ames Research Center, Moffett Field, CA, USA",https://openalex.org/I1280536761,https://ror.org/02acart68,facility,US,US,government,non-education,non-education,nasa ames research center moffett field ca usa,non-education,facility,non-education,government
2913,2011,10.1109/TVCG.2011.207,GPU-based Real-Time Approximation of the Ablation Zone for Radiofrequency Ablation,4.0,1.0,Christian Rieder,https://ieeexplore.ieee.org/author/38017035400,"Fraunhofer MEVIS, Germany",True,4.0,Christian Rieder,1.0,first,https://openalex.org/A2277397071,,1.0,"Fraunhofer MEVIS, Germany","Fraunhofer MEVIS, Germany",,,,,DE,company,,non-education,fraunhofer mevis germany,non-education,facility,non-education,company
2915,2011,10.1109/TVCG.2011.207,GPU-based Real-Time Approximation of the Ablation Zone for Radiofrequency Ablation,4.0,3.0,Christian Schumann,https://ieeexplore.ieee.org/author/37089038269,"Fraunhofer MEVIS, Germany",True,4.0,Christian Schumann,3.0,middle,https://openalex.org/A2134832738,,1.0,"Fraunhofer MEVIS, Germany","Fraunhofer MEVIS, Germany",,,,,DE,company,,non-education,fraunhofer mevis germany,non-education,facility,non-education,company
2916,2011,10.1109/TVCG.2011.207,GPU-based Real-Time Approximation of the Ablation Zone for Radiofrequency Ablation,4.0,4.0,Horst K. Hahn,https://ieeexplore.ieee.org/author/37729702200,"Fraunhofer MEVIS, Germany",True,4.0,Horst K. Hahn,4.0,last,https://openalex.org/A2157164624,https://orcid.org/0000-0001-7512-5762,1.0,"Fraunhofer MEVIS, Germany","Fraunhofer MEVIS, Germany",https://openalex.org/I193619901,https://ror.org/02yrs2n53,education,DE,DE,company,education,non-education,fraunhofer mevis germany,non-education,facility,non-education,company
2969,2004,10.1109/VISUAL.2004.113,Visualization of intricate flow structures for vortex breakdown analysis,7.0,6.0,M. Ruetten,https://ieeexplore.ieee.org/author/37088267007,,,7.0,Markus Ruetten,6.0,middle,https://openalex.org/A2941944884,,1.0,DLR Goettingen,DLR Goettingen,,,,,DE,government,,non-education,dlr goettingen,non-education,company,non-education,government
