## Aim:

In this notebook, I mainly tested the cntry code classification script. 

In [33]:
import sys
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as multi_score
from collections import Counter
from bs4 import BeautifulSoup
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [13]:
CIT_AUTHOR = '../../data/processed/large/openalex_citation_author_df.csv'
REF_AUTHOR = '../../data/processed/openalex_reference_author_df_unique.csv'
# openalex author df for VIS papers:
OA_AUTHOR = '../../data/interim/openalex_author_df.csv'
MERGED_AUTHOR = '../../data/processed/merged_author_df.csv'
MERGED_CNTRY_PREDICTED = 'merged_cntry_predicted.csv'
CNTRY_CLASSIFICATION_REPORT = 'cntry_classification_report.txt'

In [14]:
def get_simple_df(fname):
    """
        - remove nan, 
        - get only two target columns, i.e., raw string and aff type
        - drop duplicates
    """
    raw_string = 'Raw Affiliation String'
    aff_type = 'First Institution Country Code'
    df = pd.read_csv(fname)
    df = df[(df[raw_string].notnull()) & (df[aff_type].notnull())]
    df = df[[raw_string, aff_type]]
    df = df.drop_duplicates()
    return df

def get_df(cit_author, ref_author, oa_author):
    """concatenate, drop_duplicates, reset index, rename columns,
        factorize label_str

    Returns:
        the df used for model training and testing. It contains three columns:
            1. aff, which is pre-processed strings of affiliations
            2. label_str, which is country codes in strings,
            3. label: which is factorized version of country codes
    """

    df = pd.concat(
        [oa_author, ref_author, cit_author], ignore_index = True
        ).drop_duplicates().reset_index(drop=True)
    df.columns = ['aff', 'label_str']
    df = df.assign(label = pd.factorize(df['label_str'])[0])
    return df 

def get_dicts(df):
    """get two dicts; id <--> cntry
    """
    cntry_to_id = dict(zip(df.label_str, df.label))
    id_to_cntry = dict(zip(df.label, df.label_str))
    return cntry_to_id, id_to_cntry

def clean_text(text):
    """
    Takes a string and returns a string
    """
    # remove html tags, lowercase, remove nonsense, remove non-letter
    aff = BeautifulSoup(text, "lxml").text 
    aff = aff.lower()
    aff = re.sub(r'xa0|#n#‡#n#|#tab#|#r#|\[|\]', "", aff)
    aff = re.sub(r'[^a-z]+', ' ', aff)
    return aff

def logist_regression(df):
    '''
    Input: 
        df: df
    Returns:
        logreg: logistic regression model
    '''
    X = df.aff
    y = df.label
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state = 42)
    logreg = Pipeline([('vect', CountVectorizer(stop_words='english', min_df = 5)),
                ('clf', LogisticRegression(max_iter=600)),
               ])
    print('model training now...')
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    target_names = list(set([id_to_cntry[x] for x in y_test]))
    
    f = open(CNTRY_CLASSIFICATION_REPORT,'a')
    f.write('The following is the result for affiliation country code classification' + '\n')
    f.write('accuracy %s' % accuracy_score(y_pred, y_test))
    f.write('\n')
    f.write(classification_report(y_test, y_pred, target_names=target_names))
    f.write('\n')
    f.write('\n')

    return logreg

def get_processed_merged_author(DF, LOGREG):
    '''
    Input: 
        - DF: merged
        - LOGREG
    Returns:
        - DF with cntry classification results
    '''
    # clean text for affs to be predicted
    DF['IEEE Author Affiliation Filled'] = DF[
        'IEEE Author Affiliation Filled'].apply(clean_text)
    pred = LOGREG.predict(DF['IEEE Author Affiliation Filled'])
    results = [id_to_cntry[x] for x in pred]
    DF['country_code_results'] = results
    return DF

In [15]:
# load datasets:
cit_author = get_simple_df(CIT_AUTHOR)
ref_author = get_simple_df(REF_AUTHOR)
oa_author = get_simple_df(OA_AUTHOR)
merged = pd.read_csv(MERGED_AUTHOR)

In [16]:
# get df for model trainig and testing
df = get_df(cit_author, ref_author, oa_author)

In [17]:
# clean affiliation texts 
df['aff'] = df['aff'].apply(clean_text)

In [18]:
df.head(10)

Unnamed: 0,aff,label_str,label
0,computer science department stanford universit...,US,0
1,dept of comput sci maryland univ college park ...,US,0
2,ibm sci center los angeles ca usa,US,0
3,technische univ eindhoven,NL,1
4,los alamos national laboratory and lawrence li...,US,0
5,los alamos national laboratory,US,0
6,lawrence livermore national laboratory,US,0
7,rwth aachen,DE,2
8,comput sci div california univ berkeley ca,US,0
9,school of information management and systems u...,US,0


In [19]:
# get dicts
cntry_to_id, id_to_cntry = get_dicts(df)

In [20]:
# get logreg
logreg = logist_regression(df)

model training now...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
merged_processed = get_processed_merged_author(merged, logreg)

## Compare with OpenAlex
I want to check whether the predicted cntry is the same as the original cntry in openalex

In [26]:
merged_processed_cntry_nonan = merged_processed[merged_processed['First Institution Country Code'].notnull()]

In [27]:
merged_processed_cntry_nonan.head(1)

Unnamed: 0,Year,DOI,Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,IEEE Author ID,IEEE Author Affiliation Updated,IEEE One Affiliation,Number of Authors,...,First Institution ID,First Institution ROR,First Institution Type,First Institution Country Code,First Institution Country Code By Hand,First Institution Type By Hand,Binary Institution Type,Binary Institution Type By Hand,IEEE Author Affiliation Filled,country_code_results
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://ieeexplore.ieee.org/author/37591067400,"Computer Science Department, Stanford Universi...",True,3.0,...,https://openalex.org/I97018004,https://ror.org/00f54p054,education,US,,,education,,computer science department stanford universit...,US


In [29]:
merged_processed_cntry_nonan[
    merged_processed_cntry_nonan[
        'First Institution Country Code'] != 
    merged_processed_cntry_nonan[
        'country_code_results']
].shape

(401, 28)

## Compare with HANDCODED 

In [30]:
merged_processed = merged_processed.assign(country_code_results_updated = 
    np.where(merged_processed['First Institution Country Code By Hand'].notnull(), 
         merged_processed['First Institution Country Code By Hand'],
         merged_processed['country_code_results']
        ))

In [34]:
merged_processed[
    merged_processed['country_code_results'] != merged_processed[
        'country_code_results_updated']]

Unnamed: 0,Year,DOI,Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,IEEE Author ID,IEEE Author Affiliation Updated,IEEE One Affiliation,Number of Authors,Author Name,Author Position,Author Position Type,OpenAlex Author ID,Author ORCID,Number of Affiliations,First Institution Name Updated,Raw Affiliation String Updated,First Institution ID,First Institution ROR,First Institution Type,First Institution Country Code,First Institution Country Code By Hand,First Institution Type By Hand,Binary Institution Type,Binary Institution Type By Hand,IEEE Author Affiliation Filled,country_code_results,country_code_results_updated
1930,1999,10.1109/VISUAL.1999.809884,Exploring geo-scientific data in virtual environments,5.0,1.0,B. Frohlich,https://ieeexplore.ieee.org/author/37431929300,"German National Research Center for Information Technology, GMD, Germany",True,5.0,Bernd Fröhlich,1.0,first,https://openalex.org/A2155478217,,1.0,Center for Information Technology,"German National Research Center for Information Technology, GMD, Germany",https://openalex.org/I29955533,https://ror.org/03jh5a977,facility,US,DE,,non-education,,german national research center for information technology gmd germany,US,DE
1931,1999,10.1109/VISUAL.1999.809884,Exploring geo-scientific data in virtual environments,5.0,2.0,S. Barrass,https://ieeexplore.ieee.org/author/37087768605,"German National Research Center for Information Technology, GMD, Germany",True,5.0,Stephen Barrass,2.0,middle,https://openalex.org/A3180659629,,1.0,Center for Information Technology,"German National Research Center for Information Technology, GMD, Germany",https://openalex.org/I29955533,https://ror.org/03jh5a977,facility,US,DE,,non-education,,german national research center for information technology gmd germany,US,DE
1932,1999,10.1109/VISUAL.1999.809884,Exploring geo-scientific data in virtual environments,5.0,3.0,B. Zehner,https://ieeexplore.ieee.org/author/37087769201,"German National Research Center for Information Technology, GMD, Germany",True,5.0,Björn Zehner,3.0,middle,https://openalex.org/A2199354637,,1.0,Center for Information Technology,"German National Research Center for Information Technology, GMD, Germany",https://openalex.org/I29955533,https://ror.org/03jh5a977,facility,US,DE,,non-education,,german national research center for information technology gmd germany,US,DE
1933,1999,10.1109/VISUAL.1999.809884,Exploring geo-scientific data in virtual environments,5.0,4.0,J. Plate,https://ieeexplore.ieee.org/author/37087771711,"German National Research Center for Information Technology, GMD, Germany",True,5.0,John Plate,4.0,middle,https://openalex.org/A2250361850,,1.0,Center for Information Technology,"German National Research Center for Information Technology, GMD, Germany",https://openalex.org/I29955533,https://ror.org/03jh5a977,facility,US,DE,,non-education,,german national research center for information technology gmd germany,US,DE
1934,1999,10.1109/VISUAL.1999.809884,Exploring geo-scientific data in virtual environments,5.0,5.0,M. Gobel,https://ieeexplore.ieee.org/author/37378253100,"German National Research Center for Information Technology, GMD, Germany",True,5.0,Martin Göbel,5.0,last,https://openalex.org/A2955121751,,1.0,Center for Information Technology,"German National Research Center for Information Technology, GMD, Germany",https://openalex.org/I29955533,https://ror.org/03jh5a977,facility,US,DE,,non-education,,german national research center for information technology gmd germany,US,DE
2969,2004,10.1109/VISUAL.2004.113,Visualization of intricate flow structures for vortex breakdown analysis,7.0,6.0,M. Ruetten,https://ieeexplore.ieee.org/author/37088267007,,,7.0,Markus Ruetten,6.0,middle,https://openalex.org/A2941944884,,1.0,DLR Goettingen,DLR Goettingen,,,,,DE,government,,non-education,dlr goettingen,US,DE
7964,1999,10.1109/VISUAL.1999.809929,An interactive framework for visualizing foreign currency exchange options,4.0,4.0,E.J. Mayland,https://ieeexplore.ieee.org/author/37388974800,UBS Group AG,,4.0,E. J. Mayland,4.0,last,https://openalex.org/A2777810764,,,UBS Group AG,UBS Group AG,,,,,US,company,,non-education,ubs group ag,DE,US
9040,1999,10.1109/VISUAL.1999.809920,DELTA's Virtual Physics Laboratory: a comprehensive learning platform on physics and astronomy,5.0,1.0,S. Chakaveh,https://ieeexplore.ieee.org/author/37088125325,"GMD Forschungszentrum Informationtechnik GmBH, German National Research Centre for Information Technology, IMK-DELTA, Saint Augustine, USA",True,5.0,S. Chakaveh,1.0,first,https://openalex.org/A2704969141,,1.0,Center for Information Technology,"GMD Forschungszentrum Informationtechnik GmBH, German National Research Centre for Information Technology, IMK-DELTA, Saint Augustine, USA",https://openalex.org/I29955533,https://ror.org/03jh5a977,facility,US,DE,,non-education,,gmd forschungszentrum informationtechnik gmbh german national research centre for information technology imk delta saint augustine usa,US,DE
9041,1999,10.1109/VISUAL.1999.809920,DELTA's Virtual Physics Laboratory: a comprehensive learning platform on physics and astronomy,5.0,2.0,U. Zlender,https://ieeexplore.ieee.org/author/37088123100,"GMD Forschungszentrum Informationtechnik GmBH, German National Research Centre for Information Technology, IMK-DELTA, Saint Augustine, USA",True,5.0,U. Zlender,2.0,middle,https://openalex.org/A2345898586,,,,"GMD Forschungszentrum Informationtechnik GmBH, German National Research Centre for Information Technology, IMK-DELTA, Saint Augustine, USA",,,,,DE,,,,gmd forschungszentrum informationtechnik gmbh german national research centre for information technology imk delta saint augustine usa,US,DE
9042,1999,10.1109/VISUAL.1999.809920,DELTA's Virtual Physics Laboratory: a comprehensive learning platform on physics and astronomy,5.0,3.0,D. Skaley,https://ieeexplore.ieee.org/author/37088123679,"GMD Forschungszentrum Informationtechnik GmBH, German National Research Centre for Information Technology, IMK-DELTA, Saint Augustine, USA",True,5.0,Detlef Skaley,3.0,middle,https://openalex.org/A2358689908,,,,"GMD Forschungszentrum Informationtechnik GmBH, German National Research Centre for Information Technology, IMK-DELTA, Saint Augustine, USA",,,,,DE,,,,gmd forschungszentrum informationtechnik gmbh german national research centre for information technology imk delta saint augustine usa,US,DE


## Output

In [101]:
# export merged_processed
cols_to_keep = [
    'Year',
    'DOI',
    'Title',
    'IEEE Number of Authors',
    'IEEE Author Position', 
    'IEEE Author Name',
    'OpenAlex Author ID',
    'IEEE Author Affiliation Filled',
    'country_code_results', 
    ]
col_renamer = {
    'Year':'Year',
    'DOI':'DOI',
    'Title':'Title',
    'IEEE Number of Authors':'Number of Authors',
    'IEEE Author Position':'Author Position', 
    'IEEE Author Name':'Author Name',
    'OpenAlex Author ID':'OpenAlex Author ID',
    'IEEE Author Affiliation Filled':'Affiliation Name',
    'country_code_results':'Affiliation Country Code', 
    }
merged_cntry_predicted = merged_processed[cols_to_keep]
merged_cntry_predicted.rename(columns = col_renamer).to_csv(
    MERGED_CNTRY_PREDICTED, index = False
)

In [102]:
merged_cntry_predicted.head(2)

Unnamed: 0,Year,DOI,Title,IEEE Number of Authors,IEEE Author Position,IEEE Author Name,OpenAlex Author ID,IEEE Author Affiliation Filled,country_code_results
0,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,1.0,Michael Bostock,https://openalex.org/A2048345123,computer science department stanford universit...,US
1,2011,10.1109/TVCG.2011.185,D³ Data-Driven Documents,3.0,2.0,Vadim Ogievetsky,https://openalex.org/A2668634103,computer science department stanford universit...,US
