# Affiliation

In order to create the dictionary of `Affiliations_mapping`, we go over the papers in the training set. For each paper and for each affiliation of this paper (see below for the details of extracting “clean” affiliations), we add the normalized citations (that is, the average number of citations per year) for this paper to the `affiliation_score` for this affiliation


## Setup

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import string
import unicodedata
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import json

UNIVERSITIES = {'UCSD': 'University of California, San Diego', 'USC': 'University of Southern California',
                'UCLA': 'University of California, Los Angeles', 'MIT': 'Massachusetts Institute of Technology', 
                'UC Montreal': 'University of Montreal', 'Caltech': 'California Institute of Technology',
                'UCSB': 'University of California, Santa Barbara', 'NYU': 'New York University',
                'UCI': 'University of California, Irvine', 'JPL': 'Jet Propulsion Laboratory',
                'Georgia Tech': 'Georgia Institute of Technology',
                'CMU': 'Carnegie Mellon University', 'Oxford': 'University of Oxford',
                'Berkeley': 'University of California, Berkeley', 'Stanford': 'Stanford University'
}

NLP = en_core_web_sm.load()

USER_DEFINED_STOP_WORDS = ['Computer Science', 'Electrical Engineering','Faculty', 'Professor', 'Founder', 'CEO', 
                           'Emeritus', 'Postdoctoral Research Fellow', 'EECS', 'Statistics', 'Economics', 'Physics',
                          "Research scientist", "Associate Professor", "Assistant Professor", "Marketing", 
                           "Postdoctoral Researcher", "Psychology", "Biology", "ECE", "CS", "Industrial Engineering", "ISE", 
                           "Research Staff", "Inc", 'Engineering', 'Biomedical', "Signal", "Processing", 'Information',
                          "Data Sciences", "PhD", "Student", "Physicist", "Mechanical", "Consultant", "Organizational Behavior",
                          "Pure Mathematics", "Bioinformatics", 'Department', 'Medical', 'School', 'Biochemistry', 
                           'Technical Staff', 'Genomics',"Statistical Sciences"] 

ENGLISH_STOP = set(nltk.corpus.stopwords.words('english'))

STOP_WORDS = set(list(string.punctuation))
STOP_WORDS.remove(',')

SPECIAL_ORG = ['Google', 'Facebook', 'Apple', 'CNRS', 'Stanford', 'Microsoft', 'IBM', 'NVIDIA', 'Yahoo', 'Adobe', 'Xerox', 'Nokia', 'Samsung', 
              'Simons', 'Berkeley', 'Stanford', 'Princeton', 'Oxford', 'ETH Zurich', 'Uber', 'Lyft', 'OpenAI'] + UNIVERSITIES.keys()

CURRENT_YEAR = 2018

class MyDF(object):
    
    def __init__(self, path, name):
        self.name = name
        self.df = pd.read_csv(path)
        self.df_orig = self.df.copy()
        self.drop_redundant_columns()
        self.max_num_names = len(self.df.columns) -1
        self.make_all_object()
        self.num_non_nan = self.count_non_nan()
        self.list_unique_non_nan = self.list_unique_non_nan()
        self.num_unique_non_nan = len(self.list_unique_non_nan)
    
    def drop_redundant_columns(self):
        self.df = self.df.drop('year', axis=1)
        self.df = self.df.drop('index', axis=1)
        self.df = self.df.drop('citations', axis=1)
        
    def make_all_object(self):
        for i in range(0,self.max_num_names):
            self.df[self.name + '{x}'.format(x=i)] = self.df[self.name + '{x}'.format(x=i)].astype(object)
        
    def list_unique_non_nan(self):
        return pd.concat([self.df[self.name + '{x}'.format(x=i)] for i in range(0,self.max_num_names) ]).dropna().unique().tolist()
    
    def count_non_nan(self):
        return pd.concat([self.df[self.name + '{x}'.format(x=i)] for i in range(0,self.max_num_names) ]).dropna().count()
       
    def making_name_score_df(self):
        df_copy = self.df.copy()
        # keep index as index, stack other columns 'against' it
        stacked = df_copy.set_index('citations_average').stack()
        # set the name of the new series created
        df_temp = stacked.reset_index(name=self.name)
        # drop the 'source' level (key.*)
        df_temp.drop('level_1', axis=1, inplace=True)
        
        unique_names = df_temp[self.name].unique()
        names_score = []
        for name in unique_names:
            names_score.append(df_temp.loc[df_temp[self.name] == name]['citations_average'].sum())
        
        table = [unique_names, names_score]
        output_df = pd.DataFrame(table)
        output_df = output_df.transpose()
        output_df.columns=[self.name, 'Score']
        
        output_dict = dict(zip(unique_names, names_score))
        
        return [output_df, output_dict]
    
    def polish_data(self, target):
        
        df_processed = self.df
        for n in self.list_unique_non_nan:
            score = target.loc[target[self.name] == n]['Score'].sum()
            df_processed = df_processed.replace(n, score)
            
        df_processed['predicted_citations']= df_processed.iloc[:, 2:self.max_num_names+2].sum(axis=1)
        return df_processed
    
    def find_best_ORG(self, aff):
        
        for org in SPECIAL_ORG:
            if org in aff:
                return org
        
        text = unicode(aff,"utf-8")
        doc = NLP(text)
        texts = []
        lengths = []
        for ent in doc.ents:
            if ent.label_ == u"ORG":
                texts.append(ent.text)
                lengths.append(ent.end_char - ent.start_char)
        if len(texts) >0:
            return texts[lengths.index(max(lengths))]
        else:
            return ''
        
    def strip_affiliation(self, x):
        # replace abbreviations
        x = x.replace('Univ.', "University")
        x = x.replace('U.', "University of")
        decoded = x.decode('string_escape')
        text = unicode(decoded, 'utf8', errors='ignore')
        x1 = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore') 
        x1 = x1.strip()
        for name in USER_DEFINED_STOP_WORDS:
            x1 = x1.replace(name, '')
        return x1.strip()
    
    def map_abbr(self, x):
        try:
            y = UNIVERSITIES[x]
            return y
        except KeyError:
            return x
    
    def remove_punctuations(self, x):
        
        x1 = [w for w in x.split() if w not in set(STOP_WORDS)]  # remove stopwords
        if len(x1) > 1:
            if x1[0] in ENGLISH_STOP:
                del x1[0]
        if len(x1) > 1:        
            if x1[-1] in ENGLISH_STOP:
                del x1[-1]
        return ' '.join(x1).strip()                                     # join the list
    
    def polish_affiliations(self):
        
        polished_affiliations = []
        for aff in self.list_unique_non_nan:
            stripped_affiliation = self.strip_affiliation(aff)
            recognized_affiliation = self.find_best_ORG(stripped_affiliation)
            semi_polished_affiliation = self.remove_punctuations(recognized_affiliation)
            polished_affiliation = self.map_abbr(semi_polished_affiliation)
            if polished_affiliation != "":
                polished_affiliations.append(polished_affiliation)
            else:
                polished_affiliations.append(np.nan)
            
        self.df = self.df.replace(self.list_unique_non_nan, polished_affiliations)
        
        self.list_unique_non_nan = list(set(polished_affiliations))
        self.num_unique_non_nan = len(self.list_unique_non_nan)
        

## Extracting clean affiliations: 

The way we obtained the affiliation for each author was using the google scholar of this author (if exists). However, in the google scholar page of authors, usually there exists extra information in the affiliation part and authors describe their position (Faculty, Professor, Assistant Professor, Associate Professor, Researcher, Student, Post-doc, …), the department they are in (Electrical Engineering, Computer Science, …), and finally the name of the organization they are in.

There are total number of 1102 unique affiliations in our training set.

In order to extract the correct affiliation names, what we have done is as follows: for each affiliation string s we obtained from google scholar,

1-	We removed any word in this string that matches the following list of positions or department names:

['Computer Science', 'Electrical Engineering', 'Faculty', 'Professor', 'Founder', 'CEO', 'Emeritus', 'Postdoctoral Research Fellow', 'EECS', 'Statistics', 'Economics', 'Physics', "Research scientist", "Associate Professor", "Assistant Professor", "Marketing", "Postdoctoral Researcher", "Psychology", "Biology", "ECE", "CS", "Industrial Engineering", "ISE", "Research Staff", "Inc", 'Engineering', 'Biomedical', "Signal", "Processing", 'Information', "Data Sciences", "PhD", "Student", "Physicist", "Mechanical", "Consultant", "Organizational Behavior", "Pure Mathematics", "Bioinformatics", 'Department', 'Medical', 'School', 'Biochemistry', 'Technical Staff', 'Genomics']

2-	We replaced “U.“ with “University of” and “Univ.” with “University” in the string s.
3-	Some organizations appeared with different names in our dataset obtained from google scholar. For example, we had Google, Google Brain, Google Deepmind, Google AI, … We created the following list of special organizations: 

['Google', 'Facebook', 'Apple', 'CNRS', 'Stanford', 'Microsoft', 'IBM', 'NVIDIA', 'Yahoo', 'Adobe', 'Xerox', 'Nokia', 'Samsung', 'Simons', 'Berkeley', 'Stanford', 'Princeton', 'Oxford', 'ETH Zurich', 'Uber', 'Lyft', 'OpenAI']

and if any name in this list exists as a word in string s, we consider that word as the organization for string s.

4-	Some organizations appeared with their abbreviations in our dataset obtained from google scholar. For example, we had ‘MIT’ instead of ‘Massachusetts Institute of Technology’, ‘USC’ instead of ‘University of Southern California’, …. We created the following dictionary with universities’ abbreviations at the keys and the full name of these organizations as values:

{'UCSD': 'University of California, San Diego', 'USC': 'University of Southern California', 'UCLA': 'University of California, Los Angeles', 'MIT': 'Massachusetts Institute of Technology',  'UC Montreal': 'University of Montreal', 'Caltech': 'California Institute of Technology', 'UCSB': 'University of California, Santa Barbara', 'NYU': 'New York University', 'UCI': 'University of California, Irvine', 'JPL': 'Jet Propulsion Laboratory', 'Georgia Tech': 'Georgia Institute of Technology', 'CMU': 'Carnegie Mellon University', 'Oxford': 'University of Oxford', 'Berkeley': 'University of California, Berkeley', 'Stanford': 'Stanford University'}
	 
Now if any of the keys of this dictionary exists as a word in string s, we consider the value for that key as the organization for string s.

5-	We used named entity recognition (NER) from library Spacy to extract words with label ORG for string s. If there exist more than one word with label ORG, we choose the longest word. 

In the following, we provide some examples for extracting affiliations from the string we obtained for affiliations of the authors using Google Scholar.

Original string: Professor, Computer Science, University of California Irvine 
Extracted affiliation: University of California Irvine

Original string: Professor Emeritus, School of ECE & Dept. of Statistical Sciences, Cornell University 
Extracted affiliation: Cornell University

Original string: Google Research, NY 
Extracted affiliation: Google

Original string: Professor of Psychology, Co-Director of Princeton Neuroscience Institute, Princeton 
Extracted affiliation: Princeton

Original string: Professor MIT 
Extracted affiliation: Massachusetts Institute of Technology


## Create the dataframe of affiliations and their scores for all affiliations in the training set



In [2]:
df_affiliation_training = MyDF("./data/data_processed/Affiliation_training.csv", "Affiliation")
df_affiliation_training.polish_affiliations()
[df_affiliation_score, dict_affiliation_score] = df_affiliation_training.making_name_score_df()

  mask = arr == x


In [3]:
with open("./data/data_processed/json/Affiliations_score.json", "w") as fp:
    json.dump(dict_affiliation_score , fp)

Below we indicate the top-10 affiliations sorted based on their author_score.

In [4]:
df_affiliation_score.sort_values(['Score'], ascending=[0])[0:10]

Unnamed: 0,Affiliation,Score
7,Google,14553.0
45,Facebook,5950.87
481,OpenAI,4500.97
68,Stanford University,2199.47
10,Massachusetts Institute of Technology,2096.46
89,Microsoft,1817.68
35,"University of California, Berkeley",1530.83
281,NVIDIA,1368.59
39,University of Oxford,1209.49
24,University of Montreal,1200.75


### Correlation between citations and score_mean in Training for Affiliations


In [5]:
train_affiliations_df = df_affiliation_training.polish_data(df_affiliation_score)
train_affiliations_df.citations_average.corr(train_affiliations_df.predicted_citations)

0.1532766022380167

### Save the training data with predicated values

In [6]:
train_affiliations_df.to_csv('./data/data_processed/Affiliation_training_predicted.csv', index=False)

## Create the dataframe of affiliations and their scores for all affiliations in the test set


In [7]:
df_affiliation_test = MyDF("./data/data_processed/Affiliation_test.csv", "Affiliation")
df_affiliation_test.polish_affiliations()
test_affiliations_df = df_affiliation_test.polish_data(df_affiliation_score)

### Correlation between citations and score_mean in Test for Affiliation


In [8]:
test_affiliations_df.citations_average.corr(test_affiliations_df.predicted_citations)

0.12052455206648188

### Save the test data with predicated values

In [9]:
test_affiliations_df.to_csv('./data/data_processed/Affiliation_test_predicted.csv', index=False)