# Author

In order to create the dictionary of `Authors_mapping`, we go over the papers in the training set. For each paper and for each author of this paper, we add the normalized citations (that is, the average number of citations per year) for this paper to the `author_score` for this author. There are total number of 8494  unique authors in our training set.

## Setup

In [8]:
import pandas as pd
import numpy as np
import re
import nltk
import string
import unicodedata
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import json

CURRENT_YEAR = 2018

class MyDF(object):
    
    def __init__(self, path, name):
        self.name = name
        self.df = pd.read_csv(path)
        self.df_orig = self.df.copy()
        self.drop_redundant_columns()
        self.max_num_names = len(self.df.columns) -1
        self.make_all_object()
        self.num_non_nan = self.count_non_nan()
        self.list_unique_non_nan = self.list_unique_non_nan()
        self.num_unique_non_nan = len(self.list_unique_non_nan)
    
    def drop_redundant_columns(self):
        self.df = self.df.drop('year', axis=1)
        self.df = self.df.drop('index', axis=1)
        self.df = self.df.drop('citations', axis=1)
        
    def make_all_object(self):
        for i in range(0,self.max_num_names):
            self.df[self.name + '{x}'.format(x=i)] = self.df[self.name + '{x}'.format(x=i)].astype(object)
        
    def list_unique_non_nan(self):
        return pd.concat([self.df[self.name + '{x}'.format(x=i)] for i in range(0,self.max_num_names) ]).dropna().unique().tolist()
    
    def count_non_nan(self):
        return pd.concat([self.df[self.name + '{x}'.format(x=i)] for i in range(0,self.max_num_names) ]).dropna().count()
       
    def making_name_score_df(self):
        df_copy = self.df.copy()
        # keep index as index, stack other columns 'against' it
        stacked = df_copy.set_index('citations_average').stack()
        # set the name of the new series created
        df_temp = stacked.reset_index(name=self.name)
        # drop the 'source' level (key.*)
        df_temp.drop('level_1', axis=1, inplace=True)
        
        unique_names = df_temp[self.name].unique()
        names_score = []
        for name in unique_names:
            names_score.append(df_temp.loc[df_temp[self.name] == name]['citations_average'].sum())
        
        table = [unique_names, names_score]
        output_df = pd.DataFrame(table)
        output_df = output_df.transpose()
        output_df.columns=[self.name, 'Score']
        
        output_dict = dict(zip(unique_names, names_score))
        
        return [output_df, output_dict]
    
    def polish_data(self, target):
        
        df_processed = self.df
        for n in self.list_unique_non_nan:
            score = target.loc[target[self.name] == n]['Score'].sum()
            df_processed = df_processed.replace(n, score)
            
        df_processed['predicted_citations']= df_processed.iloc[:, 2:self.max_num_names+2].sum(axis=1)
        return df_processed

## Create the dataframe of authors and their scores for all authors in the training set



In [9]:
df_author_training = MyDF("./data/data_processed/Author_training.csv", "Author")
[df_author_score, dict_author_score] = df_author_training.making_name_score_df()

In [10]:
with open("./data/data_processed/json/Authors_score.json", "w") as fp:
    json.dump(dict_author_score , fp)

Below we indicate the top-10 authors sorted based on their author_score.

In [11]:
df_author_score.sort_values(['Score'], ascending=[0])[0:10]

Unnamed: 0,Author,Score
3118,Ilya Sutskever,3996.55
248,Michael I. Jordan,2817.29
1747,David M. Blei,2466.64
1478,Andrew Y. Ng,2464.47
4051,Oriol Vinyals,2109.08
4656,Tomas Mikolov,2103.4
4659,Jeff Dean,2043.4
4657,Kai Chen,2043.4
4658,Greg S. Corrado,2043.4
3447,Xi Chen,1792.88


### Correlation between citations and score_mean in Training for Authors


In [12]:
train_authors_df = df_author_training.polish_data(df_author_score)

train_authors_df.citations_average.corr(train_authors_df.predicted_citations)

0.48684344424383963

### Save the training data with predicated values

In [13]:
train_authors_df.to_csv('./data/data_processed/Author_training_predicted.csv', index=False)

## Create the dataframe of authors and their scores for all authors in the test set


In [14]:
df_author_test = MyDF("./data/data_processed/Author_test.csv", "Author")
test_authors_df = df_author_test.polish_data(df_author_score)

### Correlation between citations and score_mean in Test for Authors


In [15]:
test_authors_df.citations_average.corr(test_authors_df.predicted_citations)

0.3646282301669749

### Save the test data with predicated values

In [16]:
test_authors_df.to_csv('./data/data_processed/Author_test_predicted.csv', index=False)