# Checking the correlation between the Gini index of papers and the Gini index of their citing papers

In [161]:
import re
import pandas as pd
import numpy as np
import pickle
import ethnicity_funcs as ef
import pylab as pl
from disambig_funcs import Gini
from scipy.stats import pearsonr

## Reading in the data files and some pre-processing

In [None]:
# Field prefix of the "<field>Selected*.txt" input tables, and the pickle
# file holding the ethnicity lookup object.
baseField = "MechanicalEngineering"
ethFile = "ethnicitiesMech.pkl"

In [166]:
# Loading the pickled ethnicity lookup and the tab-separated input tables.
# NOTE: pickle.load can execute arbitrary code; only point ethFile at a trusted file.
# The context manager closes the file handle once the lookup is loaded.
with open(ethFile,"rb") as ethHandle:
    ethdict=pickle.load(ethHandle)

# The separator is passed as sep= because recent pandas versions accept only
# the file path positionally. The files have no header row, so column names
# are supplied through names=.
# paper -> author links (only the first two columns of the file are read)
paa=pd.read_csv(baseField+"SelectedPaperAuthorAffiliations.txt",sep='\t',usecols=[0,1],names=['pid','aid'])
# author id -> author name
authors=pd.read_csv(baseField+'SelectedAuthors.txt',sep='\t',names=['aid','name'])
# citing paper (pid) -> referenced paper (refpid)
refs=pd.read_csv(baseField+"SelectedPaperReferences.txt",sep='\t',names=['pid','refpid'])

In [167]:
# Build a table with columns pid, aid, name, eth by joining the
# paper-author links with the author names on the author id.
df = paa.merge(authors, on='aid')

# The ethnicity of each row is derived from the author name by
# ef.name2ethnicity, using the loaded ethdict lookup.
df['eth'] = df['name'].apply(
    lambda author_name: ef.name2ethnicity(author_name, ethdict)
)

In [178]:
# One row per pid, holding the ';'-delimited ethnicities of its authors
pid_eths = (
    df.groupby('pid')['eth']
    .apply(lambda group: ';'.join(group))
    .reset_index()
    .rename(columns={'eth': 'eths'})
)

# Plain dict for fast pid -> 'eth1;eth2;...' lookups
pideths_dict = dict(zip(pid_eths['pid'], pid_eths['eths']))

## Calculating Gini indices for each pid, and for all the papers citing each pid

In [195]:
# Pool the authors of several papers and compute a single Gini over them
def pids2pooledgini(pids):
    """Return the Gini of the ethnicities pooled across the given papers.

    `pids` is a ';'-delimited string of paper ids. Every id must be a key
    of `pideths_dict`, whose values are ';'-delimited ethnicity strings;
    a missing id raises KeyError.
    """
    pooled = []
    for pid in pids.split(';'):
        # Each paper contributes all of its authors' ethnicities to the pool
        pooled.extend(pideths_dict[pid].split(';'))
    return Gini(pooled)

In [151]:
# One row per pid with the Gini computed over its authors' ethnicities
pid_gini = (
    df.groupby('pid')['eth']
    .apply(Gini)
    .reset_index()
    .rename(columns={'eth': 'gini'})
)

# Plain dict for fast pid -> gini lookups
ginidict = dict(zip(pid_gini['pid'], pid_gini['gini']))

In [197]:
# For every referenced paper, collect the ';'-joined ids of the papers that
# cite it. The referenced paper's id is renamed to 'pid' so it can be joined
# with pid_gini at the end of this cell.
refpid_pids = (
    refs.groupby('refpid')['pid']
    .apply(lambda citing: ';'.join(citing))
    .reset_index()
    .rename(columns={'pid': 'citingpids', 'refpid': 'pid'})
)

# citinggini1: mean of the per-paper Gini values over the citing papers
refpid_pids['citinggini1'] = refpid_pids['citingpids'].apply(
    lambda joined: np.mean([ginidict[citing_pid] for citing_pid in joined.split(';')])
)

# citinggini2: one Gini computed over the pooled authors of all citing papers
refpid_pids['citinggini2'] = refpid_pids['citingpids'].apply(pids2pooledgini)

# Attach the Gini of the cited paper itself (column 'gini')
refpid_pids = refpid_pids.merge(pid_gini, on='pid')

In [203]:
# Pearson correlation between a paper's own Gini and the mean Gini of the papers citing it
pearsonr(refpid_pids['gini'], refpid_pids['citinggini1'])

(0.29432677996224271, 0.0)