# Effect of a paper's most common author ethnicity on the ethnicity distribution of its citing authors

In [70]:
import re
import pandas as pd
import numpy as np
import pickle
import ethnicity_funcs as ef
import pylab as pl
from disambig_funcs import Gini
from scipy.stats import pearsonr
from statistics import mode,StatisticsError
from collections import Counter
from random import choice

In [19]:
# Utility functions

# Return mode, or NaN if no unique mode found
def uniqueMode(tmp):
    """Return the single most frequent value in *tmp*, or NaN if there is none.

    NaN is returned when *tmp* is empty or when two or more values tie for
    the highest count.

    ``statistics.mode`` is deliberately not used: since Python 3.8 it returns
    the first of several tied values instead of raising ``StatisticsError``,
    so ties would silently pick an arbitrary winner.

    Args:
        tmp: iterable of hashable values.

    Returns:
        The unique mode, or ``np.nan``.
    """
    top_two = Counter(tmp).most_common(2)
    if not top_two:
        return np.nan
    # A runner-up with the same count as the leader means the mode is not unique
    if len(top_two) == 2 and top_two[0][1] == top_two[1][1]:
        return np.nan
    return top_two[0][0]

## Reading in the data files and some pre-processing

In [3]:
# Prefix shared by the input table file names, and the pickle file that is
# loaded as the ethnicity dictionary
baseField = "MechanicalEngineering"
ethFile = "ethnicitiesMech.pkl"

In [4]:
# Loading the pickled ethnicity dictionary and the tab-separated tables
# NOTE: unpickling can execute arbitrary code; only load an ethFile you trust.
# The context manager closes the file handle once the pickle has been read.
with open(ethFile, "rb") as fh:
    ethdict = pickle.load(fh)

# `sep` is passed by keyword: pandas 2.0 made every read_csv argument after
# the file path keyword-only, so a positional '\t' raises TypeError there.
paa = pd.read_csv(baseField + "SelectedPaperAuthorAffiliations.txt", sep='\t', usecols=[0, 1], names=['pid', 'aid'])
authors = pd.read_csv(baseField + 'SelectedAuthors.txt', sep='\t', names=['aid', 'name'])
refs = pd.read_csv(baseField + "SelectedPaperReferences.txt", sep='\t', names=['pid', 'refpid'])

In [5]:
# Build one row per (paper, author) pair with columns pid, aid, name, eth:
# join the author names onto the paper/author table, then map each name to
# an ethnicity via ef.name2ethnicity and the loaded ethdict
df = paa.merge(authors, on='aid')
df['eth'] = df['name'].apply(lambda author_name: ef.name2ethnicity(author_name, ethdict))

In [6]:
# Collapse every paper's author ethnicities into one ';'-delimited string
eths_by_pid = df.groupby('pid')['eth'].apply(';'.join)
pid_eths = eths_by_pid.reset_index().rename(columns={'eth': 'eths'})

# Lookup dict pid -> ';'-delimited eths; built here so it still contains the
# papers that are dropped below for lacking a unique mode
pideths_dict = dict(zip(pid_eths['pid'], pid_eths['eths']))

# Most common ethnicity for each paper; rows where uniqueMode gives NaN are removed
pid_eths['ethmode'] = pid_eths['eths'].apply(lambda joined: uniqueMode(joined.split(';')))
pid_eths = pid_eths.dropna()

In [35]:
# Baseline ethnicity frequencies: eth -> share of all (paper, author) rows in df
eth_shares = df['eth'].value_counts(normalize=True)
ethpdf = eth_shares.to_dict()

## Calculating Gini indices for pids, and for all the refs of a pid

In [20]:
# For every referenced paper, collect the ';'-delimited ids of the papers citing it.
# The referenced paper's id ends up in column 'pid', its citers in 'citingpids'.
citing_by_ref = refs.groupby('refpid')['pid'].apply(';'.join)
refpid_pids = citing_by_ref.reset_index().rename(columns={'pid': 'citingpids', 'refpid': 'pid'})

# Master list of citing ethnicities for each pid: the eths strings of all its
# citing papers joined with ';'. A citing pid missing from pideths_dict raises KeyError.
refpid_pids['citingeths'] = refpid_pids['citingpids'].apply(
    lambda joined_pids: ';'.join(pideths_dict[citing] for citing in joined_pids.split(';'))
)

In [78]:
# Calculating "boost": for each cited paper, the share of its citing-author
# ethnicities that equals a given ethnicity, divided by that ethnicity's
# baseline frequency in ethpdf.
#   res     -> boost for the paper's own most common ethnicity
#   fakeres -> boost for an ethnicity drawn uniformly at random (control)
soldf = pid_eths.merge(refpid_pids, on="pid")[['pid', 'ethmode', 'citingeths']].set_index('pid')

res = []
fakeres = []
eths = list(ethpdf)
for ethmode, joined in zip(soldf['ethmode'], soldf['citingeths']):
    randeth = choice(eths)

    citingeths = joined.split(';')
    counts = Counter(citingeths)
    total = len(citingeths)

    res.append(counts[ethmode] / total / ethpdf[ethmode])
    fakeres.append(counts[randeth] / total / ethpdf[randeth])

In [79]:
# Mean boost for the papers' own most common ethnicity, then for the random control
for boosts in (res, fakeres):
    print(np.mean(boosts))

3.20536901742
0.98195183148


In [38]:
# Preview the first five rows of the per-paper ethnicity table
pid_eths.head(n=5)

Unnamed: 0,pid,eths,ethmode
0,00003957,"GreaterAfrican,Muslim","GreaterAfrican,Muslim"
1,0001F1E4,"GreaterEuropean,British","GreaterEuropean,British"
2,00032959,"GreaterEuropean,British","GreaterEuropean,British"
3,0005C675,"Asian,GreaterEastAsian,EastAsian","Asian,GreaterEastAsian,EastAsian"
4,00139132,"GreaterEuropean,British","GreaterEuropean,British"


In [32]:
# Preview the first five rows of the cited-paper / citing-ethnicities table
refpid_pids.head(n=5)

Unnamed: 0,pid,citingpids,citingeths
0,00003957,75C78237;777D79D3;1267E545,"GreaterEuropean,EastEuropean;GreaterEuropean,B..."
1,0023450C,78D59D2B,"Asian,GreaterEastAsian,EastAsian;Asian,Greater..."
2,003CBCF1,39C66BE6,"Asian,GreaterEastAsian,EastAsian;Asian,Greater..."
3,00670764,7F81F026;7A57933E;7AB74B84;7CA4E43E;7BA3A3F5;7...,"unknown;Asian,GreaterEastAsian,Japanese;Asian,..."
4,0078A6D8,7A1B98E1,"Asian,IndianSubContinent;Asian,IndianSubContinent"


In [203]:
# Pearson correlation (coefficient, p-value) between the 'gini' and
# 'citinggini1' columns of refpid_pids.
# NOTE(review): no cell visible in this notebook creates 'gini' or
# 'citinggini1' (refpid_pids.head() above shows only pid, citingpids,
# citingeths) -- presumably left over from an earlier session; confirm or
# restore the cells that compute them before re-running.
pearsonr(refpid_pids.gini,refpid_pids.citinggini1)

(0.29432677996224271, 0.0)