## Aim

This notebook checks whether every coder has coded the race variable. I need to make sure that no ICA author is omitted because no race was coded for them.

### Result

It turns out that every author has a race prediction, so no authors were dropped for lack of one.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
# Lower-case coder names; they match the stems of the CSV files read from data_dir.
names = [
    'haley',
    'jaemin',
    'matthew',
    'michelle',
    'jongmin',
    'jeff',
]
# Folder that holds one gender/race result CSV per coder.
data_dir = '../../data/interim/gender_race_result'

In [3]:
def _read_coder_csv(coder):
    """Read `<data_dir>/<coder>.csv` and tag every row with the capitalized coder name.

    Parameters
    ----------
    coder : str
        Lower-case coder name; also the stem of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The file contents plus a 'Coder' column (e.g. 'haley' -> 'Haley').
    """
    coded = pd.read_csv(f'{data_dir}/{coder}.csv')
    coded['Coder'] = coder.capitalize()
    return coded


haley = _read_coder_csv('haley')
matthew = _read_coder_csv('matthew')
michelle = _read_coder_csv('michelle')
jeff = _read_coder_csv('jeff')
jongmin = _read_coder_csv('jongmin')
jaemin = _read_coder_csv('jaemin')

# Jaemin's file contains a gender code with a stray leading space (' M').
# Strip surrounding whitespace from every code instead of patching that single value.
jaemin['gender_prediction'] = jaemin['gender_prediction'].str.strip()

In [4]:
# Look up Jongmin's coding for one specific author entry.
target_author_id = '10.1111/j.1460-2466.1962.tb01529.x+2.0'
jongmin.loc[jongmin['authorID'] == target_author_id]

Unnamed: 0.1,Unnamed: 0,authorID,doi,url,year,title,journal,numberOfAuthors,authorPosition,authorFullName,...,affiliation.2,ROR_AFFNAME,matchMethod,ROR_ID,type,gender_prediction,genderpred_api,race_prediction,racepred_api,Coder
1842,4761,10.1111/j.1460-2466.1962.tb01529.x+2.0,10.1111/j.1460-2466.1962.tb01529.x,https://academic.oup.com/joc/article/12/2/90/4...,1962,An Attempt to Quantify the “Abstraction Ladder”,Journal of Communication,3,2,W. W. Lewis,...,2 Dr. Lewis is temporarily Chief Assessment Br...,The New Teacher Project,API_QUERY,https://ror.org/04p9zhq85,R,M,,0,,Jongmin


In [5]:
# Distinct gender codes in Matthew's file (shows the mix of upper- and lower-case letters).
set(matthew['gender_prediction'])

{'F', 'M', 'f', 'm', 'n', nan}

In [6]:
# Lower-case gender codes found in Matthew's file, keyed to their upper-case equivalents.
matthew_upperclass_dict = {code: code.upper() for code in ('m', 'n', 'f')}

In [7]:
# Reference: https://stackoverflow.com/a/68046167
# Recode Matthew's lower-case gender codes to upper case.
# Series.replace with a dict matches whole cell values, so a longer string that merely
# contains 'm', 'n' or 'f' is never altered (substring replacement would rewrite it),
# and NaN entries pass through unchanged.
matthew['gender_prediction'] = matthew['gender_prediction'].replace(matthew_upperclass_dict)

In [8]:
# Distinct gender codes used by Jaemin and by Jongmin.
set(jaemin['gender_prediction']), set(jongmin['gender_prediction'])

({'F', 'M', 'N'}, {'F', 'M'})

In [9]:
# Distinct gender codes used by Jeff and by Matthew (after the upper-case recode).
set(jeff['gender_prediction']), set(matthew['gender_prediction'])

({'F', 'M'}, {'F', 'M', 'N', nan})

## Deal with Haley and Michelle data

### Gender

In [10]:
# Haley: rows where genderpred_api is missing keep the hand-coded gender_prediction;
# every other row takes the genderize value, recoded to the F/M letters.
haley_keeps_manual_gender = haley['genderpred_api'].isna()
haley_genderpred = np.where(
    haley_keeps_manual_gender, haley['gender_prediction'], haley['genderize'])
for api_label, letter in (('female', 'F'), ('male', 'M')):
    haley_genderpred[haley_genderpred == api_label] = letter
set(haley_genderpred)

{'F', 'M', 'N', nan}

In [11]:
# Michelle: rows where genderpred_api is missing keep the hand-coded gender_prediction;
# every other row takes the genderize value, recoded to the F/M letters.
michelle_keeps_manual_gender = michelle['genderpred_api'].isna()
michelle_genderpred = np.where(
    michelle_keeps_manual_gender, michelle['gender_prediction'], michelle['genderize'])
for api_label, letter in (('female', 'F'), ('male', 'M')):
    michelle_genderpred[michelle_genderpred == api_label] = letter
set(michelle_genderpred)

{'F', 'M', 'N', nan}

In [12]:
# Two entries are NaN: Haley said to rely on genderize for them, and genderize gave NaN.
# NaN is unequal to itself, so comparing the array with itself flags exactly those entries.
haley_gender_is_nan = haley_genderpred != haley_genderpred
np.argwhere(haley_gender_is_nan)

array([[1594],
       [1657]])

In [13]:
# Uncomment to inspect the full row behind one of the NaN positions reported above.
# haley.iloc[1657, :]

In [14]:
# Two entries are NaN: Michelle said to rely on genderize for them, and genderize gave NaN.
# NaN is unequal to itself, so comparing the array with itself flags exactly those entries.
michelle_gender_is_nan = michelle_genderpred != michelle_genderpred
np.argwhere(michelle_gender_is_nan)

array([[1586],
       [1606]])

In [15]:
# Uncomment to inspect the full row behind one of the NaN positions reported above.
# michelle.iloc[1606, :]

### Race

In [16]:
# Distinct text labels in Haley's `race` column.
set(haley['race'])

{'api', 'black', 'hispanic', 'white'}

In [17]:
# Haley: rows where racepred_api is missing keep the hand-coded race_prediction;
# every other row takes the text label from the `race` column, recoded to a number.
# NOTE(review): set(haley.race) also contains 'black', which has no numeric code here —
# confirm its code and add it to the mapping. Until then the assert below fails loudly
# if such a label is ever selected, instead of leaving a string in race_prediction.
haley_race_codes = {'api': 2, 'hispanic': 3, 'white': 0}
haley_racepred = np.where(
    haley.racepred_api.isnull(), haley["race_prediction"], haley.race)
for race_label, race_code in haley_race_codes.items():
    haley_racepred[haley_racepred == race_label] = race_code
haley_unmapped_labels = {value for value in haley_racepred if isinstance(value, str)}
assert not haley_unmapped_labels, (
    f'race labels without a numeric code: {sorted(haley_unmapped_labels)}')
set(haley_racepred)

{0.0, 1.0, 2.0, 3.0, 4.0, 5.0}

In [18]:
# Michelle: rows where racepred_api is missing keep the hand-coded race_prediction;
# every other row takes the text label from the `race` column, recoded to a number.
# NOTE(review): only 'api', 'hispanic' and 'white' have numeric codes here — confirm
# whether other labels (e.g. 'black') can occur and which code they need. Until then the
# assert below fails loudly if an unmapped label is selected, instead of leaving a
# string in race_prediction.
michelle_race_codes = {'api': 2, 'hispanic': 3, 'white': 0}
michelle_racepred = np.where(
    michelle.racepred_api.isnull(), michelle["race_prediction"], michelle.race)
for race_label, race_code in michelle_race_codes.items():
    michelle_racepred[michelle_racepred == race_label] = race_code
michelle_unmapped_labels = {value for value in michelle_racepred if isinstance(value, str)}
assert not michelle_unmapped_labels, (
    f'race labels without a numeric code: {sorted(michelle_unmapped_labels)}')
set(michelle_racepred)

{0.0, 1.0, 2.0, 3.0, 4.0, 5.0}

### Update data

In [21]:
# Write the merged gender and race codes back into Haley's and Michelle's tables.
merged_codes = (
    (haley, haley_genderpred, haley_racepred),
    (michelle, michelle_genderpred, michelle_racepred),
)
for coder_table, merged_gender, merged_race in merged_codes:
    coder_table['gender_prediction'] = merged_gender
    coder_table['race_prediction'] = merged_race

## Matthew

In [22]:
# Tally of Matthew's gender codes (NaN included).
Counter(matthew['gender_prediction'])

Counter({'M': 1177, 'F': 703, nan: 3, 'N': 1})

In [23]:
# Tally of Matthew's race codes.
Counter(matthew['race_prediction'])

Counter({0: 1508, 2: 232, 3: 73, 4: 35, 1: 34, 5: 2})

## Jaemin

In [24]:
# Tally of Jaemin's gender codes.
Counter(jaemin['gender_prediction'])

Counter({'M': 1110, 'F': 773, 'N': 1})

In [25]:
# Tally of Jaemin's race codes.
Counter(jaemin['race_prediction'])

Counter({0: 1541, 2: 240, 3: 57, 1: 30, 4: 15, 5: 1})

## Jongmin

In [26]:
# Tally of Jongmin's gender codes.
Counter(jongmin['gender_prediction'])

Counter({'M': 1135, 'F': 749})

In [27]:
# Tally of Jongmin's race codes.
Counter(jongmin['race_prediction'])

Counter({0: 1552, 2: 212, 3: 61, 1: 32, 4: 24, 5: 3})

## Jeff

In [28]:
# Tally of Jeff's gender codes.
Counter(jeff['gender_prediction'])

Counter({'M': 1184, 'F': 700})

In [29]:
# NOTE(review): the race tally for Jeff is commented out, unlike the other coders — confirm whether it should run.
# Counter(jeff.race_prediction)

## Organize data

In [30]:
# Trim Haley's table to the columns kept for every coder, then list the codes in use.
haley_kept_columns = [
    'Coder', 'firstName', 'genderize', 'authorID', 'doi',
    'gender_prediction', 'race_prediction',
]
haley = haley.loc[:, haley_kept_columns]
set(haley['gender_prediction']), set(haley['race_prediction'])

({'F', 'M', 'N', nan}, {0.0, 1.0, 2.0, 3.0, 4.0, 5.0})

In [31]:
# Trim Matthew's table to the columns kept for every coder, then list the codes in use.
matthew_kept_columns = [
    'Coder', 'firstName', 'genderize', 'authorID', 'doi',
    'gender_prediction', 'race_prediction',
]
matthew = matthew.loc[:, matthew_kept_columns]
set(matthew['gender_prediction']), set(matthew['race_prediction'])

({'F', 'M', 'N', nan}, {0, 1, 2, 3, 4, 5})

In [32]:
# Trim Michelle's table to the columns kept for every coder, then list the codes in use.
michelle_kept_columns = [
    'Coder', 'firstName', 'genderize', 'authorID', 'doi',
    'gender_prediction', 'race_prediction',
]
michelle = michelle.loc[:, michelle_kept_columns]
set(michelle['gender_prediction']), set(michelle['race_prediction'])

({'F', 'M', 'N', nan}, {0.0, 1.0, 2.0, 3.0, 4.0, 5.0})

In [33]:
# Trim Jaemin's table to the columns kept for every coder, then list the codes in use.
jaemin_kept_columns = [
    'Coder', 'firstName', 'genderize', 'authorID', 'doi',
    'gender_prediction', 'race_prediction',
]
jaemin = jaemin.loc[:, jaemin_kept_columns]
set(jaemin['gender_prediction']), set(jaemin['race_prediction'])

({'F', 'M', 'N'}, {0, 1, 2, 3, 4, 5})

In [34]:
# Trim Jeff's table to the columns kept for every coder, then list the gender codes in use.
jeff_kept_columns = [
    'Coder', 'firstName', 'genderize', 'authorID', 'doi',
    'gender_prediction', 'race_prediction',
]
jeff = jeff.loc[:, jeff_kept_columns]
set(jeff['gender_prediction'])

{'F', 'M'}

In [35]:
# Trim Jongmin's table to the columns kept for every coder, then list the codes in use.
jongmin_kept_columns = [
    'Coder', 'firstName', 'genderize', 'authorID', 'doi',
    'gender_prediction', 'race_prediction',
]
jongmin = jongmin.loc[:, jongmin_kept_columns]
set(jongmin['gender_prediction']), set(jongmin['race_prediction'])

({'F', 'M'}, {0, 1, 2, 3, 4, 5})

In [36]:
# Stack the six coders' tables into one long frame with a fresh 0..n-1 index.
coder_tables = [haley, michelle, matthew, jaemin, jeff, jongmin]
df = pd.concat(coder_tables, ignore_index=True)
# Fixed seed so the preview shows the same ten rows on every Restart & Run All.
df.sample(10, random_state=0)

Unnamed: 0,Coder,firstName,genderize,authorID,doi,gender_prediction,race_prediction
8286,Jeff,Paul,male,10.1111/j.1468-2958.2009.01357.x+1.0,10.1111/j.1468-2958.2009.01357.x,M,0.0
5271,Matthew,Mark,male,10.1111/j.1468-2958.1985.tb00078.x+1.0,10.1111/j.1468-2958.1985.tb00078.x,M,0.0
3348,Michelle,Frances,female,10.1111/j.1468-2958.1986.tb00087.x+3.0,10.1111/j.1468-2958.1986.tb00087.x,M,0.0
857,Haley,Jennifer,female,10.1111/j.1083-6101.2007.00331.x+2.0,10.1111/j.1083-6101.2007.00331.x,F,0.0
8982,Jeff,Barbara,female,10.1111/j.1468-2958.1987.tb00119.x+1.0,10.1111/j.1468-2958.1987.tb00119.x,F,0.0
2482,Michelle,Marion,female,10.1111/j.1753-9137.2012.01123.x+2.0,10.1111/j.1753-9137.2012.01123.x,F,0.0
5478,Matthew,Margaret,female,10.1111/j.1468-2958.1977.tb00515.x+1.0,10.1111/j.1468-2958.1977.tb00515.x,F,0.0
1840,Haley,Chables,male,10.1111/j.1460-2466.1964.tb02927.x+2.0,10.1111/j.1460-2466.1964.tb02927.x,M,0.0
9795,Jongmin,Leihan,,10.1111/jcc4.12145+1.0,10.1111/jcc4.12145,M,2.0
1412,Haley,Janet,female,10.1111/j.1468-2958.1988.tb00158.x+1.0,10.1111/j.1468-2958.1988.tb00158.x,F,0.0


In [37]:
# Rows whose gender code is still missing (without considering initial_df).
gender_is_missing = df['gender_prediction'].isna()
first_try_nan = df.loc[gender_is_missing]
len(first_try_nan)

7

In [37]:
# authorIDs of the rows that have no gender code.
first_try_nans = first_try_nan['authorID'].tolist()

In [40]:
# Count the rows that ended up without a race code.
race_is_missing = df['race_prediction'].isna()
race_nans = df.loc[race_is_missing]
len(race_nans)

0