# Get PI and co-PI gender from first names
Created by Ivan Lima on 2020-03-22 10:44:57 

In this notebook we:

- Replace initials in PI & co-PI names with names found on the Web (by Jennie)
- Extract first names for PIs & co-PIs
- Assign gender to PIs & co-PIs using previously created name-gender data set
- Compute female fraction of co-PIs

In [1]:
import pandas as pd
import numpy as np
import datetime, re
from tqdm import tnrange, notebook
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
# Stamp the notebook with the wall-clock time of this run.
timestamp = datetime.datetime.now().ctime()
print('Last updated on {}'.format(timestamp))

Last updated on Sat Mar 28 18:38:11 2020


## Read NSF-OCE data

In [2]:
# Text columns are read with pandas' nullable 'string' dtype.
string_columns = ['Abstract', 'Title', 'Programs', 'PI', 'Organization', 'State',
                  'Instrument', 'co-PIs', 'program']
data_types = dict.fromkeys(string_columns, 'string')

# The first file column becomes the index; file columns 4 and 5 are parsed as dates.
awards = pd.read_csv(
    'data/awards_1985-2020_clean_edited_grouped.csv',
    index_col=0,
    parse_dates=[4, 5],
    dtype=data_types,
)
awards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11513 entries, 8911427 to 844394
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Abstract             11513 non-null  string        
 1   Title                11513 non-null  string        
 2   Programs             11491 non-null  string        
 3   StartDate            11513 non-null  datetime64[ns]
 4   EndDate              11513 non-null  datetime64[ns]
 5   PI                   11481 non-null  string        
 6   Organization         11513 non-null  string        
 7   State                11364 non-null  string        
 8   Instrument           11513 non-null  string        
 9   n_awards             11513 non-null  int64         
 10  co-PIs               5784 non-null   string        
 11  num_co-PIs           5784 non-null   float64       
 12  total_amount         11513 non-null  float64       
 13  total_ARRA           115

## Read Jennie's name-gender data set

In [3]:
names = pd.read_csv('data/unknown_gender_found.csv', index_col=0)

# Collapse runs of whitespace inside each name to single spaces.
names['name'] = names.name.map(lambda full: ' '.join(full.split()))
# First whitespace-separated token of each name.
names['fname'] = names.name.map(lambda full: full.split()[0])

# Keep only rows whose first token is a single character (an initial) ...
names = names[names.fname.str.len() == 1].reset_index()
# ... and for which a first name was actually found.
names = names[names['First Name'].notnull()]

# Expanded name: the found first name followed by the original name.
names['new_name'] = names['First Name'].str.cat(names['name'], sep=' ')
# names[['name', 'new_name']]

Replace initials with names found on the Web.

In [4]:
# Map each initial-only name to the fuller name found on the Web,
# plus two manual corrections.
new_name_map = dict(zip(names.name, names.new_name))
new_name_map['H Melosh'] = 'H Jay Melosh'
new_name_map['Dijk Peter'] = 'Peter Dijk'

for name, new_name in new_name_map.items():
    # Match the name literally (re.escape) and only as a whole name, so that
    # "J Smith" is not rewritten inside a longer token such as "AJ Smith".
    # The expanded form is listed first in the alternation: an already
    # expanded name then matches itself and is left unchanged, which makes
    # this cell safe to re-run (the expanded name usually contains the
    # original one, so a plain substring replace would prepend the first
    # name again on every run).
    pattern = r'(?<!\w)(?:{}|{})(?!\w)'.format(re.escape(new_name), re.escape(name))
    for column in ['co-PIs', 'PI']:
        # A callable replacement inserts the new name verbatim (no backslash or
        # group-reference processing); missing values are left missing.
        awards[column] = awards[column].str.replace(
            pattern, lambda match, full=new_name: full, regex=True)

## Extract first names for PIs and co-PIs

In [5]:
def get_first_name(fullname):
    """Return the lower-cased first name taken from a full name string.

    The name is split on whitespace. The first token is used, except when
    the name has more than two tokens, the first token is a single
    character (an initial) and the second token is longer than one
    character; in that case the second token is used instead
    (e.g. 'H Jay Melosh' -> 'jay').

    Parameters
    ----------
    fullname : str
        Full name; tokens are separated by whitespace.

    Returns
    -------
    str
        The selected token in lower case, or '' when `fullname` is empty
        or contains only whitespace.
    """
    parts = fullname.split()
    if not parts:
        # Nothing to extract (e.g. an empty entry left by a stray comma in a
        # co-PI list); return '' instead of raising IndexError.
        return ''

    first_name = parts[0]
    # Leading initial followed by a spelled-out name: prefer the spelled-out one.
    if len(parts) > 2 and len(parts[0]) == 1 and len(parts[1]) > 1:
        first_name = parts[1]

    return first_name.lower()
        
# PI first name: only rows that have a PI get a value.
has_pi = awards.PI.notnull()
awards.loc[has_pi, 'PI_first_name'] = awards.PI.dropna().map(get_first_name)

# co-PI first names: each record is a comma-separated list of full names;
# the result is the comma-separated list of the corresponding first names.
has_copis = awards['co-PIs'].notnull()
copi_first_names = []
for rec in awards['co-PIs'].dropna():
    first_names = [get_first_name(full) for full in rec.split(',')]
    copi_first_names.append(','.join(first_names))
awards.loc[has_copis, 'co-PI_first_name'] = copi_first_names

awards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11513 entries, 8911427 to 844394
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Abstract             11513 non-null  string        
 1   Title                11513 non-null  string        
 2   Programs             11491 non-null  string        
 3   StartDate            11513 non-null  datetime64[ns]
 4   EndDate              11513 non-null  datetime64[ns]
 5   PI                   11481 non-null  string        
 6   Organization         11513 non-null  string        
 7   State                11364 non-null  string        
 8   Instrument           11513 non-null  string        
 9   n_awards             11513 non-null  int64         
 10  co-PIs               5784 non-null   string        
 11  num_co-PIs           5784 non-null   float64       
 12  total_amount         11513 non-null  float64       
 13  total_ARRA           115

## Read PI & co-PI name-gender data

In [6]:
pi_copi_names = pd.read_hdf('data/pi_copi_name_gender.h5', 'pi_copi_names')
# First name -> gender lookup built from the stored data set.
name_gender_map = dict(zip(pi_copi_names.name, pi_copi_names.gender))

# Manual assignments; these add to or override entries read from the file.
manual_male = (
    'josh','zachary','joost','dustin','harper','kanchan','blake','clement','gangfeng','kaustubh',
    'chang-sheng','amir','antarpreet','dwight','hyodae','jingfeng','jian','xiaohui','daijiro',
    'jordan','frisbee','hua-wei','tyler','wenyuan','oleg','cole','alexandre','sebastian','viktor',
    'gifford','konstantinos','ossama','alastair','rene','weifu','lin','kaixuan','zunli','guangyu',
    'ugo','kun','maxime','garriet','tommaso','giuseppe','jacob','marty','rusty','ethan','luca',
    'dalton','emmanouil','gi','jud','sourabh','feili','hieu','puspa','leocadio','ramon','bill',
    'jim','zhixiong','winsor','morteza','kuo-chuin',
)
manual_female = (
    'anita','torrance','ginger','rita','lael','juita-elena','ju-chin','aibing','shana','carolina',
    'natalia','cara','lory','randelle','yurena','leigh','ran','brandi','yuko','valentina',
    'georgianna','luciana','hayley','viktoria','libusha','roxanna','eunsoo','meagan','dayanthie',
    'alexandrina','ingunn','an','wooyoung','arielle','jong-mi','shuyi','mercedes','sonia','devon',
    'hollie','feixue','eman',
)

# Male entries are applied first, then female ones, as in the original loops.
name_gender_map.update(dict.fromkeys(manual_male, 'male'))
name_gender_map.update(dict.fromkeys(manual_female, 'female'))

# name_gender_map

## Assign PI & co-PI gender based on first name

In [7]:
# PI gender: first names missing from the map are left as missing values.
awards['PI_gender'] = awards.PI_first_name.map(name_gender_map)

# co-PI gender: one label per comma-separated first name; names missing
# from the map are labelled 'unknown'.
has_copi_names = awards['co-PI_first_name'].notnull()
copi_genders = []
for first_names in awards.loc[has_copi_names, 'co-PI_first_name']:
    labels = [name_gender_map.get(first, 'unknown') for first in first_names.split(',')]
    copi_genders.append(','.join(labels))
awards.loc[has_copi_names, 'co-PI_gender'] = copi_genders

In [8]:
# unknown = awards.loc[awards['co-PI_gender'].str.contains('unknown').fillna(False),
#                      ['co-PIs', 'co-PI_first_name', 'co-PI_gender']]
# unknown

## Compute female fraction of co-PIs

In [9]:
def gender_frac(genstr, gender='female'):
    """Return the fraction of known-gender entries that equal `gender`.

    Parameters
    ----------
    genstr : str
        Comma-separated gender labels, e.g. 'female,male,unknown'.
    gender : str, default 'female'
        Label whose share is computed.

    Returns
    -------
    float
        Number of entries equal to `gender` divided by the number of
        entries that are not 'unknown'. NaN when every entry is 'unknown',
        since the fraction is then undefined.
    """
    # 'unknown' entries are excluded from both numerator and denominator.
    known = [label for label in genstr.split(',') if label != 'unknown']
    if not known:
        # Return NaN explicitly rather than dividing 0 by 0, which gave the
        # same value but emitted a RuntimeWarning.
        return np.nan
    # list.count returns 0 for a label that never occurs, so no KeyError.
    return known.count(gender) / len(known)

# Female fraction of co-PIs, for rows that have co-PI gender labels.
has_copi_gender = awards['co-PI_gender'].notnull()
female_fracs = list(map(gender_frac, awards.loc[has_copi_gender, 'co-PI_gender']))
awards.loc[has_copi_gender, 'co-PI_female_frac'] = female_fracs

  import sys


## Save data set to CSV file

In [10]:
# Write the augmented awards table (UTF-8 with a byte-order mark) and show its summary.
output_csv = 'data/awards_1985-2020_clean_edited_grouped_gender.csv'
awards.to_csv(output_csv, encoding='utf-8-sig')
awards.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11513 entries, 8911427 to 844394
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Abstract             11513 non-null  string        
 1   Title                11513 non-null  string        
 2   Programs             11491 non-null  string        
 3   StartDate            11513 non-null  datetime64[ns]
 4   EndDate              11513 non-null  datetime64[ns]
 5   PI                   11481 non-null  string        
 6   Organization         11513 non-null  string        
 7   State                11364 non-null  string        
 8   Instrument           11513 non-null  string        
 9   n_awards             11513 non-null  int64         
 10  co-PIs               5784 non-null   string        
 11  num_co-PIs           5784 non-null   float64       
 12  total_amount         11513 non-null  float64       
 13  total_ARRA           115

In [11]:
# len(awards.loc[awards.Abstract.str.contains('OCE\s*-'),'Abstract'])
# for i, s in awards.loc[awards.Abstract.str.contains('OCE\s*\d{2,}'),'Abstract'].iteritems():
#     print('{}\n{}\n'.format(i,s))