# Education variables summary

In [1]:
import pandas as pd
import numpy as np
import weightedcalcs as wc

df = pd.read_csv('/Users/jugalmarfatia/Documents/summer2018/RA Work/ipumsi_00002.csv')

# Function to calculate education variable difference (Table 3)
Below I am defining a function (method) which caculates average for a particular variable for men and women and the difference.

## Variable Name: Description 
			
- ***country***: Name of country 
- ***var_name***: Name of variable for which averages are calculate i.e Literacy rate, University completion rate etc. 
- ***year***: Year of dataset
- ***Outcome for Men***: Average or rate of variable specified for men. 
- ***Outcome for Women***: Average or rate of variable specified for men.
- ***Difference***: Average for Men minus Average for Women

## Details
1. Not in Universe excluded.
2. Missing excluded. 
3. Weighted according to person weight.

In [2]:
def var_diff(df, country, country_name, var_name):
    # Keep observation from country specified
    df = df.loc[df['COUNTRY'] == country].reset_index()
    
    # Drop all null or missing values
    df = df.loc[df[var_name] >=0]
    
    # Special case for educ attainment. Remove 9 which is equal unknown
    if var_name == "EDATTAIN":
        df = df.loc[df[var_name] > 0]
        df = df.loc[df[var_name] < 9]
        df['uni_completed'] = (df[var_name] == 4).astype(int)
        var_name = 'uni_completed'
        
    # Special case for literacy. lit = 2 implies literate.     
    elif var_name == 'LIT':
        df = df.loc[df[var_name] > 0]
        df = df.loc[df[var_name] < 9]
        df['literacy_rate'] = (df[var_name] == 2).astype(int)
        var_name = 'literacy_rate'
    
    # Seprate men and women dataframe
    men = df.loc[df['SEX'] == 1]
    women = df.loc[df['SEX'] == 2]
    
    # Create output table
    d = {'country': [country_name], 'var_name': [var_name]}
    
    # Define weights
    calc = wc.Calculator("PERWT")
    
    # Convert output to pandas object
    final_df = pd.DataFrame(data=d)
    final_df['year']  = df['YEAR'].mean()
    
    #Calcuate the avg/ rate
    final_df['Outcome for Men'] =  round(calc.mean(men, var_name),3)
    final_df['Outcome for Women'] = round(calc.mean(women, var_name), 3)
    final_df['Difference'] = round(calc.mean(men, var_name) - calc.mean(women, var_name) ,3)
    
    return final_df

# Calculate years of schooling

In [3]:
mexico_yrs_school = var_diff(df, 484, 'Mexico' ,'YRSCHOOL')
dr_yrs_school = var_diff(df, 214, 'Dominican Republic' ,'YRSCHOOL')
jamaica_yrs_school = var_diff(df, 388, 'Jamaica' ,'YRSCHOOL')
panama_yrs_school = var_diff(df, 591, 'Panama' ,'YRSCHOOL')
puerto_rico_yrs_school = var_diff(df, 630, 'Puerto Rico' ,'YRSCHOOL')
tnt_yrs_school = var_diff(df, 780, 'Trinadad and Tabago' ,'YRSCHOOL')
uruguay_yrs_school = var_diff(df, 858, 'Uruguay' ,'YRSCHOOL')
venezuela_yrs_school = var_diff(df, 862, 'venezuela' ,'YRSCHOOL')

combined= pd.concat([mexico_yrs_school, dr_yrs_school, jamaica_yrs_school, panama_yrs_school,
                     puerto_rico_yrs_school, tnt_yrs_school, uruguay_yrs_school,
                     venezuela_yrs_school], ignore_index=True)

combined


Unnamed: 0,country,var_name,year,Outcome for Men,Outcome for Women,Difference
0,Mexico,YRSCHOOL,2000.0,19.431,18.73,0.701
1,Dominican Republic,YRSCHOOL,2002.0,12.755,12.651,0.104
2,Jamaica,YRSCHOOL,2001.0,21.121,20.753,0.369
3,Panama,YRSCHOOL,2010.0,16.18,16.121,0.059
4,Puerto Rico,YRSCHOOL,2010.0,13.251,13.322,-0.071
5,Trinadad and Tabago,YRSCHOOL,2000.0,13.226,13.35,-0.123
6,Uruguay,YRSCHOOL,2006.0,6.975,7.468,-0.493
7,venezuela,YRSCHOOL,2001.0,13.318,13.085,0.233


# Calculate bachelors degree completed rate (University completed)

In [4]:
brazil_edu_att = var_diff(df, 76, 'Brazil' ,'EDATTAIN')
india_edu_att = var_diff(df, 356, 'India' ,'EDATTAIN')
mexico_edu_att = var_diff(df, 484, 'Mexico' ,'EDATTAIN')
dr_edu_att = var_diff(df, 214, 'Dominican Republic' ,'EDATTAIN')
jamaica_edu_att = var_diff(df, 388, 'Jamaica' ,'EDATTAIN')
panama_edu_att = var_diff(df, 591, 'Panama' ,'EDATTAIN')
puerto_rico_edu_att = var_diff(df, 630, 'Puerto Rico' ,'EDATTAIN')
tnt_edu_att = var_diff(df, 780, 'Trinadad and Tabago' ,'EDATTAIN')
uruguay_edu_att = var_diff(df, 858, 'Uruguay' ,'EDATTAIN')
venezuela_edu_att = var_diff(df, 862, 'venezuela' ,'EDATTAIN')

combined= pd.concat([brazil_edu_att, india_edu_att, mexico_edu_att, dr_edu_att,
                    jamaica_edu_att, panama_edu_att, puerto_rico_edu_att, tnt_edu_att
                    , uruguay_edu_att, venezuela_edu_att], ignore_index=True)

combined

Unnamed: 0,country,var_name,year,Outcome for Men,Outcome for Women,Difference
0,Brazil,uni_completed,2010.0,0.06,0.08,-0.02
1,India,uni_completed,2004.0,0.048,0.027,0.021
2,Mexico,uni_completed,2000.0,0.063,0.046,0.017
3,Dominican Republic,uni_completed,2002.0,0.043,0.049,-0.007
4,Jamaica,uni_completed,2001.0,0.015,0.017,-0.002
5,Panama,uni_completed,2010.0,0.083,0.118,-0.035
6,Puerto Rico,uni_completed,2010.0,0.126,0.187,-0.062
7,Trinadad and Tabago,uni_completed,2000.0,0.019,0.015,0.004
8,Uruguay,uni_completed,2006.0,0.04,0.042,-0.002
9,venezuela,uni_completed,2001.0,0.001,0.002,-0.0


# Calculate Literacy Rate

In [5]:
brazil_lit = var_diff(df, 76, 'Brazil' ,'LIT')
india_lit = var_diff(df, 356, 'India' ,'LIT')
mexico_lit = var_diff(df, 484, 'Mexico' ,'LIT')
dr_lit = var_diff(df, 214, 'Dominican Republic' ,'LIT')
jamaica_lit = var_diff(df, 388, 'Jamaica' ,'LIT')
panama_lit = var_diff(df, 591, 'Panama' ,'LIT')
puerto_rico_lit = var_diff(df, 630, 'Puerto Rico' ,'LIT')
tnt_lit = var_diff(df, 780, 'Trinadad and Tabago' ,'LIT')
uruguay_lit = var_diff(df, 858, 'Uruguay' ,'LIT')
venezuela_lit = var_diff(df, 862, 'venezuela' ,'LIT')

combined= pd.concat([brazil_lit, india_lit, mexico_lit, dr_lit,
                    jamaica_lit, panama_lit, puerto_rico_lit, tnt_lit
                    , uruguay_lit, venezuela_lit], ignore_index=True)
combined

  return (values * weights).sum() / total_weight


Unnamed: 0,country,var_name,year,Outcome for Men,Outcome for Women,Difference
0,Brazil,literacy_rate,2010.0,0.889,0.9,-0.011
1,India,literacy_rate,2004.0,0.68,0.51,0.169
2,Mexico,literacy_rate,2000.0,0.889,0.864,0.025
3,Dominican Republic,literacy_rate,2002.0,0.777,0.787,-0.01
4,Jamaica,literacy_rate,,,,
5,Panama,literacy_rate,2010.0,0.95,0.94,0.01
6,Puerto Rico,literacy_rate,,,,
7,Trinadad and Tabago,literacy_rate,,,,
8,Uruguay,literacy_rate,2006.0,0.888,0.907,-0.019
9,venezuela,literacy_rate,2001.0,0.861,0.867,-0.006
