In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

In [2]:
# Load the four input tables in one pass: health indicators, household-to-cohort
# assignments, household details (source of the `multiplier` column used below)
# and per-basket consumption values.
df, cohorts, hh, cons = map(pd.read_parquet, (
    'data/health-indicators.parquet',
    'data/hh-cohorts.parquet',
    'data/hh_details.parquet',
    'data/baskets-consumption-value.parquet',
))

In [3]:
# Drop rows whose `is_hospitalization` code is 4 (the meaning of that code is
# not shown in this notebook -- TODO document it), then attach the multiplier
# and cohort id. Both new columns are aligned on the frame's index, so any
# label missing from `hh` / `cohorts` ends up as NaN.
df = (
    df[df['is_hospitalization'].ne(4)]
    .assign(mult=hh['multiplier'], cohort_id=cohorts['cohort_id'])
)

In [4]:
# Pull the "HOSP" basket for exactly the rows kept in `df`; the label lookup
# raises a KeyError if any of those index labels is absent from `cons`.
df['hosp_spend'] = cons["HOSP"].loc[df.index]

In [5]:
# Preview the first rows after the multiplier, cohort id and hospital spend were attached.
df.head()

Unnamed: 0_level_0,is_hhmem_pmjay,is_hospitalization,is_benefit_healthscheme,mult,cohort_id,hosp_spend
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22300304,2,2,2.0,28599,21773,3250.0
22300308,2,2,2.0,28599,21722,2650.0
22300313,2,2,2.0,28599,21728,3916.666667
22301201,2,3,2.0,20463,21732,500.0
22301310,2,1,2.0,16099,21782,4.166667


In [6]:
# Rows per (cohort, scheme code), pivoted to one column per code. dropna()
# removes cohorts that lack any of the codes; the final filter keeps cohorts
# where every code has more than one row.
# NOTE(review): value_counts() ignores missing scheme codes, while the cohort
# comparison treats every row with code != 1 (missing included) as control --
# confirm the two definitions are meant to differ.
counts = (
    df.groupby('cohort_id')['is_benefit_healthscheme']
    .value_counts()
    .unstack()
    .dropna()
    .loc[lambda per_cohort: per_cohort.min(axis=1) > 1]
)

In [7]:
def compare_within_cohort(xdf, column='hosp_spend', alternative='less'):
    """Compare weighted spending of scheme beneficiaries with the rest of one cohort.

    Rows with ``is_benefit_healthscheme == 1`` form the treatment group; every
    other row (any other code, missing values included) forms the control group.

    Parameters
    ----------
    xdf : pandas.DataFrame
        Rows of a single cohort. Must contain ``is_benefit_healthscheme``,
        ``mult`` and the column named by ``column``.
    column : str, default 'hosp_spend'
        Spending column to compare.
    alternative : str, default 'less'
        Alternative hypothesis forwarded to ``scipy.stats.ttest_ind``
        ('less': treatment mean below control mean).

    Returns
    -------
    pandas.Series
        ``pval``: Welch t-test p-value computed on ``column * mult``;
        ``diff``: ``mult``-weighted mean of treatment minus that of control;
        ``tsize`` / ``csize``: number of treatment / control rows.

    Notes
    -----
    NOTE(review): the t-test sees row-wise ``column * mult`` products, whereas
    ``diff`` is a difference of weighted means, so the two can disagree when
    the multipliers differ between groups -- confirm this is intended.
    """
    is_treated = xdf['is_benefit_healthscheme'] == 1
    treatment = xdf.loc[is_treated, ['mult', column]]
    control = xdf.loc[~is_treated, ['mult', column]]

    # Row-wise weighted values; these are the samples handed to the t-test.
    tspend = treatment[column] * treatment['mult']
    cspend = control[column] * control['mult']
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    pval = ttest_ind(tspend, cspend, equal_var=False, alternative=alternative).pvalue
    # Weighted mean per group: sum(value * weight) / sum(weight).
    diff = tspend.sum() / treatment['mult'].sum() - cspend.sum() / control['mult'].sum()
    return pd.Series({'pval': pval, 'diff': diff,
                      'tsize': len(treatment), 'csize': len(control)})
    
# Restrict to cohorts that passed the group-size filter captured in `counts`.
xdf = df[df['cohort_id'].isin(counts.index)]
# Select the columns the comparison reads before applying, so apply() never
# operates on the grouping column (that implicit behaviour is deprecated in
# recent pandas and emits a warning on every call).
sig = (
    xdf.groupby('cohort_id')[['is_benefit_healthscheme', 'mult', 'hosp_spend']]
    .apply(compare_within_cohort)
)

  res = hypotest_fun_out(*samples, **kwds)
  sig = xdf.groupby('cohort_id').apply(compare_within_cohort)


In [10]:
# Append each cohort's descriptive columns to its test results. The inner
# verify_integrity makes set_index raise if a cohort id maps to more than one
# distinct row; the outer one makes concat raise on duplicate column labels
# (which also stops this cell from being silently applied twice).
sig = pd.concat(
    [
        sig,
        cohorts.drop_duplicates()
        .set_index('cohort_id', verify_integrity=True)
        .loc[sig.index],
    ],
    axis=1,
    verify_integrity=True,
)

In [12]:
# Dump the table to a scratch CSV; to_csv writes the row labels by default.
# NOTE(review): "/tmp" is an absolute, POSIX-only location -- consider a
# configurable output directory.
sig.to_csv("/tmp/sig.csv")

In [13]:
# Preview the per-cohort results together with the attached cohort attributes.
sig.head()

Unnamed: 0_level_0,pval,diff,tsize,csize,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
21,0.253369,-551.545699,2.0,2.0,rural,11,regular wage/salary earning,Hinduism,nuclear,True,False,scheduled caste (SC)
96,0.339139,-13.080173,3.0,2.0,rural,12,casual labour,Islam,nuclear,True,False,scheduled tribe (ST)
106,0.66736,378.132775,3.0,3.0,rural,12,casual labour,Islam,large,True,True,scheduled tribe (ST)
263,0.995152,921.66748,8.0,6.0,rural,13,self-employment,Islam,nuclear,False,False,others
268,0.610804,551.703394,4.0,5.0,rural,13,self-employment,Islam,nuclear,True,False,others


In [17]:
# Persist whatever `sig` currently holds (presumably the hospitalization-spend
# comparison from the cells above -- execution counts skip here, so verify).
sig.to_parquet('data/summaries/01.parquet')

In [18]:
# Pull the "NOHOSP" basket for exactly the rows kept in `df`; the label lookup
# raises a KeyError if any of those index labels is absent from `cons`.
df['nohosp_spend'] = cons["NOHOSP"].loc[df.index]

In [19]:
def compare_within_cohort(xdf, column='nohosp_spend', alternative='less'):
    """Compare weighted spending of scheme beneficiaries with the rest of one cohort.

    Rows with ``is_benefit_healthscheme == 1`` form the treatment group; every
    other row (any other code, missing values included) forms the control group.

    Parameters
    ----------
    xdf : pandas.DataFrame
        Rows of a single cohort. Must contain ``is_benefit_healthscheme``,
        ``mult`` and the column named by ``column``.
    column : str, default 'nohosp_spend'
        Spending column to compare.
    alternative : str, default 'less'
        Alternative hypothesis forwarded to ``scipy.stats.ttest_ind``
        ('less': treatment mean below control mean).

    Returns
    -------
    pandas.Series
        ``pval``: Welch t-test p-value computed on ``column * mult``;
        ``diff``: ``mult``-weighted mean of treatment minus that of control;
        ``tsize`` / ``csize``: number of treatment / control rows.

    Notes
    -----
    NOTE(review): the t-test sees row-wise ``column * mult`` products, whereas
    ``diff`` is a difference of weighted means, so the two can disagree when
    the multipliers differ between groups -- confirm this is intended.
    """
    is_treated = xdf['is_benefit_healthscheme'] == 1
    treatment = xdf.loc[is_treated, ['mult', column]]
    control = xdf.loc[~is_treated, ['mult', column]]

    # Row-wise weighted values; these are the samples handed to the t-test.
    tspend = treatment[column] * treatment['mult']
    cspend = control[column] * control['mult']
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    pval = ttest_ind(tspend, cspend, equal_var=False, alternative=alternative).pvalue
    # Weighted mean per group: sum(value * weight) / sum(weight).
    diff = tspend.sum() / treatment['mult'].sum() - cspend.sum() / control['mult'].sum()
    return pd.Series({'pval': pval, 'diff': diff,
                      'tsize': len(treatment), 'csize': len(control)})
    
# Restrict to cohorts that passed the group-size filter captured in `counts`.
xdf = df[df['cohort_id'].isin(counts.index)]
# Select the columns the comparison reads before applying, so apply() never
# operates on the grouping column (that implicit behaviour is deprecated in
# recent pandas and emits a warning on every call).
sig = (
    xdf.groupby('cohort_id')[['is_benefit_healthscheme', 'mult', 'nohosp_spend']]
    .apply(compare_within_cohort)
)

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  sig = xdf.groupby('cohort_id').apply(compare_within_cohort)


In [21]:
# Append each cohort's descriptive columns to its test results. The inner
# verify_integrity makes set_index raise if a cohort id maps to more than one
# distinct row; the outer one makes concat raise on duplicate column labels.
sig = pd.concat(
    [
        sig,
        cohorts.drop_duplicates()
        .set_index('cohort_id', verify_integrity=True)
        .loc[sig.index],
    ],
    axis=1,
    verify_integrity=True,
)

In [25]:
# Cohorts with p-value below 0.05, most negative difference first.
# NOTE(review): the p-values are not adjusted for testing many cohorts at once.
sig.loc[sig['pval'].lt(0.05)].sort_values(by='diff')

Unnamed: 0_level_0,pval,diff,tsize,csize,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
19927,0.03754477,-312.487329,2.0,8.0,urban,281,,Hinduism,nuclear,False,True,other backward class (OBC)
10037,3.471111e-05,-300.857498,4.0,14.0,rural,321,self-employment,Islam,large,True,True,other backward class (OBC)
863,0.009783334,-283.791214,2.0,16.0,rural,32,self-employment,Sikhism,large,True,True,others
10174,0.01626952,-220.884439,10.0,15.0,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC)
12071,0.01630427,-208.922818,2.0,7.0,urban,51,regular wage/salary earning,Hinduism,nuclear,False,False,others
6120,0.006300324,-183.336007,2.0,14.0,rural,211,self-employment,Hinduism,nuclear,False,False,other backward class (OBC)
16151,0.009538115,-176.652177,2.0,16.0,urban,194,regular wage/salary earning,Hinduism,nuclear,False,True,others
21684,0.006077079,-171.042336,4.0,31.0,urban,334,,Hinduism,nuclear,False,True,other backward class (OBC)
2033,0.03628985,-166.600444,3.0,5.0,rural,85,self-employment,Hinduism,nuclear,True,False,other backward class (OBC)
6266,0.009735422,-166.071118,3.0,12.0,rural,212,self-employment,Hinduism,nuclear,False,False,other backward class (OBC)


In [26]:
# Persist whatever `sig` currently holds (presumably the non-hospitalization
# spend comparison computed in the cells above -- verify before relying on it).
sig.to_parquet('data/summaries/02.parquet')

In [27]:
# Food: is food spending higher for scheme beneficiaries than for the rest of their cohort?

In [29]:
# Pull the "FOOD" basket for exactly the rows kept in `df`; the label lookup
# raises a KeyError if any of those index labels is absent from `cons`.
df['FOOD'] = cons["FOOD"].loc[df.index]

In [30]:
def compare_within_cohort(xdf, column='FOOD', alternative='greater'):
    """Compare weighted spending of scheme beneficiaries with the rest of one cohort.

    Rows with ``is_benefit_healthscheme == 1`` form the treatment group; every
    other row (any other code, missing values included) forms the control group.

    Parameters
    ----------
    xdf : pandas.DataFrame
        Rows of a single cohort. Must contain ``is_benefit_healthscheme``,
        ``mult`` and the column named by ``column``.
    column : str, default 'FOOD'
        Spending column to compare.
    alternative : str, default 'greater'
        Alternative hypothesis forwarded to ``scipy.stats.ttest_ind``
        ('greater': treatment mean above control mean).

    Returns
    -------
    pandas.Series
        ``pval``: Welch t-test p-value computed on ``column * mult``;
        ``diff``: ``mult``-weighted mean of treatment minus that of control;
        ``tsize`` / ``csize``: number of treatment / control rows.

    Notes
    -----
    NOTE(review): the t-test sees row-wise ``column * mult`` products, whereas
    ``diff`` is a difference of weighted means, so the two can disagree when
    the multipliers differ between groups -- confirm this is intended.
    """
    is_treated = xdf['is_benefit_healthscheme'] == 1
    treatment = xdf.loc[is_treated, ['mult', column]]
    control = xdf.loc[~is_treated, ['mult', column]]

    # Row-wise weighted values; these are the samples handed to the t-test.
    tspend = treatment[column] * treatment['mult']
    cspend = control[column] * control['mult']
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    pval = ttest_ind(tspend, cspend, equal_var=False, alternative=alternative).pvalue
    # Weighted mean per group: sum(value * weight) / sum(weight).
    diff = tspend.sum() / treatment['mult'].sum() - cspend.sum() / control['mult'].sum()
    return pd.Series({'pval': pval, 'diff': diff,
                      'tsize': len(treatment), 'csize': len(control)})
    
# Restrict to cohorts that passed the group-size filter captured in `counts`.
xdf = df[df['cohort_id'].isin(counts.index)]
# Select the columns the comparison reads before applying, so apply() never
# operates on the grouping column (that implicit behaviour is deprecated in
# recent pandas and emits a warning on every call).
sig = (
    xdf.groupby('cohort_id')[['is_benefit_healthscheme', 'mult', 'FOOD']]
    .apply(compare_within_cohort)
)

  sig = xdf.groupby('cohort_id').apply(compare_within_cohort)


In [36]:
# Append each cohort's descriptive columns to its test results. The inner
# verify_integrity makes set_index raise if a cohort id maps to more than one
# distinct row; the outer one makes concat raise on duplicate column labels.
sig = pd.concat(
    [
        sig,
        cohorts.drop_duplicates()
        .set_index('cohort_id', verify_integrity=True)
        .loc[sig.index],
    ],
    axis=1,
    verify_integrity=True,
)

In [42]:
# Cohorts with p-value below 0.05, largest positive difference first.
# NOTE(review): the p-values are not adjusted for testing many cohorts at once.
sig.loc[sig['pval'].lt(0.05)].sort_values(by='diff', ascending=False)

Unnamed: 0_level_0,pval,diff,tsize,csize,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
14787,0.000106,8566.085065,2.0,4.0,urban,131,,Christianity,nuclear,False,True,scheduled tribe (ST)
21038,0.023101,8121.541671,2.0,5.0,urban,322,casual labour,Hinduism,large,True,True,other backward class (OBC)
430,0.021685,7991.825518,5.0,7.0,rural,22,regular wage/salary earning,Hinduism,large,True,True,others
9110,0.006597,6321.793244,2.0,5.0,rural,282,self-employment,Hinduism,nuclear,True,False,others
5569,0.005915,6184.57102,3.0,3.0,rural,195,casual labour,Hinduism,large,True,True,scheduled caste (SC)
6588,0.010835,6073.204946,2.0,3.0,rural,222,self-employment,Hinduism,nuclear,False,False,scheduled tribe (ST)
1682,0.002278,5732.902717,10.0,6.0,rural,82,self-employment,Hinduism,nuclear,False,True,other backward class (OBC)
4013,0.043734,5622.48324,2.0,12.0,rural,161,regular wage/salary earning,Hinduism,nuclear,False,False,other backward class (OBC)
8950,0.005427,4983.498018,3.0,13.0,rural,281,self-employment,Hinduism,nuclear,False,True,other backward class (OBC)
7786,0.035463,4442.01066,2.0,2.0,rural,245,casual labour,Hinduism,nuclear,False,True,other backward class (OBC)


In [43]:
# Education: attach the "EDUCATION" basket for exactly the rows kept in `df`
# (the label lookup raises a KeyError if any index label is absent from `cons`).
df['EDUCATION'] = cons["EDUCATION"].loc[df.index]

def compare_within_cohort(xdf, column='EDUCATION', alternative='greater'):
    """Compare weighted spending of scheme beneficiaries with the rest of one cohort.

    Rows with ``is_benefit_healthscheme == 1`` form the treatment group; every
    other row (any other code, missing values included) forms the control group.

    Parameters
    ----------
    xdf : pandas.DataFrame
        Rows of a single cohort. Must contain ``is_benefit_healthscheme``,
        ``mult`` and the column named by ``column``.
    column : str, default 'EDUCATION'
        Spending column to compare.
    alternative : str, default 'greater'
        Alternative hypothesis forwarded to ``scipy.stats.ttest_ind``
        ('greater': treatment mean above control mean).

    Returns
    -------
    pandas.Series
        ``pval``: Welch t-test p-value computed on ``column * mult``;
        ``diff``: ``mult``-weighted mean of treatment minus that of control;
        ``tsize`` / ``csize``: number of treatment / control rows.

    Notes
    -----
    NOTE(review): the t-test sees row-wise ``column * mult`` products, whereas
    ``diff`` is a difference of weighted means, so the two can disagree when
    the multipliers differ between groups -- confirm this is intended.
    """
    is_treated = xdf['is_benefit_healthscheme'] == 1
    treatment = xdf.loc[is_treated, ['mult', column]]
    control = xdf.loc[~is_treated, ['mult', column]]

    # Row-wise weighted values; these are the samples handed to the t-test.
    tspend = treatment[column] * treatment['mult']
    cspend = control[column] * control['mult']
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    pval = ttest_ind(tspend, cspend, equal_var=False, alternative=alternative).pvalue
    # Weighted mean per group: sum(value * weight) / sum(weight).
    diff = tspend.sum() / treatment['mult'].sum() - cspend.sum() / control['mult'].sum()
    return pd.Series({'pval': pval, 'diff': diff,
                      'tsize': len(treatment), 'csize': len(control)})
    
# Restrict to cohorts that passed the group-size filter captured in `counts`.
xdf = df[df['cohort_id'].isin(counts.index)]
# Select the columns the comparison reads before applying, so apply() never
# operates on the grouping column (that implicit behaviour is deprecated in
# recent pandas and emits a warning on every call).
sig = (
    xdf.groupby('cohort_id')[['is_benefit_healthscheme', 'mult', 'EDUCATION']]
    .apply(compare_within_cohort)
)

  sig = xdf.groupby('cohort_id').apply(compare_within_cohort)


In [45]:
# Append each cohort's descriptive columns to its test results. The inner
# verify_integrity makes set_index raise if a cohort id maps to more than one
# distinct row; the outer one makes concat raise on duplicate column labels.
sig = pd.concat(
    [
        sig,
        cohorts.drop_duplicates()
        .set_index('cohort_id', verify_integrity=True)
        .loc[sig.index],
    ],
    axis=1,
    verify_integrity=True,
)

In [49]:
# Cohorts with p-value below 0.05, largest positive difference first.
# NOTE(review): the p-values are not adjusted for testing many cohorts at once.
sig.loc[sig['pval'].lt(0.05)].sort_values(by='diff', ascending=False)

Unnamed: 0_level_0,pval,diff,tsize,csize,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1083,0.000686,2291.870902,2.0,6.0,rural,51,self-employment,Hinduism,large,True,True,other backward class (OBC)
14937,0.040598,1805.632056,2.0,21.0,urban,141,self-employment,Hinduism,nuclear,True,False,other backward class (OBC)
1545,0.037413,1715.214939,7.0,8.0,rural,81,self-employment,Hinduism,nuclear,True,False,other backward class (OBC)
20129,0.016411,1715.005537,3.0,2.0,urban,283,regular wage/salary earning,Hinduism,nuclear,True,False,scheduled caste (SC)
6145,0.040498,1218.568001,5.0,3.0,rural,211,self-employment,Hinduism,large,True,True,other backward class (OBC)
14955,0.037189,1121.330104,3.0,10.0,urban,141,self-employment,Hinduism,large,True,True,other backward class (OBC)
17175,0.049543,697.553907,4.0,3.0,urban,222,self-employment,Hinduism,nuclear,True,False,other backward class (OBC)
20920,0.045313,564.617184,3.0,9.0,urban,321,casual labour,Islam,large,True,True,other backward class (OBC)
11388,0.0036,460.701285,2.0,2.0,urban,13,casual labour,Islam,nuclear,False,False,others
4176,0.049519,325.271346,4.0,2.0,rural,171,casual labour,Others,large,True,False,scheduled tribe (ST)


In [50]:
# Utilities: attach the "UTILITIES" basket for exactly the rows kept in `df`
# (the label lookup raises a KeyError if any index label is absent from `cons`).
df['UTILITIES'] = cons["UTILITIES"].loc[df.index]

def compare_within_cohort(xdf, column='UTILITIES', alternative='greater'):
    """Compare weighted spending of scheme beneficiaries with the rest of one cohort.

    Rows with ``is_benefit_healthscheme == 1`` form the treatment group; every
    other row (any other code, missing values included) forms the control group.

    Parameters
    ----------
    xdf : pandas.DataFrame
        Rows of a single cohort. Must contain ``is_benefit_healthscheme``,
        ``mult`` and the column named by ``column``.
    column : str, default 'UTILITIES'
        Spending column to compare.
    alternative : str, default 'greater'
        Alternative hypothesis forwarded to ``scipy.stats.ttest_ind``
        ('greater': treatment mean above control mean).

    Returns
    -------
    pandas.Series
        ``pval``: Welch t-test p-value computed on ``column * mult``;
        ``diff``: ``mult``-weighted mean of treatment minus that of control;
        ``tsize`` / ``csize``: number of treatment / control rows.

    Notes
    -----
    NOTE(review): the t-test sees row-wise ``column * mult`` products, whereas
    ``diff`` is a difference of weighted means, so the two can disagree when
    the multipliers differ between groups -- confirm this is intended.
    """
    is_treated = xdf['is_benefit_healthscheme'] == 1
    treatment = xdf.loc[is_treated, ['mult', column]]
    control = xdf.loc[~is_treated, ['mult', column]]

    # Row-wise weighted values; these are the samples handed to the t-test.
    tspend = treatment[column] * treatment['mult']
    cspend = control[column] * control['mult']
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    pval = ttest_ind(tspend, cspend, equal_var=False, alternative=alternative).pvalue
    # Weighted mean per group: sum(value * weight) / sum(weight).
    diff = tspend.sum() / treatment['mult'].sum() - cspend.sum() / control['mult'].sum()
    return pd.Series({'pval': pval, 'diff': diff,
                      'tsize': len(treatment), 'csize': len(control)})
    
# Restrict to cohorts that passed the group-size filter captured in `counts`.
xdf = df[df['cohort_id'].isin(counts.index)]
# Select the columns the comparison reads before applying, so apply() never
# operates on the grouping column (that implicit behaviour is deprecated in
# recent pandas and emits a warning on every call).
sig = (
    xdf.groupby('cohort_id')[['is_benefit_healthscheme', 'mult', 'UTILITIES']]
    .apply(compare_within_cohort)
)
# Append cohort attributes; both verify_integrity flags raise on duplicates
# (cohort ids in the lookup table, column labels in the combined frame).
sig = pd.concat([sig, cohorts.drop_duplicates().set_index('cohort_id', verify_integrity=True).loc[sig.index]], axis=1, verify_integrity=True)
# Cohorts with p-value below 0.05, largest positive difference first.
# NOTE(review): the p-values are not adjusted for testing many cohorts at once.
sig[sig['pval'] < 0.05].sort_values('diff', ascending=False)

  res = hypotest_fun_out(*samples, **kwds)
  sig = xdf.groupby('cohort_id').apply(compare_within_cohort)


Unnamed: 0_level_0,pval,diff,tsize,csize,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1616,0.03430517,1720.146489,3.0,2.0,rural,82,casual labour,Hinduism,nuclear,True,True,scheduled caste (SC)
14725,0.04481415,1341.095265,2.0,4.0,urban,131,regular wage/salary earning,Christianity,nuclear,False,False,scheduled tribe (ST)
4176,0.008616475,1125.583472,4.0,2.0,rural,171,casual labour,Others,large,True,False,scheduled tribe (ST)
399,0.04144672,995.942832,2.0,3.0,rural,22,casual labour,Hinduism,large,True,False,scheduled caste (SC)
6115,0.01085078,792.04807,3.0,4.0,rural,211,regular wage/salary earning,Hinduism,large,True,True,others
6773,0.03977917,770.959736,2.0,2.0,rural,231,self-employment,Hinduism,large,True,False,scheduled tribe (ST)
8623,1.294678e-08,726.587851,2.0,11.0,rural,275,casual labour,Hinduism,nuclear,False,True,other backward class (OBC)
3797,0.03146881,724.383781,5.0,4.0,rural,141,self-employment,Hinduism,large,True,True,others
1682,0.006004973,690.918662,10.0,6.0,rural,82,self-employment,Hinduism,nuclear,False,True,other backward class (OBC)
6588,0.01099621,663.041673,2.0,3.0,rural,222,self-employment,Hinduism,nuclear,False,False,scheduled tribe (ST)


In [51]:
# Assets: attach the "ASSETS" basket for exactly the rows kept in `df`
# (the label lookup raises a KeyError if any index label is absent from `cons`).
df['ASSETS'] = cons["ASSETS"].loc[df.index]

def compare_within_cohort(xdf, column='ASSETS', alternative='greater'):
    """Compare weighted spending of scheme beneficiaries with the rest of one cohort.

    Rows with ``is_benefit_healthscheme == 1`` form the treatment group; every
    other row (any other code, missing values included) forms the control group.

    Parameters
    ----------
    xdf : pandas.DataFrame
        Rows of a single cohort. Must contain ``is_benefit_healthscheme``,
        ``mult`` and the column named by ``column``.
    column : str, default 'ASSETS'
        Spending column to compare.
    alternative : str, default 'greater'
        Alternative hypothesis forwarded to ``scipy.stats.ttest_ind``
        ('greater': treatment mean above control mean).

    Returns
    -------
    pandas.Series
        ``pval``: Welch t-test p-value computed on ``column * mult``;
        ``diff``: ``mult``-weighted mean of treatment minus that of control;
        ``tsize`` / ``csize``: number of treatment / control rows.

    Notes
    -----
    NOTE(review): the t-test sees row-wise ``column * mult`` products, whereas
    ``diff`` is a difference of weighted means, so the two can disagree when
    the multipliers differ between groups -- confirm this is intended.
    """
    is_treated = xdf['is_benefit_healthscheme'] == 1
    treatment = xdf.loc[is_treated, ['mult', column]]
    control = xdf.loc[~is_treated, ['mult', column]]

    # Row-wise weighted values; these are the samples handed to the t-test.
    tspend = treatment[column] * treatment['mult']
    cspend = control[column] * control['mult']
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    pval = ttest_ind(tspend, cspend, equal_var=False, alternative=alternative).pvalue
    # Weighted mean per group: sum(value * weight) / sum(weight).
    diff = tspend.sum() / treatment['mult'].sum() - cspend.sum() / control['mult'].sum()
    return pd.Series({'pval': pval, 'diff': diff,
                      'tsize': len(treatment), 'csize': len(control)})
    
# Restrict to cohorts that passed the group-size filter captured in `counts`.
xdf = df[df['cohort_id'].isin(counts.index)]
# Select the columns the comparison reads before applying, so apply() never
# operates on the grouping column (that implicit behaviour is deprecated in
# recent pandas and emits a warning on every call).
sig = (
    xdf.groupby('cohort_id')[['is_benefit_healthscheme', 'mult', 'ASSETS']]
    .apply(compare_within_cohort)
)
# Append cohort attributes; both verify_integrity flags raise on duplicates
# (cohort ids in the lookup table, column labels in the combined frame).
sig = pd.concat([sig, cohorts.drop_duplicates().set_index('cohort_id', verify_integrity=True).loc[sig.index]], axis=1, verify_integrity=True)
# Cohorts with p-value below 0.05, largest positive difference first.
# NOTE(review): the p-values are not adjusted for testing many cohorts at once.
sig[sig['pval'] < 0.05].sort_values('diff', ascending=False)

  sig = xdf.groupby('cohort_id').apply(compare_within_cohort)


Unnamed: 0_level_0,pval,diff,tsize,csize,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2003,0.02822659,2162.662533,3.0,3.0,rural,85,regular wage/salary earning,Hinduism,large,True,False,other backward class (OBC)
5187,0.03992509,2126.787518,2.0,2.0,rural,192,self-employment,Hinduism,nuclear,False,True,other backward class (OBC)
3673,0.04497647,1743.696887,2.0,2.0,rural,131,,Christianity,nuclear,False,True,scheduled tribe (ST)
21142,1.733402e-09,1727.108397,2.0,13.0,urban,322,,Christianity,nuclear,False,True,others
10604,0.03005045,1504.791235,3.0,17.0,rural,334,casual labour,Hinduism,nuclear,False,False,scheduled caste (SC)
9109,0.007768504,1406.490369,4.0,2.0,rural,282,self-employment,Hinduism,nuclear,True,False,other backward class (OBC)
2501,0.01983233,1359.321595,2.0,24.0,rural,93,self-employment,Hinduism,nuclear,False,False,other backward class (OBC)
14754,0.02622262,1273.660577,2.0,6.0,urban,131,self-employment,Christianity,nuclear,True,False,scheduled tribe (ST)
14787,0.0177875,1028.004259,2.0,4.0,urban,131,,Christianity,nuclear,False,True,scheduled tribe (ST)
9103,0.01968053,858.114253,4.0,2.0,rural,282,self-employment,Hinduism,nuclear,False,False,scheduled caste (SC)
