In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

In [2]:
# Read the four input tables in one pass; the targets on the left line up
# positionally with the paths below.
#   df      - health indicator flags (filtered and extended in later cells)
#   cohorts - supplies the 'cohort_id' column
#   hh      - supplies the 'multiplier' column
#   cons    - supplies the "HOSP" and "NOHOSP" columns
df, cohorts, hh, cons = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/health-indicators.parquet',
        'data/hh-cohorts.parquet',
        'data/hh_details.parquet',
        'data/baskets-consumption-value.parquet',
    )
)

In [4]:
# Keep every row whose is_hospitalization code is not 4, then attach the
# household multiplier and the cohort id. Both new columns are aligned to df
# on the index.
# NOTE(review): the meaning of code 4 is not visible here - confirm against
# the survey codebook.
df = df.loc[df['is_hospitalization'] != 4].assign(
    mult=hh['multiplier'],
    cohort_id=cohorts['cohort_id'],
)

In [6]:
# Pull the "HOSP" consumption value for exactly the households still in df.
# A label present in df.index but absent from cons raises KeyError.
df['hosp_spend'] = cons["HOSP"].loc[df.index]

In [56]:
# Pull the "NOHOSP" consumption value for exactly the households still in df.
# A label present in df.index but absent from cons raises KeyError.
df['nohosp'] = cons["NOHOSP"].loc[df.index]

In [7]:
# Preview the first rows of the working frame after filtering and column joins.
df.head()

Unnamed: 0_level_0,is_hhmem_pmjay,is_hospitalization,is_benefit_healthscheme,mult,cohort_id,hosp_spend
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22300304,2,2,2.0,28599,21773,3250.0
22300308,2,2,2.0,28599,21722,2650.0
22300313,2,2,2.0,28599,21728,3916.666667
22301201,2,3,2.0,20463,21732,500.0
22301310,2,1,2.0,16099,21782,4.166667


In [32]:
# Per-cohort tally of each is_benefit_healthscheme value, one column per value.
# dropna() discards cohorts in which any of the observed values never occurs;
# the final filter keeps cohorts where every value occurs more than once.
counts = (
    df.groupby('cohort_id')['is_benefit_healthscheme']
    .value_counts()
    .unstack()
    .dropna()
    .loc[lambda tally: tally.min(axis=1) > 1]
)

In [70]:
def compare_within_cohort(xdf, value_col='hosp_spend'):
    """Compare a weighted spending column between beneficiaries and the rest.

    Rows with ``is_benefit_healthscheme == 1`` form the treatment group; all
    other rows (including rows where the flag is missing) form the control
    group.

    Parameters
    ----------
    xdf : pandas.DataFrame
        Rows to compare (one cohort when used with ``groupby(...).apply``).
        Must contain ``is_benefit_healthscheme``, ``mult`` and ``value_col``.
    value_col : str, default 'hosp_spend'
        Name of the spending column to compare. The default keeps the
        original single-argument call working unchanged.

    Returns
    -------
    pandas.Series
        ``pval``  - p-value of a one-sided Welch t-test
        (``alternative='less'``) on the ``value_col * mult`` products,
        ``diff``  - mult-weighted mean of treatment minus that of control,
        ``tsize`` / ``csize`` - number of treatment / control rows.
    """
    cols = ['mult', value_col]
    treatment = xdf.loc[xdf['is_benefit_healthscheme'] == 1, cols]
    control = xdf.loc[xdf['is_benefit_healthscheme'] != 1, cols]

    # Each row's value scaled by its multiplier.
    tspend = treatment[value_col] * treatment['mult']
    cspend = control[value_col] * control['mult']

    # NOTE(review): the test compares the means of the scaled products, while
    # `diff` below is a difference of weighted means; the two can disagree
    # when multipliers differ between groups - confirm this is intended.
    pval = ttest_ind(tspend, cspend, equal_var=False, alternative='less').pvalue

    weighted_mean_t = tspend.sum() / treatment['mult'].sum()
    weighted_mean_c = cspend.sum() / control['mult'].sum()
    diff = weighted_mean_t - weighted_mean_c

    return pd.Series({'pval': pval, 'diff': diff,
                      'tsize': len(treatment), 'csize': len(control)})
    
# Restrict to households whose cohort passed the size screen in `counts`,
# then run the treatment-vs-control comparison once per cohort.
xdf = df.loc[df['cohort_id'].isin(counts.index)]
sig = (
    xdf
    .groupby('cohort_id')
    .apply(compare_within_cohort)
)

  res = hypotest_fun_out(*samples, **kwds)


In [71]:
# Keep cohorts whose p-value is below 0.05 (rows with a NaN p-value fail the
# comparison and are dropped too), then add `frac`: the absolute deviation of
# the treatment/control size ratio from 1.
# NOTE(review): no multiple-comparison adjustment is applied across cohorts.
sig = sig.loc[sig['pval'] < 0.05].assign(
    frac=lambda kept: (kept['tsize'] / kept['csize'] - 1).abs()
)

In [73]:
# Five cohorts with the smallest (most negative) weighted-mean difference.
sig.sort_values(by='diff', ascending=True).head(5)

Unnamed: 0_level_0,pval,diff,tsize,csize,frac
cohort_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8881,0.02473,-3079.351367,3.0,2.0,0.5
12071,0.007425,-2842.14483,2.0,7.0,0.714286
20957,0.045737,-2706.562429,2.0,3.0,0.333333
1274,0.001767,-2336.576311,2.0,8.0,0.75
18245,0.04155,-2201.874268,2.0,13.0,0.846154


In [74]:
# Cohort attribute rows for the cohorts that remain in `sig`.
# drop_duplicates() compares column values only (the hhid index is ignored),
# so households sharing identical attributes collapse to a single row.
good = (
    cohorts
    .loc[cohorts['cohort_id'].isin(sig.index)]
    .drop_duplicates()
)

In [77]:
# Display the de-duplicated attribute rows of the cohorts kept in `sig`.
good

Unnamed: 0_level_0,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste,cohort_id
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
46736202,rural,22,self-employment,Hinduism,nuclear,False,True,others,445
46798204,rural,21,,Hinduism,nuclear,False,True,others,368
46954301,rural,51,regular wage/salary earning,Hinduism,nuclear,True,False,others,1022
46499303,rural,61,casual labour,Hinduism,large,True,True,scheduled caste (SC),1158
46427301,rural,62,casual labour,Hinduism,large,True,True,scheduled caste (SC),1274
46896201,rural,85,self-employment,Hinduism,large,True,False,other backward class (OBC),2048
46890203,rural,85,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),2029
46000302,rural,83,casual labour,Hinduism,nuclear,True,False,scheduled tribe (ST),1765
47960301,rural,92,self-employment,Hinduism,large,True,True,scheduled caste (SC),2349
43528309,rural,101,self-employment,Hinduism,nuclear,True,False,scheduled caste (SC),3001


In [54]:
# Share of each main income source among the rows in `good`
# (value_counts skips missing values by default).
good.loc[:, 'max_income_from'].value_counts(normalize=True)

max_income_from
self-employment                0.5625
casual labour                  0.2500
regular wage/salary earning    0.1875
Name: proportion, dtype: float64

In [53]:
# Rows of `good` that fall in NSS region code 321.
good.loc[good['nss_region'].eq(321)]

Unnamed: 0_level_0,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste,cohort_id
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
40376308,rural,321,,Islam,nuclear,False,True,other backward class (OBC),10060
40373310,rural,321,,Hinduism,nuclear,False,True,others,10050
24256203,urban,321,casual labour,Hinduism,large,True,True,other backward class (OBC),20908
24253309,urban,321,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),20957


In [57]:
def compare_within_cohort(xdf, value_col='nohosp'):
    """Compare a weighted spending column between beneficiaries and the rest.

    Rows with ``is_benefit_healthscheme == 1`` form the treatment group; all
    other rows (including rows where the flag is missing) form the control
    group.

    Parameters
    ----------
    xdf : pandas.DataFrame
        Rows to compare (one cohort when used with ``groupby(...).apply``).
        Must contain ``is_benefit_healthscheme``, ``mult`` and ``value_col``.
    value_col : str, default 'nohosp'
        Name of the spending column to compare. The default keeps the
        original single-argument call working unchanged.

    Returns
    -------
    pandas.Series
        ``pval``  - p-value of a one-sided Welch t-test
        (``alternative='less'``) on the ``value_col * mult`` products,
        ``diff``  - mult-weighted mean of treatment minus that of control,
        ``tsize`` / ``csize`` - number of treatment / control rows.
    """
    cols = ['mult', value_col]
    treatment = xdf.loc[xdf['is_benefit_healthscheme'] == 1, cols]
    control = xdf.loc[xdf['is_benefit_healthscheme'] != 1, cols]

    # Each row's value scaled by its multiplier.
    tspend = treatment[value_col] * treatment['mult']
    cspend = control[value_col] * control['mult']

    # NOTE(review): the test compares the means of the scaled products, while
    # `diff` below is a difference of weighted means; the two can disagree
    # when multipliers differ between groups - confirm this is intended.
    pval = ttest_ind(tspend, cspend, equal_var=False, alternative='less').pvalue

    weighted_mean_t = tspend.sum() / treatment['mult'].sum()
    weighted_mean_c = cspend.sum() / control['mult'].sum()
    diff = weighted_mean_t - weighted_mean_c

    return pd.Series({'pval': pval, 'diff': diff,
                      'tsize': len(treatment), 'csize': len(control)})
    
# Restrict to households whose cohort passed the size screen in `counts`,
# then run the treatment-vs-control comparison once per cohort.
xdf = df.loc[df['cohort_id'].isin(counts.index)]
sig = (
    xdf
    .groupby('cohort_id')
    .apply(compare_within_cohort)
)

  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


In [58]:
# Keep cohorts whose p-value is below 0.05 (rows with a NaN p-value fail the
# comparison and are dropped too), then add `frac`: the absolute deviation of
# the treatment/control size ratio from 1.
# NOTE(review): no multiple-comparison adjustment is applied across cohorts.
sig = sig.loc[sig['pval'] < 0.05].assign(
    frac=lambda kept: (kept['tsize'] / kept['csize'] - 1).abs()
)

In [63]:
# Cohort attribute rows for the cohorts that remain in `sig`.
# drop_duplicates() compares column values only (the hhid index is ignored),
# so households sharing identical attributes collapse to a single row.
good = (
    cohorts
    .loc[cohorts['cohort_id'].isin(sig.index)]
    .drop_duplicates()
)

In [66]:
# Display the de-duplicated attribute rows of the cohorts kept in `sig`.
good

Unnamed: 0_level_0,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste,cohort_id
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
46626307,rural,13,self-employment,Islam,nuclear,True,False,others,268
46319101,rural,32,self-employment,Sikhism,large,True,True,others,863
46427301,rural,62,casual labour,Hinduism,large,True,True,scheduled caste (SC),1274
46896305,rural,85,self-employment,Hinduism,nuclear,True,False,other backward class (OBC),2033
46060301,rural,81,self-employment,Hinduism,large,True,False,other backward class (OBC),1559
46000302,rural,83,casual labour,Hinduism,nuclear,True,False,scheduled tribe (ST),1765
46000308,rural,83,self-employment,Hinduism,large,True,False,scheduled tribe (ST),1825
47953313,rural,93,self-employment,Hinduism,nuclear,False,False,other backward class (OBC),2501
43041302,rural,102,self-employment,Hinduism,large,True,False,other backward class (OBC),3194
45336204,rural,121,self-employment,Christianity,nuclear,True,False,scheduled tribe (ST),3550


In [69]:
# List every household row assigned to cohort 10174.
# NOTE(review): the cohort id is hard-coded; confirm it is still the cohort
# of interest whenever `sig` is recomputed.
cohorts[cohorts['cohort_id'] == 10174]

Unnamed: 0_level_0,sector,nss_region,max_income_from,hoh_religion,family_size,has_child,has_elderly,caste,cohort_id
hhid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
40573305,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),10174
40573310,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),10174
40574302,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),10174
40572201,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),10174
40570307,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),10174
...,...,...,...,...,...,...,...,...,...
40935305,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),10174
40939203,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),10174
40938203,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),10174
40938204,rural,322,self-employment,Hinduism,nuclear,False,True,other backward class (OBC),10174
