In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
from scipy import stats
import os


In [2]:
def crosscheck(df2update,bydf,byvar,checkvar,desc):
    
    # Find means and sums and append columns to the StatsBySmokingStatus dataframe
    df2update[f"Percent with {desc}"] = 100 * bydf[f"{checkvar}"].mean()
    df2update[f"Number with {desc}"] = bydf[f"{checkvar}"].sum()

    # Run the t-test
    (t_stat, p) = stats.ttest_ind(yeardata.loc[yeardata[f"{byvar}"]==1,[f"{checkvar}"]],\
                                  yeardata.loc[yeardata[f"{byvar}"]==0,[f"{checkvar}"]],equal_var=False)

    df2update[f"{desc} Difference P-Value"] = p[0]
    if p < 0.05:
        df2update[f"{desc} Difference Significant"] = "Yes"
    else:
        df2update[f"{desc} Difference Significant"] = "No"
        

In [3]:

year = 2017

# Bring in this year's combined data
outdatapath = os.path.join('..','output_data',f"year_{year}.csv")
yeardata = pd.read_csv(outdatapath)
    
# Use the value counts to determine the number of smoking households
smkcounts = yeardata['HH with Smoker'].value_counts()

# Group by the smoking status of the household so we can run stats
BySmokingStatus = yeardata.groupby(['HH with Smoker'])

# Convert the value counts into a dataframe and add year
StatsBySmokingStatus = pd.DataFrame({'Smoker Flag':smkcounts.index, 'Count':smkcounts.values})
StatsBySmokingStatus["Year"] = BySmokingStatus["SRVY_YR"].mean()

crosscheck(df2update=StatsBySmokingStatus, \
           bydf=BySmokingStatus, \
           byvar='HH with Smoker', \
           checkvar='HH with Asthmatic Child', \
           desc='Asthmatic Child')

crosscheck(df2update=StatsBySmokingStatus, \
           bydf=BySmokingStatus, \
           byvar='HH with Smoker', \
           checkvar='HH with ADHD/ADD Child', \
           desc='ADHD/ADD Child')

crosscheck(df2update=StatsBySmokingStatus, \
           bydf=BySmokingStatus, \
           byvar='HH with Smoker', \
           checkvar='HH with Autistic Child', \
           desc='Autistic Child')

crosscheck(df2update=StatsBySmokingStatus, \
           bydf=BySmokingStatus, \
           byvar='HH with Smoker', \
           checkvar='HH with Child With Concentration/Emotional/Behavior Issues', \
           desc='Concentration/Emotional/Behavior')
    
crosscheck(df2update=StatsBySmokingStatus, \
           bydf=BySmokingStatus, \
           byvar='HH with Smoker', \
           checkvar='HH in Poverty', \
           desc='in Poverty')
    

StatsBySmokingStatus.head()

Unnamed: 0,Smoker Flag,Count,Year,Percent with Asthmatic Child,Number with Asthmatic Child,Asthmatic Child Difference P-Value,Asthmatic Child Difference Significant,Percent with ADHD/ADD Child,Number with ADHD/ADD Child,ADHD/ADD Child Difference P-Value,...,Autistic Child Difference P-Value,Autistic Child Difference Significant,Percent with Concentration/Emotional/Behavior,Number with Concentration/Emotional/Behavior,Concentration/Emotional/Behavior Difference P-Value,Concentration/Emotional/Behavior Difference Significant,Percent with in Poverty,Number with in Poverty,in Poverty Difference P-Value,in Poverty Difference Significant
0,0,5875,2017,12.544681,737,0.003555,Yes,6.604255,388,7.025933e-09,...,0.00869,Yes,17.208511,1011,4.847708e-10,Yes,12.374468,727,4.2567859999999996e-19,Yes
1,1,1106,2017,16.003617,177,0.003555,Yes,12.748644,141,7.025933e-09,...,0.00869,Yes,26.039783,288,4.847708e-10,Yes,24.77396,274,4.2567859999999996e-19,Yes


In [4]:
def collapse2year(df2collapse,byvar,yesdesc,nodesc,compdesc):
    
    # Create two dataframes:  one with the yes-status.  One with the no-status
    yesdf = df2collapse[df2collapse[f'{byvar}']==1]
    nodf = df2collapse[df2collapse[f'{byvar}']==0]
    
    # on the yes dataframe keep the p-values and significance results as well as all of the flags
    yesdf = yesdf.filter(['Year','Count','Percent with Asthmatic Child', 'Number with Asthmatic Child', \
                      'Asthmatic Child Difference P-Value','Asthmatic Child Difference Significant',\
                      'Percent with Autistic Child', 'Number with Autistic Child', \
                      'Autistic Child Difference P-Value','Autistic Child Difference Significant',\
                      'Percent with ADHD/ADD Child', 'Number with ADHD/ADD Child', \
                      'ADHD/ADD Child Difference P-Value','ADHD/ADD Child Difference Significant',\
                      "Number with Concentration/Emotional/Behavior","Percent with Concentration/Emotional/Behavior",\
                      'Concentration/Emotional/Behavior Difference P-Value',\
                      'Concentration/Emotional/Behavior Difference Significant'])
    
    # Rename variables on the yes dataframe
    yesdf = yesdf.rename(columns={"Count": f"Count {yesdesc}", \
                         "Percent with Asthmatic Child": f"Asthma Rate {yesdesc}", \
                         "Number with Asthmatic Child": f"Count Asthma {yesdesc}", \
                         'Asthmatic Child Difference P-Value': f"Asthmatic Child Difference P-Value {compdesc}", \
                         'Asthmatic Child Difference Significant': f"Asthmatic Child Difference Significant {compdesc}", \
                         "Percent with ADHD/ADD Child": f"ADHD/ADD Rate {yesdesc}", \
                         "Number with ADHD/ADD Child": f"Count ADHD/ADD {yesdesc}", \
                         'ADHD/ADD Child Difference P-Value': f"ADHD/ADD Child Difference P-Value {compdesc}", \
                         'ADHD/ADD Child Difference Significant': f"ADHD/ADD Child Difference Significant {compdesc}", \
                         "Percent with Autistic Child": f"Autism Rate {yesdesc}", \
                         "Number with Autistic Child": f"Count Autism {yesdesc}", \
                         'Autistic Child Difference P-Value': f"Autistic Child Difference P-Value {compdesc}", \
                         'Autistic Child Difference Significant': f"Autistic Child Difference Significant {compdesc}", \
                         "Percent with Concentration/Emotional/Behavior": f"Concentration/Emotional/Behavior Rate {yesdesc}", \
                         "Number with Concentration/Emotional/Behavior": f"Count Concentration/Emotional/Behavior {yesdesc}", \
                         "Concentration/Emotional/Behavior Difference P-Value": f"Concentration/Emotional/Behavior Difference P-Value {compdesc}", \
                         "Concentration/Emotional/Behavior Difference Significant": f"Concentration/Emotional/Behavior Difference Significant {compdesc}"})

    nodf = nodf.filter(['Year','Count','Percent with Asthmatic Child', 'Number with Asthmatic Child', \
                      'Percent with Autistic Child', 'Number with Autistic Child', \
                      'Percent with ADHD/ADD Child', 'Number with ADHD/ADD Child', \
                      "Number with Concentration/Emotional/Behavior","Percent with Concentration/Emotional/Behavior"])
    
    nodf = nodf.rename(columns={"Count": f"Count {nodesc}", \
                              "Percent with Asthmatic Child": f"Asthma Rate {nodesc}", \
                              "Number with Asthmatic Child": f"Count Asthma {nodesc}", \
                              "Percent with ADHD/ADD Child": f"ADHD/ADD Rate {nodesc}", \
                              "Number with ADHD/ADD Child": f"Count ADHD/ADD {nodesc}", \
                              "Percent with Autistic Child": f"Autism Rate {nodesc}", \
                              "Number with Autistic Child": f"Count Autism {nodesc}", \
                              "Percent with Concentration/Emotional/Behavior": f"Concentration/Emotional/Behavior Rate {nodesc}", \
                              "Number with Concentration/Emotional/Behavior": f"Count Concentration/Emotional/Behavior {nodesc}"})

    # Merge the yes/no dataframes together to create the comparison for this year
    comparedf = pd.merge(yesdf,nodf,on=["Year"])
    comparedf[f"Percent {yesdesc}"] = 100 * comparedf[f"Count {yesdesc}"] / (comparedf[f"Count {yesdesc}"]  + comparedf[f"Count {nodesc}"] )
    comparedf[f"Percent {nodesc}"] = 100 * comparedf[f"Count {nodesc}"] / (comparedf[f"Count {yesdesc}"]  + comparedf[f"Count {nodesc}"] )
    
    # Add percentages to the combined dataframe
    return comparedf

compare_smoke = collapse2year(df2collapse=StatsBySmokingStatus,byvar='Smoker Flag', \
                               yesdesc='in Smoking HH',nodesc='in Non-Smoking HH', \
                               compdesc="for Smoking Status")
compare_smoke.head()

Unnamed: 0,Year,Count in Smoking HH,Asthma Rate in Smoking HH,Count Asthma in Smoking HH,Asthmatic Child Difference P-Value for Smoking Status,Asthmatic Child Difference Significant for Smoking Status,Autism Rate in Smoking HH,Count Autism in Smoking HH,Autistic Child Difference P-Value for Smoking Status,Autistic Child Difference Significant for Smoking Status,...,Asthma Rate in Non-Smoking HH,Count Asthma in Non-Smoking HH,Autism Rate in Non-Smoking HH,Count Autism in Non-Smoking HH,ADHD/ADD Rate in Non-Smoking HH,Count ADHD/ADD in Non-Smoking HH,Count Concentration/Emotional/Behavior in Non-Smoking HH,Concentration/Emotional/Behavior Rate in Non-Smoking HH,Percent in Smoking HH,Percent in Non-Smoking HH
0,2017,1106,16.003617,177,0.003555,Yes,3.526221,39,0.00869,Yes,...,12.544681,737,1.991489,117,6.604255,388,1011,17.208511,15.843002,84.156998


In [5]:
# Use the value counts to determine the number of impoverished families
povertycounts = yeardata['HH in Poverty'].value_counts()

# Group by the poverty status of the household so we can run stats
ByPovertyStatus = yeardata.groupby(['HH in Poverty'])

# Convert the value counts into a dataframe and add year
StatsByPovertyStatus = pd.DataFrame({'Poverty Flag':povertycounts.index, 'Count':povertycounts.values})
StatsByPovertyStatus["Year"] = ByPovertyStatus["SRVY_YR"].mean()

crosscheck(df2update=StatsByPovertyStatus, \
           bydf=ByPovertyStatus, \
           byvar='HH in Poverty', \
           checkvar='HH with Asthmatic Child', \
           desc='Asthmatic Child')

crosscheck(df2update=StatsByPovertyStatus, \
           bydf=ByPovertyStatus, \
           byvar='HH in Poverty', \
           checkvar='HH with ADHD/ADD Child', \
           desc='ADHD/ADD Child')

crosscheck(df2update=StatsByPovertyStatus, \
           bydf=ByPovertyStatus, \
           byvar='HH in Poverty', \
           checkvar='HH with Autistic Child', \
           desc='Autistic Child')

crosscheck(df2update=StatsByPovertyStatus, \
           bydf=ByPovertyStatus, \
           byvar='HH in Poverty', \
           checkvar='HH with Child With Concentration/Emotional/Behavior Issues', \
           desc='Concentration/Emotional/Behavior')
    
crosscheck(df2update=StatsByPovertyStatus, \
           bydf=ByPovertyStatus, \
           byvar='HH in Poverty', \
           checkvar='HH with Smoker', \
           desc='Smoking HH')

StatsByPovertyStatus.head()

Unnamed: 0,Poverty Flag,Count,Year,Percent with Asthmatic Child,Number with Asthmatic Child,Asthmatic Child Difference P-Value,Asthmatic Child Difference Significant,Percent with ADHD/ADD Child,Number with ADHD/ADD Child,ADHD/ADD Child Difference P-Value,...,Autistic Child Difference P-Value,Autistic Child Difference Significant,Percent with Concentration/Emotional/Behavior,Number with Concentration/Emotional/Behavior,Concentration/Emotional/Behavior Difference P-Value,Concentration/Emotional/Behavior Difference Significant,Percent with Smoking HH,Number with Smoking HH,Smoking HH Difference P-Value,Smoking HH Difference Significant
0,0,5980,2017,12.658863,757,0.01386,Yes,7.107023,425,0.00133,...,0.931694,No,17.892977,1070,0.000454,Yes,13.913043,832,3.6650149999999998e-19,Yes
1,1,1001,2017,15.684316,157,0.01386,Yes,10.38961,104,0.00133,...,0.931694,No,22.877123,229,0.000454,Yes,27.372627,274,3.6650149999999998e-19,Yes


In [6]:
compare_poverty = collapse2year(df2collapse=StatsByPovertyStatus,byvar='Poverty Flag', \
                               yesdesc='in Impoverished HH',nodesc='in Non-Impoverished HH', \
                               compdesc="for Smoking Status")

compare_poverty.head()

Unnamed: 0,Year,Count in Impoverished HH,Asthma Rate in Impoverished HH,Count Asthma in Impoverished HH,Asthmatic Child Difference P-Value for Smoking Status,Asthmatic Child Difference Significant for Smoking Status,Autism Rate in Impoverished HH,Count Autism in Impoverished HH,Autistic Child Difference P-Value for Smoking Status,Autistic Child Difference Significant for Smoking Status,...,Asthma Rate in Non-Impoverished HH,Count Asthma in Non-Impoverished HH,Autism Rate in Non-Impoverished HH,Count Autism in Non-Impoverished HH,ADHD/ADD Rate in Non-Impoverished HH,Count ADHD/ADD in Non-Impoverished HH,Count Concentration/Emotional/Behavior in Non-Impoverished HH,Concentration/Emotional/Behavior Rate in Non-Impoverished HH,Percent in Impoverished HH,Percent in Non-Impoverished HH
0,2017,1001,15.684316,157,0.01386,Yes,2.197802,22,0.931694,No,...,12.658863,757,2.240803,134,7.107023,425,1070,17.892977,14.33892,85.66108


In [7]:
# Break the dataframe into impoverished and non-impoverished families.  Then rerun the analyses.
poverty = yeardata[yeardata['HH in Poverty']==1] 
nopoverty = yeardata[yeardata['HH in Poverty']==0] 

# Find counts of smoking status in these dataframes
psmk_pov_counts = poverty['HH with Smoker'].value_counts()
npsmk_pov_counts = nopoverty['HH with Smoker'].value_counts()

# Group by the smoking status of the household so we can run stats
PovertyBySmoke = poverty.groupby(['HH with Smoker'])
NoPovertyBySmoke = nopoverty.groupby(['HH with Smoker'])

# Convert the value counts into a dataframe and add year
StatsPovertyBySmoke = pd.DataFrame({'Smoking Flag':psmk_pov_counts.index, 'Count':psmk_pov_counts.values})
StatsNoPovertyBySmoke = pd.DataFrame({'Smoking Flag':npsmk_pov_counts.index, 'Count':npsmk_pov_counts.values})

# Add the year
StatsPovertyBySmoke["Year"] = PovertyBySmoke["SRVY_YR"].mean()
StatsNoPovertyBySmoke["Year"] = NoPovertyBySmoke["SRVY_YR"].mean()


In [8]:
# Find means and run significance tests
crosscheck(df2update=StatsPovertyBySmoke, \
           bydf=PovertyBySmoke, \
           byvar='HH with Smoker', \
           checkvar='HH with Asthmatic Child', \
           desc='Asthmatic Child')

crosscheck(df2update=StatsPovertyBySmoke, \
           bydf=PovertyBySmoke, \
           byvar='HH with Smoker', \
           checkvar='HH with ADHD/ADD Child', \
           desc='ADHD/ADD Child')

crosscheck(df2update=StatsPovertyBySmoke, \
           bydf=PovertyBySmoke, \
           byvar='HH with Smoker', \
           checkvar='HH with Autistic Child', \
           desc='Autistic Child')

crosscheck(df2update=StatsPovertyBySmoke, \
           bydf=PovertyBySmoke, \
           byvar='HH with Smoker', \
           checkvar='HH with Child With Concentration/Emotional/Behavior Issues', \
           desc='Concentration/Emotional/Behavior')
    

StatsPovertyBySmoke.head()

Unnamed: 0,Smoking Flag,Count,Year,Percent with Asthmatic Child,Number with Asthmatic Child,Asthmatic Child Difference P-Value,Asthmatic Child Difference Significant,Percent with ADHD/ADD Child,Number with ADHD/ADD Child,ADHD/ADD Child Difference P-Value,ADHD/ADD Child Difference Significant,Percent with Autistic Child,Number with Autistic Child,Autistic Child Difference P-Value,Autistic Child Difference Significant,Percent with Concentration/Emotional/Behavior,Number with Concentration/Emotional/Behavior,Concentration/Emotional/Behavior Difference P-Value,Concentration/Emotional/Behavior Difference Significant
0,0,727,2017,14.580468,106,0.003555,Yes,8.253095,60,7.025933e-09,Yes,1.650619,12,0.00869,Yes,19.394773,141,4.847708e-10,Yes
1,1,274,2017,18.613139,51,0.003555,Yes,16.058394,44,7.025933e-09,Yes,3.649635,10,0.00869,Yes,32.116788,88,4.847708e-10,Yes


In [9]:
compare_poverty_bysmoking = collapse2year(df2collapse=StatsPovertyBySmoke,byvar='Smoking Flag', \
                               yesdesc='in Smoking Impoverished HH',nodesc='in Non-Smoking Impoverished HH', \
                               compdesc="for Smoking Status in Impoverished HH")

compare_poverty_bysmoking.head()

Unnamed: 0,Year,Count in Smoking Impoverished HH,Asthma Rate in Smoking Impoverished HH,Count Asthma in Smoking Impoverished HH,Asthmatic Child Difference P-Value for Smoking Status in Impoverished HH,Asthmatic Child Difference Significant for Smoking Status in Impoverished HH,Autism Rate in Smoking Impoverished HH,Count Autism in Smoking Impoverished HH,Autistic Child Difference P-Value for Smoking Status in Impoverished HH,Autistic Child Difference Significant for Smoking Status in Impoverished HH,...,Asthma Rate in Non-Smoking Impoverished HH,Count Asthma in Non-Smoking Impoverished HH,Autism Rate in Non-Smoking Impoverished HH,Count Autism in Non-Smoking Impoverished HH,ADHD/ADD Rate in Non-Smoking Impoverished HH,Count ADHD/ADD in Non-Smoking Impoverished HH,Count Concentration/Emotional/Behavior in Non-Smoking Impoverished HH,Concentration/Emotional/Behavior Rate in Non-Smoking Impoverished HH,Percent in Smoking Impoverished HH,Percent in Non-Smoking Impoverished HH
0,2017,274,18.613139,51,0.003555,Yes,3.649635,10,0.00869,Yes,...,14.580468,106,1.650619,12,8.253095,60,141,19.394773,27.372627,72.627373


In [10]:
# Find means and run significance tests
crosscheck(df2update=StatsNoPovertyBySmoke, \
           bydf=NoPovertyBySmoke, \
           byvar='HH with Smoker', \
           checkvar='HH with Asthmatic Child', \
           desc='Asthmatic Child')

crosscheck(df2update=StatsNoPovertyBySmoke, \
           bydf=NoPovertyBySmoke, \
           byvar='HH with Smoker', \
           checkvar='HH with ADHD/ADD Child', \
           desc='ADHD/ADD Child')

crosscheck(df2update=StatsNoPovertyBySmoke, \
           bydf=NoPovertyBySmoke, \
           byvar='HH with Smoker', \
           checkvar='HH with Autistic Child', \
           desc='Autistic Child')

crosscheck(df2update=StatsNoPovertyBySmoke, \
           bydf=NoPovertyBySmoke, \
           byvar='HH with Smoker', \
           checkvar='HH with Child With Concentration/Emotional/Behavior Issues', \
           desc='Concentration/Emotional/Behavior')
    

StatsNoPovertyBySmoke.head()

Unnamed: 0,Smoking Flag,Count,Year,Percent with Asthmatic Child,Number with Asthmatic Child,Asthmatic Child Difference P-Value,Asthmatic Child Difference Significant,Percent with ADHD/ADD Child,Number with ADHD/ADD Child,ADHD/ADD Child Difference P-Value,ADHD/ADD Child Difference Significant,Percent with Autistic Child,Number with Autistic Child,Autistic Child Difference P-Value,Autistic Child Difference Significant,Percent with Concentration/Emotional/Behavior,Number with Concentration/Emotional/Behavior,Concentration/Emotional/Behavior Difference P-Value,Concentration/Emotional/Behavior Difference Significant
0,0,5148,2017,12.257187,631,0.003555,Yes,6.371406,328,7.025933e-09,Yes,2.039627,105,0.00869,Yes,16.899767,870,4.847708e-10,Yes
1,1,832,2017,15.144231,126,0.003555,Yes,11.658654,97,7.025933e-09,Yes,3.485577,29,0.00869,Yes,24.038462,200,4.847708e-10,Yes


In [11]:
compare_nopoverty_bysmoking = collapse2year(df2collapse=StatsNoPovertyBySmoke,byvar='Smoking Flag', \
                               yesdesc='in Smoking Non-Impoverished HH',nodesc='in Non-Smoking Non-Impoverished HH', \
                               compdesc="for Smoking Status in Non-Impoverished HH")

compare_nopoverty_bysmoking.head()

Unnamed: 0,Year,Count in Smoking Non-Impoverished HH,Asthma Rate in Smoking Non-Impoverished HH,Count Asthma in Smoking Non-Impoverished HH,Asthmatic Child Difference P-Value for Smoking Status in Non-Impoverished HH,Asthmatic Child Difference Significant for Smoking Status in Non-Impoverished HH,Autism Rate in Smoking Non-Impoverished HH,Count Autism in Smoking Non-Impoverished HH,Autistic Child Difference P-Value for Smoking Status in Non-Impoverished HH,Autistic Child Difference Significant for Smoking Status in Non-Impoverished HH,...,Asthma Rate in Non-Smoking Non-Impoverished HH,Count Asthma in Non-Smoking Non-Impoverished HH,Autism Rate in Non-Smoking Non-Impoverished HH,Count Autism in Non-Smoking Non-Impoverished HH,ADHD/ADD Rate in Non-Smoking Non-Impoverished HH,Count ADHD/ADD in Non-Smoking Non-Impoverished HH,Count Concentration/Emotional/Behavior in Non-Smoking Non-Impoverished HH,Concentration/Emotional/Behavior Rate in Non-Smoking Non-Impoverished HH,Percent in Smoking Non-Impoverished HH,Percent in Non-Smoking Non-Impoverished HH
0,2017,832,15.144231,126,0.003555,Yes,3.485577,29,0.00869,Yes,...,12.257187,631,2.039627,105,6.371406,328,870,16.899767,13.913043,86.086957


In [13]:
comparisons_2017 = pd.merge(compare_smoke, \
                            pd.merge(compare_poverty, \
                                     pd.merge(compare_poverty_bysmoking,compare_nopoverty_bysmoking,on=['Year']), \
                                     on=['Year']), \
                            on=['Year'])
comparisons_2017.head()

Unnamed: 0,Year,Count in Smoking HH,Asthma Rate in Smoking HH,Count Asthma in Smoking HH,Asthmatic Child Difference P-Value for Smoking Status_x,Asthmatic Child Difference Significant for Smoking Status_x,Autism Rate in Smoking HH,Count Autism in Smoking HH,Autistic Child Difference P-Value for Smoking Status_x,Autistic Child Difference Significant for Smoking Status_x,...,Asthma Rate in Non-Smoking Non-Impoverished HH,Count Asthma in Non-Smoking Non-Impoverished HH,Autism Rate in Non-Smoking Non-Impoverished HH,Count Autism in Non-Smoking Non-Impoverished HH,ADHD/ADD Rate in Non-Smoking Non-Impoverished HH,Count ADHD/ADD in Non-Smoking Non-Impoverished HH,Count Concentration/Emotional/Behavior in Non-Smoking Non-Impoverished HH,Concentration/Emotional/Behavior Rate in Non-Smoking Non-Impoverished HH,Percent in Smoking Non-Impoverished HH,Percent in Non-Smoking Non-Impoverished HH
0,2017,1106,16.003617,177,0.003555,Yes,3.526221,39,0.00869,Yes,...,12.257187,631,2.039627,105,6.371406,328,870,16.899767,13.913043,86.086957
