# NHIS

**Notable assumptions being made**
* Sample data from NHIS has a similar distribution to the general US population (e.g. income class, region)

In [6]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from scipy import stats
import statsmodels.api as sm

import os
        
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from xgboost import plot_importance
from xgboost import plot_tree

In [7]:
# Raise pandas' display limits so long/wide frames are shown without truncation.
pd.options.display.max_rows = 500     # show up to 500 rows
pd.options.display.max_columns = 100  # show up to 100 columns


In [8]:
# Read the raw survey files once; the *_orig frames are kept untouched as a
# read-only reference and all later work happens on the copies below.
dfc22_orig = pd.read_csv('source/child22.csv')
dfa22_orig = pd.read_csv('source/adult22.csv')

# Working child frame: an independent copy restricted to the columns we plan to use.
dfc22 = dfc22_orig.loc[:, [
    # record / household / demographic fields
    "RECTYPE", "SRVY_YR", "HHX", "REGION", "INTV_MON", "HHSTAT_C", "RELTIVP_C",
    "SEX_C", "AGEP_C", "PCNTLT18TC", "PCNT18UPTC", "PCNTADLT_C", "PCNTKIDS_C",
    "MLTFAMFLG_C",
    # *EV_C / *NW_C pairs
    "ADHDEV_C", "ADHDNW_C", "IDEV1_C", "IDNW1_C", "ASDEV_C", "ASDNW_C",
    "DDEV_C", "DDNW_C", "LDEV_C", "LDNW_C",
    # *DF_C / UNDRST* fields
    "PICKUPDF_C", "SELFCAREDF_C", "UNDRSTYOU_C", "UNDRSTCHD_C", "UNDRSTIHH_C",
    "UNDRSTOHH_C", "LEARNDF_C", "REMEMBERDF_C",
    # *FREQ_C fields
    "ANXFREQ_C", "DEPFREQ_C",
    # BEHDF* fields
    "BEHDFPLYG_C", "BEHDFCNTR_C", "BEHDFFCS_C", "BEHDFCHG_C", "BEHDFMKFR_C",
    # BSC* fields
    "BSCNWPPL_C", "BSCNWPLCS_C", "BSCCHG_C", "BSCHLOPPL_C", "BSCCRYALT_C",
    "BSCCLMDWN_C", "BSCFUSSY_C", "BSCSTHE_C", "BSCSCHD_C", "BSCPTSLP_C",
    "BSCSTYSLP_C", "BSCPRLKSL_C",
    # SCHSPED* / MH* / support fields
    "SCHSPEDEV_C", "SCHSPED_C", "SCHSPEDEM_C", "MHRX_C", "MHTHRPY_C",
    "SUPPORT_C", "COMSUPPORT_C",
    # SDQ items 1-25
    "SDQ1_C", "SDQ2_C", "SDQ3_C", "SDQ4_C", "SDQ5_C", "SDQ6_C", "SDQ7_C",
    "SDQ8_C", "SDQ9_C", "SDQ10_C", "SDQ11_C", "SDQ12_C", "SDQ13_C", "SDQ14_C",
    "SDQ15_C", "SDQ16_C", "SDQ17_C", "SDQ18_C", "SDQ19_C", "SDQ20_C", "SDQ21_C",
    "SDQ22_C", "SDQ23_C", "SDQ24_C", "SDQ25_C",
    # SDQIMP items 1-8
    "SDQIMP1_C", "SDQIMP2_C", "SDQIMP3_C", "SDQIMP4_C", "SDQIMP5_C",
    "SDQIMP6_C", "SDQIMP7_C", "SDQIMP8_C",
    # remaining SDQ* columns
    "SDQEMOT_C", "SDQCOND_C", "SDQHYPE_C", "SDQPEER_C", "SDQPROS_C",
    "SDQTOT_C", "SDQIMPTOT_C",
    # POVRATTC_C / RATCAT_C
    "POVRATTC_C", "RATCAT_C",
]].copy()

# Working adult frame: a full (deep) copy with every column retained.
dfa22 = dfa22_orig.copy(deep=True)


In [10]:
# Load the column description table for the child file so variable names can be
# looked up alongside results.
cookbook_child = pd.read_csv(filepath_or_buffer='source/child-summary.csv')

In [11]:
# Report (rows, columns) of both working frames.
child_shape = dfc22.shape
adult_shape = dfa22.shape
print("Child shape: {} \nAdult shape: {}".format(child_shape, adult_shape))

Child shape: (7464, 100) 
Adult shape: (27651, 637)


# Analysis

## Missing/null values

In [12]:
#Checking null values & percentages

num_of_rows = dfc22.shape[0]

# One row per dfc22 column: the column name lands in 'index', its null count in 'nulls'.
dfc_nullcheck = dfc22.isna().sum().rename('nulls').reset_index()

# Despite the name, 'percentage' is the null count as a fraction of all rows (0-1).
dfc_nullcheck['percentage'] = dfc_nullcheck['nulls'] / num_of_rows

# Attach descriptions by variable NAME rather than by row position. Assigning the
# result of an inner merge back by position shifts every later description onto
# the wrong variable as soon as one column is missing from the cookbook or a
# 'Variable Name' appears twice. With a keyed lookup, variables without a
# cookbook entry simply get NaN.
description_by_variable = (
    cookbook_child
    .drop_duplicates(subset='Variable Name')  # keep the first entry per variable
    .set_index('Variable Name')['Description']
)
dfc_nullcheck['description'] = dfc_nullcheck['index'].map(description_by_variable)

# Show the columns with the most missing data first.
dfc_nullcheck.sort_values('percentage',ascending=False)

Unnamed: 0,index,nulls,percentage,description
17,IDNW1_C,7346,0.984191,Currently has intellectual disability
19,ASDNW_C,7220,0.96731,Currently has autism
21,DDNW_C,7091,0.950027,Currently has developmental delay
23,LDNW_C,6953,0.931538,Currently has learning disability
15,ADHDNW_C,6781,0.908494,Currently has ADD/ADHD
53,SCHSPEDEM_C,6742,0.903269,Currently receives services for mental health
42,BSCHLOPPL_C,6633,0.888666,Held by other people
39,BSCNWPPL_C,6633,0.888666,Hard time with new people
40,BSCNWPLCS_C,6633,0.888666,Hard time in new places
50,BSCPRLKSL_C,6633,0.888666,Hard to get sleep due to SC


### Observations/Notes

* Questions such as having ADHD or autism have a very high % of data missing (vetted against NHIS' original documentation). After a quick analysis, we determined that the "Currently has autism" question is skipped if the respondent answered No to the "Ever had autism" question. Therefore we can assume null = No for the "Currently has autism" question.
* SDQ questions only happen every 3 years, so with the new design we only have 2019 and 2022


## BSC* fields

In [13]:
# Put the first 10 rows where BSCHLOPPL_C is filled in on top of the first 10
# rows where it is blank, for side-by-side comparison.
bsc_missing = dfc22['BSCHLOPPL_C'].isna()
pd.concat([dfc22.loc[~bsc_missing].head(10), dfc22.loc[bsc_missing].head(10)], axis=0)




Unnamed: 0,RECTYPE,SRVY_YR,HHX,REGION,INTV_MON,HHSTAT_C,RELTIVP_C,SEX_C,AGEP_C,PCNTLT18TC,PCNT18UPTC,PCNTADLT_C,PCNTKIDS_C,MLTFAMFLG_C,ADHDEV_C,ADHDNW_C,IDEV1_C,IDNW1_C,ASDEV_C,ASDNW_C,DDEV_C,DDNW_C,LDEV_C,LDNW_C,PICKUPDF_C,SELFCAREDF_C,UNDRSTYOU_C,UNDRSTCHD_C,UNDRSTIHH_C,UNDRSTOHH_C,LEARNDF_C,REMEMBERDF_C,ANXFREQ_C,DEPFREQ_C,BEHDFPLYG_C,BEHDFCNTR_C,BEHDFFCS_C,BEHDFCHG_C,BEHDFMKFR_C,BSCNWPPL_C,BSCNWPLCS_C,BSCCHG_C,BSCHLOPPL_C,BSCCRYALT_C,BSCCLMDWN_C,BSCFUSSY_C,BSCSTHE_C,BSCSCHD_C,BSCPTSLP_C,BSCSTYSLP_C,BSCPRLKSL_C,SCHSPEDEV_C,SCHSPED_C,SCHSPEDEM_C,MHRX_C,MHTHRPY_C,SUPPORT_C,COMSUPPORT_C,SDQ1_C,SDQ2_C,SDQ3_C,SDQ4_C,SDQ5_C,SDQ6_C,SDQ7_C,SDQ8_C,SDQ9_C,SDQ10_C,SDQ11_C,SDQ12_C,SDQ13_C,SDQ14_C,SDQ15_C,SDQ16_C,SDQ17_C,SDQ18_C,SDQ19_C,SDQ20_C,SDQ21_C,SDQ22_C,SDQ23_C,SDQ24_C,SDQ25_C,SDQIMP1_C,SDQIMP2_C,SDQIMP3_C,SDQIMP4_C,SDQIMP5_C,SDQIMP6_C,SDQIMP7_C,SDQIMP8_C,SDQEMOT_C,SDQCOND_C,SDQHYPE_C,SDQPEER_C,SDQPROS_C,SDQTOT_C,SDQIMPTOT_C,POVRATTC_C,RATCAT_C
7,20,2022,H019456,4,1,1,2,2,1,2,3,3,2,2,,,2,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,2.0,3.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.62,11
8,20,2022,H006250,4,1,1,1,2,1,2,2,2,2,2,,,2,,,,2,,,,,,,,,,,,,,,,,,,2.0,2.0,1.0,3.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.63,11
33,20,2022,H015664,4,1,1,1,2,1,2,1,1,2,2,,,2,,,,2,,,,,,,,,,,,,,,,,,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.63,2
36,20,2022,H025236,4,1,1,1,2,0,1,2,2,1,2,,,2,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.0,14
46,20,2022,H046500,4,1,1,1,2,1,1,2,2,1,2,,,2,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.66,11
47,20,2022,H038806,4,1,1,1,1,0,1,2,2,1,2,,,2,,,,2,,,,,,,,,,,,,,,,,,,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,2.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.96,14
50,20,2022,H002157,4,1,1,1,1,0,2,3,2,2,1,,,2,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.91,13
52,20,2022,H028273,4,1,1,1,1,0,1,3,3,1,2,,,2,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.99,9
56,20,2022,H040247,4,1,1,1,2,0,1,3,3,1,2,,,2,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.63,9
64,20,2022,H016056,4,1,1,1,2,1,1,3,3,1,2,,,2,,,,2,,,,,,,,,,,,,,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.58,2


### Observations
* Records that have the BSC fields filled in are for children who are 1 or younger (possibly 2). 

In [14]:
# Checking the NA value differences per age

# Count nulls per column within each AGEP_C value. Grouping the boolean isna()
# mask by the age Series keeps the AGEP_C column in the result and avoids
# groupby.apply, whose handling of the grouping column is deprecated in
# pandas 2.2. It is also vectorized rather than one lambda call per group.
dfc22.isna().groupby(dfc22['AGEP_C']).sum()

Unnamed: 0_level_0,RECTYPE,SRVY_YR,HHX,REGION,INTV_MON,HHSTAT_C,RELTIVP_C,SEX_C,AGEP_C,PCNTLT18TC,PCNT18UPTC,PCNTADLT_C,PCNTKIDS_C,MLTFAMFLG_C,ADHDEV_C,ADHDNW_C,IDEV1_C,IDNW1_C,ASDEV_C,ASDNW_C,DDEV_C,DDNW_C,LDEV_C,LDNW_C,PICKUPDF_C,SELFCAREDF_C,UNDRSTYOU_C,UNDRSTCHD_C,UNDRSTIHH_C,UNDRSTOHH_C,LEARNDF_C,REMEMBERDF_C,ANXFREQ_C,DEPFREQ_C,BEHDFPLYG_C,BEHDFCNTR_C,BEHDFFCS_C,BEHDFCHG_C,BEHDFMKFR_C,BSCNWPPL_C,BSCNWPLCS_C,BSCCHG_C,BSCHLOPPL_C,BSCCRYALT_C,BSCCLMDWN_C,BSCFUSSY_C,BSCSTHE_C,BSCSCHD_C,BSCPTSLP_C,BSCSTYSLP_C,BSCPRLKSL_C,SCHSPEDEV_C,SCHSPED_C,SCHSPEDEM_C,MHRX_C,MHTHRPY_C,SUPPORT_C,COMSUPPORT_C,SDQ1_C,SDQ2_C,SDQ3_C,SDQ4_C,SDQ5_C,SDQ6_C,SDQ7_C,SDQ8_C,SDQ9_C,SDQ10_C,SDQ11_C,SDQ12_C,SDQ13_C,SDQ14_C,SDQ15_C,SDQ16_C,SDQ17_C,SDQ18_C,SDQ19_C,SDQ20_C,SDQ21_C,SDQ22_C,SDQ23_C,SDQ24_C,SDQ25_C,SDQIMP1_C,SDQIMP2_C,SDQIMP3_C,SDQIMP4_C,SDQIMP5_C,SDQIMP6_C,SDQIMP7_C,SDQIMP8_C,SDQEMOT_C,SDQCOND_C,SDQHYPE_C,SDQPEER_C,SDQPROS_C,SDQTOT_C,SDQIMPTOT_C,POVRATTC_C,RATCAT_C
AGEP_C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,417,417,0,417,417,417,0,413,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,0,0,0,0,0,0,0,0,0,0,0,0,0,410,412,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,417,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,414,414,0,413,414,414,0,399,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,0,0,0,0,0,0,0,0,0,0,0,0,0,402,409,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,414,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,362,0,358,0,351,0,339,0,355,0,362,0,0,362,362,0,362,362,362,0,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,0,334,342,0,0,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,362,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,385,0,386,0,381,0,362,0,380,0,389,0,0,389,389,0,389,389,389,0,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,0,358,366,0,0,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,389,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,378,0,375,0,371,0,355,0,372,0,384,0,0,384,384,0,384,384,384,0,384,384,384,384,384,384,384,384,384,384,384,384,384,384,384,384,0,347,354,0,0,384,384,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,277,277,277,277,277,277,277,0,0,0,0,0,0,277,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,415,0,421,0,404,0,398,0,412,425,0,425,425,0,0,0,0,0,0,425,0,0,0,0,425,425,425,425,425,425,425,425,425,425,425,425,0,383,395,0,0,425,425,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,307,307,307,307,307,307,307,0,0,0,0,0,0,307,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,368,0,392,0,388,0,382,0,379,397,0,397,397,0,0,0,0,0,0,397,0,0,0,0,397,397,397,397,397,397,397,397,397,397,397,397,0,345,359,0,0,397,397,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,263,263,263,263,263,263,263,0,0,0,0,0,0,263,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,354,0,378,0,366,0,349,0,353,380,0,380,380,0,0,0,0,0,0,380,0,0,0,0,380,380,380,380,380,380,380,380,380,380,380,380,0,321,341,0,0,380,380,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,264,264,264,264,264,264,264,0,0,0,0,0,0,264,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,306,0,349,0,342,0,337,0,321,353,0,353,353,0,0,0,0,0,0,353,0,0,0,0,353,353,353,353,353,353,353,353,353,353,353,353,0,295,308,0,0,353,353,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,236,236,236,236,236,236,236,0,0,0,0,0,0,236,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,364,0,402,0,389,0,387,0,370,408,0,408,408,0,0,0,0,0,0,408,0,0,0,0,408,408,408,408,408,408,408,408,408,408,408,408,0,334,357,0,0,408,408,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,272,272,272,272,272,272,272,0,0,0,0,0,0,272,0,0


As suspected, different questions are answered based on age. 
There seem to be different sets of questions for the following age groups: 
* 0-1
* 2-4
* 5-17 (SUPPORT_C and COMSUPPORT_C are entirely null for the younger ages in the table above, so they are presumably only asked from around age 12 up, due to an age limit)

## [Working analysis]

In [15]:
# Frequency of each ASDEV_C answer code (nulls are not counted).
dfc22.value_counts(subset=['ASDEV_C'])

ASDEV_C
2.0        6370
1.0         244
9.0          11
7.0           7
8.0           1
Name: count, dtype: int64

In [16]:
# Frequency of each ASDNW_C answer code (nulls are not counted).
dfc22.value_counts(subset=['ASDNW_C'])

ASDNW_C
1.0        231
2.0         10
9.0          3
Name: count, dtype: int64