# Data Preparation
### Goal : obtain a dataframe containing only columns with values ranging from 0 to 1
### Steps :
#### 1) Are there NaN values in the column? If yes, what do we replace them with? Or do we delete the entries?
#### 2) What's the data type ? Numerical, Boolean or String ? 
##### If it's a string, is it categorical? If yes, replace the values with numerical ones
##### If not categorical -> one hot encode or drop
#### 3) Then rescale the values so that they range between 0 and 1


In [12]:
import pandas as pd
import numpy as np
import py_scripts.tools as tools

import warnings
warnings.filterwarnings("ignore")

# Directory holding the CSV files, relative to this notebook.
PATH = '../Data/'
features_file = "training_set_features.csv"
label_file = "training_set_labels.csv"

# Load the labels and the features into two separate dataframes.
label_df = pd.read_csv(PATH + label_file)
feature_df = pd.read_csv(PATH + features_file)

In [13]:
# Join labels (left) and features (right) on their shared respondent_id key.
# No `how` is given, so pandas' default inner join applies.
result_df = label_df.merge(feature_df, on='respondent_id')

## h1n1_concern

In [14]:
    # ILLUSTRATION: NaN handling shown step by step on h1n1_concern
# Boolean mask marking the rows where h1n1_concern is missing
h1n1_nan_rows = result_df['h1n1_concern'].isna()
# How many entries are missing before the replacement
print(h1n1_nan_rows.sum())

# Replace the missing values.
# The helper lives in py_scripts.tools (not shown here). It receives a copy of
# the frame, the target column, and a list of columns that presumably define
# the clusters used to compute the fill value -- TODO confirm against tools.
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'h1n1_concern',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# Count again: the column should no longer contain any NaN
print(result_df['h1n1_concern'].isna().sum())

92
0


In [15]:
    # Scaling (called "standardization" in these notes; it is a division by the max)
# Dividing by the column maximum makes the largest value exactly 1.
result_df['h1n1_concern'] = result_df['h1n1_concern'].div(result_df['h1n1_concern'].max())
# The values now lie in [0, 1], provided the column has no negative values
# and its maximum is positive.

## h1n1_knowledge

In [16]:
# Step 1: run the tools helper used for NaN replacement on h1n1_knowledge
# (same grouping columns as for the other features).
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'h1n1_knowledge',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# Step 2: divide by the column maximum so the largest value becomes 1.
result_df['h1n1_knowledge'] = result_df['h1n1_knowledge'].div(result_df['h1n1_knowledge'].max())

## behavioral_antiviral_meds 

In [18]:
# Run the tools helper used for NaN replacement on behavioral_antiviral_meds.
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'behavioral_antiviral_meds',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# No rescaling step: the variable is binary, so it already sits between 0 and 1.
# NOTE(review): if the helper fills NaN with a mean, the filled entries may be
# fractional rather than strictly 0/1 -- confirm this is intended.

## behavioral_avoidance

In [19]:
# Run the tools helper used for NaN replacement on behavioral_avoidance.
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'behavioral_avoidance',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# No rescaling step: the variable is binary, so it already sits between 0 and 1.
# NOTE(review): if the helper fills NaN with a mean, the filled entries may be
# fractional rather than strictly 0/1 -- confirm this is intended.

## health_worker        

In [20]:
# Run the tools helper used for NaN replacement on health_worker.
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'health_worker',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# No rescaling step: the variable is binary, so it already sits between 0 and 1.
# NOTE(review): if the helper fills NaN with a mean, the filled entries may be
# fractional rather than strictly 0/1 -- confirm this is intended.

## health_insurance

In [21]:
# Run the tools helper used for NaN replacement on health_insurance.
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'health_insurance',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# No rescaling step: the variable is binary, so it already sits between 0 and 1.
# NOTE(review): if the helper fills NaN with a mean, the filled entries may be
# fractional rather than strictly 0/1 -- confirm this is intended.

## opinion_h1n1_vacc_effective

In [22]:
# Step 1: run the tools helper used for NaN replacement on opinion_h1n1_vacc_effective.
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'opinion_h1n1_vacc_effective',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# Step 2: divide by the column maximum so the largest value becomes 1.
# This maps the smallest value to 0 only if the column minimum is 0.
result_df['opinion_h1n1_vacc_effective'] = result_df['opinion_h1n1_vacc_effective'].div(
    result_df['opinion_h1n1_vacc_effective'].max()
)

## opinion_h1n1_risk 

In [23]:
# Step 1: run the tools helper used for NaN replacement on opinion_h1n1_risk.
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'opinion_h1n1_risk',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# Step 2: divide by the column maximum so the largest value becomes 1.
# This maps the smallest value to 0 only if the column minimum is 0.
result_df['opinion_h1n1_risk'] = result_df['opinion_h1n1_risk'].div(
    result_df['opinion_h1n1_risk'].max()
)

## opinion_h1n1_sick_from_vacc

In [24]:
# Step 1: run the tools helper used for NaN replacement on opinion_h1n1_sick_from_vacc.
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'opinion_h1n1_sick_from_vacc',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# Step 2: divide by the column maximum so the largest value becomes 1.
# This maps the smallest value to 0 only if the column minimum is 0.
result_df['opinion_h1n1_sick_from_vacc'] = result_df['opinion_h1n1_sick_from_vacc'].div(
    result_df['opinion_h1n1_sick_from_vacc'].max()
)

## opinion_seas_vacc_effective

In [25]:
# Step 1: run the tools helper used for NaN replacement on opinion_seas_vacc_effective.
result_df = tools.mean_feature_clustered_numeric(
    result_df.copy(),
    'opinion_seas_vacc_effective',
    ['age_group', 'race', 'sex', 'hhs_geo_region', 'census_msa'],
)
# Step 2: divide by the column maximum so the largest value becomes 1.
# This maps the smallest value to 0 only if the column minimum is 0.
result_df['opinion_seas_vacc_effective'] = result_df['opinion_seas_vacc_effective'].div(
    result_df['opinion_seas_vacc_effective'].max()
)

In [None]:
# Pasted notes: column names and dtypes (presumably the columns still to be prepared)
# age_group                       object
# education                       object
# race                            object
# sex                             object

# employment_status               object
# hhs_geo_region                  object
# census_msa                      object
# household_adults               float64
# household_children             float64
# employment_industry             object
# employment_occupation           object