# CapstoneThree - Pre-Processing


## Health Insurance Premium Data

Prepare data for modeling.

In [1]:
# Import necessary modules

import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# load file prepared in the "Data-Wrangling" stage

df = pd.read_excel('insurance_data.xlsx', index_col=0)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,monthly_charge,age_group,weight_status
0,19,female,27.9,0,yes,southwest,16884.924,1407.08,10s,Overweight
1,18,male,33.77,1,no,southeast,1725.5523,143.8,10s,Obese
2,28,male,33.0,3,no,southeast,4449.462,370.79,20s,Obese
3,33,male,22.705,0,no,northwest,21984.47061,1832.04,30s,Normal
4,32,male,28.88,0,no,northwest,3866.8552,322.24,30s,Overweight


### Preparing Data - Convert Categorical data to binaries

In [3]:
# convert 'sex' and 'smoker' to binary

df_prep = df[['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'monthly_charge']]
df_prep['gender'] = np.where(df['sex']=='female', 1, 0)  # female = 1, male = 0
df_prep['smoking'] = np.where(df['smoker']=='yes', 1, 0)  # smoker = 1, non-smoker = 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prep['gender'] = np.where(df['sex']=='female', 1, 0)  # female = 1, male = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_prep['smoking'] = np.where(df['smoker']=='yes', 1, 0)  # smoker = 1, non-smoker = 0


In [4]:
df_prep.drop(['sex', 'smoker'], axis = 1, inplace = True)
df_prep.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,age,bmi,children,region,monthly_charge,gender,smoking
0,19,27.9,0,southwest,1407.08,1,1
1,18,33.77,1,southeast,143.8,0,0
2,28,33.0,3,southeast,370.79,0,0
3,33,22.705,0,northwest,1832.04,0,0
4,32,28.88,0,northwest,322.24,0,0


In [5]:
# change 'region' to binary

df_dummy = pd.get_dummies(df_prep)
df_dummy.head()

Unnamed: 0,age,bmi,children,monthly_charge,gender,smoking,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,1407.08,1,1,0,0,0,1
1,18,33.77,1,143.8,0,0,0,0,1,0
2,28,33.0,3,370.79,0,0,0,0,1,0
3,33,22.705,0,1832.04,0,0,0,1,0,0
4,32,28.88,0,322.24,0,0,0,1,0,0


### Data Splitting and Scaling

In [6]:
X = df_dummy.drop(['monthly_charge'], axis = 1)
y = df_dummy['monthly_charge']

In [7]:
# split data into 80% training and 20% testing 

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2)

In [8]:
# apply StandardScaler

sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [10]:
print(X_train_sc)


[[-0.14853305  0.54530479  2.41394802 ... -0.55791981 -0.6155209
   1.70862925]
 [-1.49780784  0.59867181 -0.89219519 ... -0.55791981  1.6246402
  -0.58526447]
 [-1.14273553  0.96092064  0.76087642 ... -0.55791981 -0.6155209
  -0.58526447]
 ...
 [ 0.06451033 -0.91339361 -0.89219519 ... -0.55791981  1.6246402
  -0.58526447]
 [-1.42679338  0.77656186 -0.89219519 ...  1.79237229 -0.6155209
  -0.58526447]
 [-0.4325909  -1.97749955 -0.06565939 ... -0.55791981 -0.6155209
   1.70862925]]
