# label encoder

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("insurance.csv")
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [3]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# df['smoker'].value_counts

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
lb = LabelEncoder() 

In [7]:
df['sex'] = lb.fit_transform(df['sex'])
df['smoker'] = lb.fit_transform(df['smoker'])
df['region'] = lb.fit_transform(df['region'])

In [8]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


# ordinal encoder

In [9]:
df = pd.read_csv("insurance.csv")
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [10]:
df = df.drop(columns =['age','bmi','children','charges'])

In [11]:
df['smoker'].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [12]:
from sklearn.preprocessing import OrdinalEncoder

In [13]:
oe = OrdinalEncoder(categories = [['male' , 'female'],
                                 ['no' , 'yes'],
                                 ['southeast' ,'southwest' , 'northwest' , 'northeast']])

In [14]:
oe_sc = oe.fit_transform(df)

In [15]:
new_df = pd.DataFrame(oe_sc , columns = df.columns)
new_df.head(3)

Unnamed: 0,sex,smoker,region
0,1.0,1.0,1.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0


# one hot encoder

In [28]:
df = df.drop(columns =['age','bmi','children','charges'])

In [29]:
df.head()

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest


In [30]:
from sklearn.preprocessing import OneHotEncoder 

In [31]:
ohe = OneHotEncoder(drop = 'first' , sparse_output = False , dtype = np.int32)

In [36]:
df_new = ohe.fit_transform(df[['sex','smoker','region']])

In [37]:
df_new

array([[0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0],
       ...,
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 1, 1, 0, 0]], shape=(1338, 5), dtype=int32)

# get_dummis

In [16]:
df = pd.read_csv("insurance.csv")
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [17]:
df_new = pd.get_dummies(df , drop_first =True , columns = ['sex' , 'smoker' , 'region' ])

In [18]:
df_new.astype(int)

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27,0,16884,0,1,0,0,1
1,18,33,1,1725,1,0,0,1,0
2,28,33,3,4449,1,0,0,1,0
3,33,22,0,21984,1,0,1,0,0
4,32,28,0,3866,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30,3,10600,1,0,1,0,0
1334,18,31,0,2205,0,0,0,0,0
1335,18,36,0,1629,0,0,0,1,0
1336,21,25,0,2007,0,0,0,0,1


In [19]:
x = df_new.drop(columns = ['charges']) 
y = df_new['charges'] 

In [20]:
from sklearn.model_selection import train_test_split 

In [21]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [22]:
from sklearn.preprocessing import StandardScaler 

In [23]:
sc = StandardScaler() 

In [24]:
x_train

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
560,46,19.950,2,False,False,True,False,False
1285,47,24.320,0,False,False,False,False,False
1142,52,24.860,0,False,False,False,True,False
969,39,34.320,5,False,False,False,True,False
486,54,21.470,3,False,False,True,False,False
...,...,...,...,...,...,...,...,...
1095,18,31.350,4,False,False,False,False,False
1130,39,23.870,5,False,False,False,True,False
1294,58,25.175,0,True,False,False,False,False
860,37,47.600,2,False,True,False,False,True


In [25]:
x_train_sc = sc.fit_transform(x_train)

In [26]:
x_train_new = pd.DataFrame(x_train_sc , columns = x_train.columns)

In [27]:
np.round(x_train_new.describe() , 2)

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
count,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0
mean,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,0.0
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.52,-2.42,-0.91,-1.02,-0.51,-0.56,-0.6,-0.57
25%,-0.88,-0.72,-0.91,-1.02,-0.51,-0.56,-0.6,-0.57
50%,0.01,-0.06,-0.09,0.98,-0.51,-0.56,-0.6,-0.57
75%,0.83,0.65,0.73,0.98,-0.51,-0.56,1.67,-0.57
max,1.75,3.74,3.2,0.98,1.97,1.78,1.67,1.75
