# Encoding

In [1]:
import pandas as pd
df = pd.read_csv('insurance.csv')

In [2]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
df.shape

(1338, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.2+ KB


In [6]:
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()
df4 = df.copy()
df5 = df.copy()

# Without Encoding Techniques

In [7]:
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
df1.sex.unique()

array(['female', 'male'], dtype=object)

In [9]:
df1.sex = df1.sex.replace(['female', 'male'],[0,1])

In [10]:
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


# Label Encoder

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [12]:
df1.smoker = le.fit_transform(df1.smoker)

In [13]:
df1.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


# Label Encoder using loop

In [14]:
import numpy as np
df2.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [15]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [16]:
for column in df1.columns:
    if df2[column].dtype == np.number:
        continue
    df2[column] = le.fit_transform(df2[column])

In [17]:
df2.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,1,0,27.9,0,1,3,16884.924
1,0,1,33.77,1,0,2,1725.5523
2,10,1,33.0,3,0,2,4449.462
3,15,1,22.705,0,0,1,21984.47061
4,14,1,28.88,0,0,1,3866.8552


# One-Hot-Encoder

In [18]:
df3.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [19]:
dummy = pd.get_dummies(df3['sex'], prefix = 'Encoded', drop_first = True)

In [20]:
dummy.head()

Unnamed: 0,Encoded_male
0,0
1,1
2,1
3,1
4,1


In [21]:
df3 = df3.drop(['sex'], axis = 1)

In [22]:
df3.head()

Unnamed: 0,age,bmi,children,smoker,region,charges
0,19,27.9,0,yes,southwest,16884.924
1,18,33.77,1,no,southeast,1725.5523
2,28,33.0,3,no,southeast,4449.462
3,33,22.705,0,no,northwest,21984.47061
4,32,28.88,0,no,northwest,3866.8552


In [23]:
new_df = pd.concat([df3,dummy],axis=1)

In [24]:
new_df.head()

Unnamed: 0,age,bmi,children,smoker,region,charges,Encoded_male
0,19,27.9,0,yes,southwest,16884.924,0
1,18,33.77,1,no,southeast,1725.5523,1
2,28,33.0,3,no,southeast,4449.462,1
3,33,22.705,0,no,northwest,21984.47061,1
4,32,28.88,0,no,northwest,3866.8552,1


In [25]:
y = new_df.charges
x = new_df.drop('charges', axis =1)

In [26]:
x.head()

Unnamed: 0,age,bmi,children,smoker,region,Encoded_male
0,19,27.9,0,yes,southwest,0
1,18,33.77,1,no,southeast,1
2,28,33.0,3,no,southeast,1
3,33,22.705,0,no,northwest,1
4,32,28.88,0,no,northwest,1


In [27]:
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [28]:
df4.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# One-Hot-Encoder with loop

In [66]:
for column in df4.columns:
    if df4[column].dtype == np.object:
        one = pd.get_dummies(df4[column],drop_first = True)
        df4 = pd.concat([df4,one],axis=1).drop(column,axis=1)

In [67]:
df4.head()

Unnamed: 0,age,bmi,children,charges,male,yes,northwest,southeast,southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


# Ordinal Encoding

In [31]:
df5.smoker.unique()

array(['yes', 'no'], dtype=object)

In [32]:
smoker = ['yes', 'no']
from sklearn.preprocessing import OrdinalEncoder
ordinal = OrdinalEncoder(categories=[smoker])

In [33]:
encoded = ordinal.fit_transform(df5[['smoker']])

In [34]:
en = pd.DataFrame(encoded,columns=['smoker'])

In [35]:
en.head()

Unnamed: 0,smoker
0,0.0
1,1.0
2,1.0
3,1.0
4,1.0


In [36]:
df5.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [37]:
df5 = df5.drop(['smoker'],axis=1)
df5 = pd.concat([df5,en],axis=1)

In [38]:
df5.head()

Unnamed: 0,age,sex,bmi,children,region,charges,smoker
0,19,female,27.9,0,southwest,16884.924,0.0
1,18,male,33.77,1,southeast,1725.5523,1.0
2,28,male,33.0,3,southeast,4449.462,1.0
3,33,male,22.705,0,northwest,21984.47061,1.0
4,32,male,28.88,0,northwest,3866.8552,1.0


In [39]:
df6 = df.copy()

In [40]:
df6.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Ordinal Encoding with loop

In [43]:
df7 = df.copy()

In [59]:
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [62]:
for column in df7.columns:
    if df7[column].dtype == np.object:
        unique=df7[column].unique()
        df7[column]=OrdinalEncoder(categories=[unique]).fit_transform(df7[[column]])

In [63]:
df7.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0.0,27.9,0,0.0,0.0,16884.924
1,18,1.0,33.77,1,1.0,1.0,1725.5523
2,28,1.0,33.0,3,1.0,1.0,4449.462
3,33,1.0,22.705,0,1.0,2.0,21984.47061
4,32,1.0,28.88,0,1.0,2.0,3866.8552
