# ordinal encoding

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the insurance dataset and preview its first three rows.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
df.head(n=3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [3]:
# Keep only the categorical columns that will be encoded.
# Selecting them by name (rather than dropping the numeric columns) leaves the
# same three columns in the same order, and the cell can be re-run safely:
# dropping columns that are already gone raises a KeyError.
df = df[['sex', 'smoker', 'region']].copy()

In [4]:
from sklearn.preprocessing import OrdinalEncoder

In [11]:
# One category list per input column, in column order; a value's position
# inside its list is the integer code it is encoded to.
oe = OrdinalEncoder(
    categories=[
        ['male', 'female'],                                    # sex
        ['no', 'yes'],                                         # smoker
        ['southeast', 'southwest', 'northwest', 'northeast'],  # region
    ]
)

In [12]:
# The encoder's `categories` lists are matched to the input columns by position,
# so name the columns explicitly (sex, smoker, region) instead of relying on
# whatever column order `df` happens to have.
oe_sc = oe.fit_transform(df[['sex', 'smoker', 'region']])

In [13]:
# Wrap the encoded array back into a DataFrame. Reusing both the column labels
# and the row index of `df` keeps the result aligned with the source frame even
# if `df` does not have a default 0..n-1 index.
new_df = pd.DataFrame(oe_sc, columns=df.columns, index=df.index)
new_df.head(3)

Unnamed: 0,sex,smoker,region
0,1.0,1.0,1.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0


# One-hot encoding

In [26]:
# Reload the raw insurance data so this section starts from the unmodified file.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
df.head(n=3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [27]:
# Keep only the categorical columns that will be one-hot encoded.
# Selecting them by name (rather than dropping the numeric columns) leaves the
# same three columns in the same order, and the cell can be re-run safely:
# dropping columns that are already gone raises a KeyError.
df = df[['sex', 'smoker', 'region']].copy()

In [28]:
# Preview the remaining categorical columns.
df.head(n=3)

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast


In [29]:
from sklearn.preprocessing import OneHotEncoder 

In [30]:
# One-hot encoder configured to return a dense int32 array and to drop the
# first category of each feature.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)

In [32]:
# Fit the encoder on the three categorical columns and encode them in one step.
df_new = ohe.fit_transform(df.loc[:, ['sex', 'smoker', 'region']])

In [33]:
# Display the one-hot encoded result returned by the encoder.
df_new

array([[0, 1, 0, 0, 1],
       [1, 0, 0, 1, 0],
       [1, 0, 0, 1, 0],
       ...,
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1],
       [0, 1, 1, 0, 0]], shape=(1338, 5), dtype=int32)

# pandas `get_dummies`

In [34]:
# One-hot encode with pandas, dropping the first level of each column.
# `dtype=int` stores the dummy columns as integers in `df_new` itself; without
# it the columns keep pandas' default dummy dtype and an integer view only
# exists where the frame is explicitly cast for display.
df_new = pd.get_dummies(
    df,
    columns=['sex', 'smoker', 'region'],
    drop_first=True,
    dtype=int,
)

In [35]:
# Show the dummy columns cast to integers (display only; `df_new` is not reassigned).
df_new.astype(dtype=int)

Unnamed: 0,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,0,1,0,0,1
1,1,0,0,1,0
2,1,0,0,1,0
3,1,0,1,0,0
4,1,0,1,0,0
...,...,...,...,...,...
1333,1,0,1,0,0
1334,0,0,0,0,0
1335,0,0,0,1,0
1336,0,0,0,0,1


# function transformer

In [38]:
# Load the toy COVID dataset and preview its first three rows.
df = pd.read_csv(filepath_or_buffer="covid_toy.csv")
df.head(n=3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [39]:
# Count missing values in each column before imputation.
df.isnull().sum(axis=0)

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [40]:
from sklearn.impute import SimpleImputer

In [41]:
# Mean imputation, written out explicitly ("mean" is SimpleImputer's default strategy).
si = SimpleImputer(strategy='mean')

In [42]:
# Double brackets select `fever` as a one-column DataFrame for the imputer;
# the imputed values then replace the original column.
fever_column = df[['fever']]
df['fever'] = si.fit_transform(fever_column)

In [43]:
# Re-check missing values per column after imputing `fever`.
df.isnull().sum(axis=0)

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

In [44]:
# Preview the first three rows after imputation.
df.head(n=3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [45]:
from sklearn.preprocessing import LabelEncoder

In [46]:
# Encoder that maps each distinct label to an integer code.
lb = LabelEncoder() 

In [47]:
# Encode each string column as integer codes, with a separate fitted encoder
# kept per column. Refitting one shared encoder overwrites its learned classes
# on every call, so only the last column's mapping would remain available for
# inverse_transform or for encoding new data.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for the target
# (y); OrdinalEncoder is the usual tool for feature columns — consider switching.
label_encoders = {}
for col in ['gender', 'cough', 'city', 'has_covid']:
    label_encoders[col] = LabelEncoder()
    df[col] = label_encoders[col].fit_transform(df[col])

# `lb` refers to the has_covid encoder so any existing use of `lb` keeps working.
lb = label_encoders['has_covid']

In [50]:
# Separate the target column from the feature columns.
y = df['has_covid']
x = df.drop(columns='has_covid')

In [51]:
from sklearn.preprocessing import FunctionTransformer

In [52]:
# Wrap np.log1p so it can be applied through the scikit-learn transformer API.
log_transform = FunctionTransformer(np.log1p)

# apply the transformation to the dataset
# fit_transform (rather than a bare transform on an unfitted object) follows the
# fit-then-transform convention used by the other transformers in this notebook.
# NOTE(review): x still contains the label-encoded gender/cough/city codes, so
# the log is applied to those integer codes as well — confirm that is intended.
X_transformed = log_transform.fit_transform(x)

In [53]:
# Display the log1p-transformed feature table.
X_transformed

Unnamed: 0,age,gender,fever,cough,city
0,4.110874,0.693147,4.644391,0.000000,1.098612
1,3.332205,0.693147,4.615121,0.000000,0.693147
2,3.761200,0.693147,4.624973,0.000000,0.693147
3,3.465736,0.000000,4.595120,0.000000,1.098612
4,4.189655,0.000000,4.624973,0.000000,1.386294
...,...,...,...,...,...
95,2.564949,0.000000,4.653960,0.000000,0.000000
96,3.951244,0.000000,4.624973,0.693147,1.098612
97,3.044522,0.000000,4.624973,0.000000,0.000000
98,1.791759,0.000000,4.595120,0.693147,1.386294
