In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("D://DataSet//covid_toy.csv")

In [3]:
df.head(5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


## Handling missing values

In [5]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [6]:
numerical_list = [val for val in df.select_dtypes(include=["int64","float64"])]

object_list = [val for val in df.select_dtypes(include="object")]

In [7]:
missing = [val for val in df.columns if df[val].isnull().sum()>0 ]

from sklearn.impute import SimpleImputer

for i in missing:
    if i in numerical_list:
        num_si = SimpleImputer(strategy="mean")
        df[i] = num_si.fit_transform(df[[i]])
    else:
        cat_si = SimpleImputer(strategy="most_frequent")
        df[object_list] = cat_si.fit_transform(df[object_list])

In [8]:
df.isnull().sum()

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

## LabelEncoding

In [10]:
df1 = df.copy()

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in object_list:
    df1[i] = le.fit_transform(df1[i])

In [12]:
df1.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,1,103.0,0,2,0
1,27,1,100.0,0,1,1
2,42,1,101.0,0,1,0
3,31,0,98.0,0,2,0
4,65,0,101.0,0,3,0


# OrdinalEncoding

In [14]:
df2 = df.copy()

In [15]:
df2 = df2.drop(columns = numerical_list)

In [16]:
from sklearn.preprocessing import OrdinalEncoder

In [17]:
oe = OrdinalEncoder(categories = [
    ["Male","Female"], 
    ["Mild","Strong"], 
    ["Kolkata","Bangalore","Delhi","Mumbai"], 
    ["No","Yes"]
])

In [18]:
df2_transform = oe.fit_transform(df2)  #array

In [27]:
df2_cat  = pd.DataFrame(df2_transform, columns = object_list)

In [29]:
df2_num  = pd.DataFrame(df, columns = numerical_list)

In [31]:
df2 = pd.concat([df2_num,df2_cat],axis = 1)
df2

Unnamed: 0,age,fever,gender,cough,city,has_covid
0,60,103.0,0.0,0.0,0.0,0.0
1,27,100.0,0.0,0.0,2.0,1.0
2,42,101.0,0.0,0.0,2.0,0.0
3,31,98.0,1.0,0.0,0.0,0.0
4,65,101.0,1.0,0.0,3.0,0.0
...,...,...,...,...,...,...
95,12,104.0,1.0,0.0,1.0,0.0
96,51,101.0,1.0,1.0,0.0,1.0
97,20,101.0,1.0,0.0,1.0,0.0
98,5,98.0,1.0,1.0,3.0,0.0


## OneHotEncoding

In [37]:
df3 = df.copy()

In [39]:
from sklearn.preprocessing import OneHotEncoder

In [41]:
ohe = OneHotEncoder(drop="first", sparse_output=False, dtype=np.int32)
arr_ohe = ohe.fit_transform(df)
arr_ohe

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 1, 0, 1]])

## get_dummies

In [44]:
df4 = df.copy()

In [54]:
gd = pd.get_dummies(df4, columns=object_list, drop_first=True)
df_gd = gd.astype(int)
df_gd

Unnamed: 0,age,fever,gender_Male,cough_Strong,city_Delhi,city_Kolkata,city_Mumbai,has_covid_Yes
0,60,103,1,0,0,1,0,0
1,27,100,1,0,1,0,0,1
2,42,101,1,0,1,0,0,0
3,31,98,0,0,0,1,0,0
4,65,101,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...
95,12,104,0,0,0,0,0,0
96,51,101,0,1,0,1,0,1
97,20,101,0,0,0,0,0,0
98,5,98,0,1,0,0,1,0


In [61]:
df = df1

## train_test_split

In [66]:
target_col = input("Enter the target column : ")

from sklearn.model_selection import train_test_split

x = df.drop(columns = [target_col])
y = df[target_col]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Enter the target column :  has_covid


In [68]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train)
x_train_sc_new = pd.DataFrame(x_train_sc)
np.round(x_train_sc_new.describe())

Unnamed: 0,0,1,2,3,4
count,80.0,80.0,80.0,80.0,80.0
mean,-0.0,0.0,-0.0,-0.0,-0.0
std,1.0,1.0,1.0,1.0,1.0
min,-2.0,-1.0,-2.0,-1.0,-1.0
25%,-1.0,-1.0,-1.0,-1.0,-1.0
50%,-0.0,-1.0,0.0,-1.0,-0.0
75%,1.0,1.0,1.0,1.0,1.0
max,2.0,1.0,2.0,1.0,2.0


In [70]:
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()
x_train_mm = mm.fit_transform(x_train)
x_train_mn_new = pd.DataFrame(x_train_mm)
np.round(x_train_mn_new.describe())

Unnamed: 0,0,1,2,3,4
count,80.0,80.0,80.0,80.0,80.0
mean,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0
