In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [6]:
# Read the toy COVID dataset (expected next to this notebook) into a DataFrame
# and show it as the cell's output.
df = pd.read_csv(filepath_or_buffer='covid_toy.csv')
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
# Count missing values per column (sum the boolean null-mask down the rows).
df.isnull().sum(axis=0)

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the target column `has_covid` is
# separated from the feature columns. A fixed random_state makes the split
# (and therefore every downstream shape/result) reproducible when the
# notebook is re-run from a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Display the training features produced by the split above.
X_train

Unnamed: 0,age,gender,fever,cough,city
23,80,Female,98.0,Mild,Delhi
86,25,Male,104.0,Mild,Bangalore
1,27,Male,100.0,Mild,Delhi
18,64,Female,98.0,Mild,Bangalore
62,56,Female,104.0,Strong,Bangalore
...,...,...,...,...,...
61,81,Female,98.0,Strong,Mumbai
21,73,Male,98.0,Mild,Bangalore
48,66,Male,99.0,Strong,Bangalore
35,82,Female,102.0,Strong,Bangalore


## 1. Without Column Transformer

In [10]:
# Fill the missing values in the `fever` column with a SimpleImputer
# (default settings), learning the fill value from the training split only.
si = SimpleImputer()
X_train_fever = si.fit_transform(X_train[['fever']])

# Apply the already-fitted imputer to the test split. Calling fit_transform
# here would re-learn the fill value from the test data (data leakage) and
# make train and test preprocessing inconsistent.
X_test_fever = si.transform(X_test[['fever']])
print(X_train)

# Imputed training column: one row per training sample, a single column.
X_train_fever.shape

    age  gender  fever   cough       city
23   80  Female   98.0    Mild      Delhi
86   25    Male  104.0    Mild  Bangalore
1    27    Male  100.0    Mild      Delhi
18   64  Female   98.0    Mild  Bangalore
62   56  Female  104.0  Strong  Bangalore
..  ...     ...    ...     ...        ...
61   81  Female   98.0  Strong     Mumbai
21   73    Male   98.0    Mild  Bangalore
48   66    Male   99.0  Strong  Bangalore
35   82  Female  102.0  Strong  Bangalore
83   17  Female  104.0    Mild    Kolkata

[80 rows x 5 columns]


(80, 1)

In [11]:
# Ordinal-encode `cough` with an explicit category order: Mild -> 0, Strong -> 1.
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# Reuse the encoder fitted on the training split; preprocessing must never
# be re-fitted on test data.
X_test_cough = oe.transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [12]:
# One-hot encode `gender` and `city`, dropping the first level of each feature
# and returning a dense array.
# NOTE: scikit-learn >= 1.2 renames the `sparse` keyword to `sparse_output`;
# switch the keyword when upgrading the environment.
ohe = OneHotEncoder(drop='first',sparse=False)
X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

# Encode the test split with the categories learned from the training split.
# Re-fitting on the test data would derive the categories from the test rows
# alone, so the number/order of output columns could differ from the training
# matrix whenever a category is absent from the smaller test split.
X_test_gender_city = ohe.transform(X_test[['gender','city']])

X_train_gender_city.shape



(80, 4)

In [14]:
# `age` needs no preprocessing: remove every other feature column and keep
# what is left as a 2-D NumPy array so it can be stacked with the rest.
X_train_age = X_train.drop(['gender', 'fever', 'cough', 'city'], axis=1).values

# Same extraction for the test split.
X_test_age = X_test.drop(['gender', 'fever', 'cough', 'city'], axis=1).values

X_train_age

array([[80],
       [25],
       [27],
       [64],
       [56],
       [70],
       [75],
       [ 5],
       [69],
       [60],
       [82],
       [51],
       [47],
       [81],
       [40],
       [55],
       [42],
       [ 5],
       [42],
       [73],
       [14],
       [12],
       [24],
       [12],
       [20],
       [83],
       [11],
       [65],
       [82],
       [14],
       [84],
       [49],
       [ 5],
       [71],
       [54],
       [10],
       [20],
       [72],
       [22],
       [50],
       [13],
       [83],
       [19],
       [68],
       [69],
       [34],
       [27],
       [26],
       [15],
       [69],
       [38],
       [47],
       [38],
       [44],
       [34],
       [75],
       [19],
       [33],
       [11],
       [16],
       [60],
       [51],
       [59],
       [80],
       [23],
       [64],
       [49],
       [34],
       [64],
       [34],
       [46],
       [31],
       [74],
       [19],
       [ 8],
       [81],
       [73],

In [17]:
# Stack the separately processed feature blocks side by side (column-wise):
# age, imputed fever, one-hot gender/city, ordinal cough.
X_train_transformed = np.hstack(
    (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
)
# Same column order for the test split.
X_test_transformed = np.hstack(
    (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)
)

X_train_transformed.shape

(80, 7)

## 2. Using Column Transformer

In [18]:
from sklearn.compose import ColumnTransformer

In [20]:
# A single ColumnTransformer replaces the manual per-column steps:
#   tnf1 -> impute missing values in `fever`
#   tnf2 -> ordinal-encode `cough` with the order Mild, Strong
#   tnf3 -> one-hot encode `gender` and `city`, dropping the first level of each
# Columns not listed in any step are passed through unchanged.
transformer = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('tnf1', SimpleImputer(), ['fever']),
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        ('tnf3', OneHotEncoder(drop='first', sparse=False), ['gender', 'city']),
    ],
)

In [21]:
# Fit every step on the training split, transform it, and report the shape.
transformer.fit_transform(X=X_train).shape



(80, 7)

In [22]:
# Apply the already-fitted transformer to the test split (no re-fitting) and
# report the shape.
transformer.transform(X=X_test).shape

(20, 7)