In [13]:
#importing libraries
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [4]:
# Read the raw data from the CSV that sits next to the notebook,
# then preview the first five rows.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
df.head(n=5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [7]:
# Summarise each column's dtype and non-null count to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [8]:
# Count the missing entries in every column.
missing_per_column = df.isnull().sum()
missing_per_column

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

##### Findings from the dataset above

1. The `fever` column has 10 null values that need to be imputed.
2. The `gender` and `city` columns need to be transformed with one-hot encoding.
3. The `cough` column needs to be encoded with an ordinal encoder.

In [11]:
# Split the features (everything except the target) and the target column
# `has_covid` into an 80% training set and a 20% test set.
# random_state pins the shuffle so the same rows land in each split on every
# run; without it, Restart & Run All produces a different split each time.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['has_covid']),
    df['has_covid'],
    test_size=0.2,
    random_state=42,
)

In [15]:
# Impute the missing fever readings (SimpleImputer defaults to the column mean).
simple_imputer = SimpleImputer()

# train data: learn the fill value from the training split and apply it
X_train_fever = simple_imputer.fit_transform(X_train[['fever']])

# test data: reuse the statistic learned on the training split. Calling
# fit_transform here would re-fit on the test rows, leaking test information
# and filling train and test with different values.
X_test_fever = simple_imputer.transform(X_test[['fever']])

X_train_fever.shape

(80, 1)

In [17]:
# Encode the cough column as ordered integers: 'Mild' -> 0, 'Strong' -> 1
# (the order is taken from the explicit `categories` list).
ordinal_encoder = OrdinalEncoder(categories=[['Mild','Strong']])

# train data: fit the encoder and encode the training split
X_train_cough = ordinal_encoder.fit_transform(X_train[['cough']])

# test data: only transform with the encoder fitted on the training split;
# preprocessing must never be re-fitted on test data
X_test_cough = ordinal_encoder.transform(X_test[['cough']])

X_train_cough.shape

(80, 1)

In [18]:
# One-hot encode gender and city. drop='first' removes one dummy column per
# feature to avoid redundant (perfectly collinear) columns.
# NOTE: scikit-learn >= 1.2 renames `sparse` to `sparse_output`; switch the
# keyword when upgrading the environment.
one_hot_encoder = OneHotEncoder(drop='first',sparse=False)

# train data: learn the category set from the training split and encode it
X_train_gender_city = one_hot_encoder.fit_transform(X_train[['gender','city']])

# test data: encode with the categories learned on the training split.
# Re-fitting on the test split could yield a different number or order of
# columns whenever a category is absent from the test rows.
X_test_gender_city = one_hot_encoder.transform(X_test[['gender','city']])

X_train_gender_city.shape

(80, 4)

In [19]:
# Separate the age column (the only feature that needs no preprocessing)
# as a 2-D array so it can be concatenated with the transformed pieces.

# columns already handled by the imputer / encoders
preprocessed_columns = ['gender','fever','cough','city']

# train data
X_train_age = X_train.drop(columns=preprocessed_columns).values

# test data
X_test_age = X_test.drop(columns=preprocessed_columns).values

X_train_age.shape

(80, 1)

In [20]:
# Stitch the separately transformed pieces back together column-wise.
# Column order: age, fever, gender/city one-hot columns, cough.
train_parts = (X_train_age, X_train_fever, X_train_gender_city, X_train_cough)
test_parts = (X_test_age, X_test_fever, X_test_gender_city, X_test_cough)

X_train_transformed = np.concatenate(train_parts, axis=1)
# same layout for the test data
X_test_transformed = np.concatenate(test_parts, axis=1)

X_train_transformed.shape

(80, 7)

### ColumnTransformer 

In [21]:
from sklearn.compose import ColumnTransformer

# ColumnTransformer takes a list of (name, transformer, columns) triples and
# applies each transformer only to the columns listed for it.
#
# `remainder` decides what happens to the columns that are not listed:
#   'passthrough' : keep them unchanged
#   'drop'        : discard them
fever_step = ('transformer_1', SimpleImputer(), ['fever'])
cough_step = ('transformer_2', OrdinalEncoder(categories=[['Mild','Strong']]), ['cough'])
gender_city_step = ('transformer_3', OneHotEncoder(sparse=False, drop='first'), ['gender','city'])

transformer = ColumnTransformer(
    transformers=[fever_step, cough_step, gender_city_step],
    remainder='passthrough',
)



In [34]:
# Fit every step on the training split and transform it in a single call.
X_train_ct = transformer.fit_transform(X_train)
X_train_ct.shape


(80, 7)

In [35]:
# Transform only: the test split goes through the steps without re-fitting.
X_test_ct = transformer.transform(X_test)
X_test_ct.shape

(20, 7)