# Data Preprocessing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
#%pip install matplotlib

## Importing Dataset

In [2]:
data_set = pd.read_csv('Covid_data.csv')
X = data_set.iloc[: , :-1].values
y = data_set.iloc[: , -1].values

In [5]:
X

array([[10.0, 'Normal', 'no', 'no', 97.0],
       [12.0, 'Normal', 'no', 'no', 97.0],
       [15.0, 'Normal', 'no', 'no', 94.0],
       [10.0, 'Normal', 'no', 'no', 97.0],
       [13.0, 'Moderate', 'no', 'no', 94.0],
       [12.0, 'Moderate', 'no', 'no', 97.0],
       [13.0, 'Moderate', 'no', 'no', 93.0],
       [15.0, 'Moderate', 'no', 'no', 92.0],
       [18.0, 'Moderate', 'no', 'no', 66.0],
       [19.0, 'Normal', 'no', 'no', 92.0],
       [20.0, 'Normal', 'no', 'no', 93.0],
       [17.0, 'Normal', 'no', 'no', 93.0],
       [16.0, 'Normal', 'no', 'no', 92.0],
       [18.0, 'Normal', 'no', 'no', 93.0],
       [20.0, 'Normal', 'no', 'no', 92.0],
       [25.0, 'Moderate', 'no', 'no', 93.0],
       [24.0, 'Moderate', 'no', 'no', 92.0],
       [26.0, 'High', 'no', 'no', 94.0],
       [28.0, 'Normal', 'no', 'no', 99.0],
       [29.0, 'Normal', 'no', 'no', 93.0],
       [30.0, 'Moderate', 'no', 'no', 62.0],
       [19.0, 'Normal', 'no', 'no', 89.0],
       [25.0, 'Normal', 'no', 'yes', 86.




## Handling Missing Data

In [1]:
#%pip install scikit-learn

In [3]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[: ,0:1])
X[: ,0:1] = imputer.transform(X[: ,0:1])

In [4]:
imputer.fit(X[: , 4:5])
X[: , 4:5] = imputer.transform(X[: , 4:5])

In [10]:
X

array([[10.0, 'Normal', 'no', 'no', 97.0],
       [12.0, 'Normal', 'no', 'no', 97.0],
       [15.0, 'Normal', 'no', 'no', 94.0],
       [10.0, 'Normal', 'no', 'no', 97.0],
       [13.0, 'Moderate', 'no', 'no', 94.0],
       [12.0, 'Moderate', 'no', 'no', 97.0],
       [13.0, 'Moderate', 'no', 'no', 93.0],
       [15.0, 'Moderate', 'no', 'no', 92.0],
       [18.0, 'Moderate', 'no', 'no', 66.0],
       [19.0, 'Normal', 'no', 'no', 92.0],
       [20.0, 'Normal', 'no', 'no', 93.0],
       [17.0, 'Normal', 'no', 'no', 93.0],
       [16.0, 'Normal', 'no', 'no', 92.0],
       [18.0, 'Normal', 'no', 'no', 93.0],
       [20.0, 'Normal', 'no', 'no', 92.0],
       [25.0, 'Moderate', 'no', 'no', 93.0],
       [24.0, 'Moderate', 'no', 'no', 92.0],
       [26.0, 'High', 'no', 'no', 94.0],
       [28.0, 'Normal', 'no', 'no', 99.0],
       [29.0, 'Normal', 'no', 'no', 93.0],
       [30.0, 'Moderate', 'no', 'no', 62.0],
       [19.0, 'Normal', 'no', 'no', 89.0],
       [25.0, 'Normal', 'no', 'yes', 86.

## Encoding Categorical Data

### Encoding independent variables

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [6]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [15]:
X

array([[0.0, 0.0, 1.0, 10.0, 'no', 'no', 97.0],
       [0.0, 0.0, 1.0, 12.0, 'no', 'no', 97.0],
       [0.0, 0.0, 1.0, 15.0, 'no', 'no', 94.0],
       [0.0, 0.0, 1.0, 10.0, 'no', 'no', 97.0],
       [0.0, 1.0, 0.0, 13.0, 'no', 'no', 94.0],
       [0.0, 1.0, 0.0, 12.0, 'no', 'no', 97.0],
       [0.0, 1.0, 0.0, 13.0, 'no', 'no', 93.0],
       [0.0, 1.0, 0.0, 15.0, 'no', 'no', 92.0],
       [0.0, 1.0, 0.0, 18.0, 'no', 'no', 66.0],
       [0.0, 0.0, 1.0, 19.0, 'no', 'no', 92.0],
       [0.0, 0.0, 1.0, 20.0, 'no', 'no', 93.0],
       [0.0, 0.0, 1.0, 17.0, 'no', 'no', 93.0],
       [0.0, 0.0, 1.0, 16.0, 'no', 'no', 92.0],
       [0.0, 0.0, 1.0, 18.0, 'no', 'no', 93.0],
       [0.0, 0.0, 1.0, 20.0, 'no', 'no', 92.0],
       [0.0, 1.0, 0.0, 25.0, 'no', 'no', 93.0],
       [0.0, 1.0, 0.0, 24.0, 'no', 'no', 92.0],
       [1.0, 0.0, 0.0, 26.0, 'no', 'no', 94.0],
       [0.0, 0.0, 1.0, 28.0, 'no', 'no', 99.0],
       [0.0, 0.0, 1.0, 29.0, 'no', 'no', 93.0],
       [0.0, 1.0, 0.0, 30.0, 'no', 'no',

### Encoding dependent variables

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [8]:
print(y)

[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1]


## Splitting data into Test set & Training Set


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 42)

In [11]:
print(X_test)

[[0.0 0.0 1.0 25.0 'no' 'yes' 86.0]
 [0.0 0.0 1.0 10.0 'no' 'no' 97.0]
 [1.0 0.0 0.0 68.0 'yes' 'no' 67.0]
 [0.0 1.0 0.0 13.0 'no' 'no' 94.0]
 [1.0 0.0 0.0 59.0 'yes' 'no' 68.0]
 [0.0 0.0 1.0 28.0 'no' 'no' 99.0]
 [0.0 0.0 1.0 20.0 'no' 'no' 93.0]
 [1.0 0.0 0.0 46.0 'yes' 'no' 91.0]
 [1.0 0.0 0.0 53.0 'yes' 'no' 55.0]
 [0.0 0.0 1.0 16.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 38.0 'no' 'no' 75.0]
 [0.0 0.0 1.0 19.0 'no' 'no' 92.0]
 [1.0 0.0 0.0 94.0 'yes' 'yes' 64.0]
 [0.0 1.0 0.0 12.0 'no' 'no' 97.0]]


In [12]:
print(y_test)

[0 0 1 0 1 0 0 0 1 0 1 0 1 0]


## Feature Scaling

In [15]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[: ,6:] = sc.fit_transform(X_train[: ,6:])
X_test[: ,6:] = sc.transform(X_test[: ,6:])

In [16]:
print(X_train)

[[1.0 0.0 0.0 86.0 'yes' 'yes' -0.9941858494300262]
 [0.0 1.0 0.0 35.0 'no' 'yes' 0.9574354145081888]
 [1.0 0.0 0.0 75.0 'no' 'yes' -0.9012515035282065]
 [1.0 0.0 0.0 42.0 'no' 'no' 0.9574354145081888]
 [1.0 0.0 0.0 74.0 'yes' 'yes' -0.6224484658227473]
 [1.0 0.0 0.0 62.0 'yes' 'yes' -1.2729888871354855]
 [0.0 1.0 0.0 24.0 'no' 'no' 0.9574354145081888]
 [0.0 1.0 0.0 48.0 'no' 'yes' 1.0503697604100086]
 [1.0 0.0 0.0 58.0 'no' 'no' -1.0871201953318461]
 [0.0 1.0 0.0 45.130434782608695 'yes' 'no' 0.7715667227045493]
 [0.0 1.0 0.0 15.0 'no' 'no' 0.9574354145081888]
 [0.0 0.0 1.0 70.0 'no' 'yes' 0.5856980309009098]
 [1.0 0.0 0.0 52.0 'yes' 'yes' -0.1577767363136484]
 [1.0 0.0 0.0 60.0 'yes' 'yes' -1.2729888871354855]
 [1.0 0.0 0.0 54.0 'yes' 'yes' -1.0871201953318461]
 [0.0 0.0 1.0 29.0 'no' 'no' 1.0503697604100086]
 [1.0 0.0 0.0 78.0 'yes' 'yes' -0.9941858494300262]
 [1.0 0.0 0.0 36.0 'yes' 'no' 0.7715667227045493]
 [0.0 1.0 0.0 30.0 'yes' 'no' 0.3998293390972702]
 [1.0 0.0 0.0 50.0 'yes' 