In [5]:
import numpy as np
import pandas as pd

In [6]:
# Read the insurance dataset into a DataFrame and preview its first three rows.
df = pd.read_csv(filepath_or_buffer="/content/insurance.csv")
df.head(n=3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [7]:
# Count the missing values in every column (summing the boolean mask down the rows).
df.isnull().sum(axis=0)

Unnamed: 0,0
age,0
sex,0
bmi,0
children,0
smoker,0
region,0
charges,0


In [8]:
# Remove the numeric columns so only the categorical ones remain for encoding,
# then preview the result.
# NOTE(review): this cell overwrites `df`, so re-running it on its own raises a
# KeyError because the columns are already gone.
df = df.drop(labels=['age', 'bmi', 'children', 'charges'], axis=1)
df.head(n=3)

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast


In [9]:
from sklearn.preprocessing import OrdinalEncoder

In [14]:
# df['region'].value_counts()

In [13]:
# One category list per column, given in the same order as the DataFrame columns.
# The position of a value inside its list is the integer code it is mapped to.
oe = OrdinalEncoder(
    categories=[
        ['male', 'female'],                                    # sex
        ['no', 'yes'],                                         # smoker
        ['southeast', 'southwest', 'northeast', 'northwest'],  # region
    ]
)


In [16]:
# Fit the encoder on the categorical frame, then encode that same frame.
df_oe = oe.fit(df).transform(df)

In [17]:
# Wrap the encoded array back into a DataFrame, reusing the original column names.
df_new = pd.DataFrame(data=df_oe, columns=df.columns)
df_new

Unnamed: 0,sex,smoker,region
0,1.0,1.0,1.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,3.0
4,0.0,0.0,3.0
...,...,...,...
1333,0.0,0.0,3.0
1334,1.0,0.0,2.0
1335,1.0,0.0,0.0
1336,1.0,0.0,1.0


In [18]:
# ColumnTransformer ===> a container ===> it holds all the per-column preprocessing steps of our project.
# ColumnTransformer(transformers=[('step_name', transformer, ['column_name', ...]), ...])

In [19]:
# Read the toy COVID dataset into a DataFrame and preview its first three rows.
# Note: this rebinds `df`, replacing the insurance data loaded earlier.
df = pd.read_csv(filepath_or_buffer="/content/covid_toy.csv")
df.head(n=3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [20]:
# Plan for the columns ===>
# Numerical data ===> missing values ==> fill them (SimpleImputer) ===> training data.
# Categorical data ===> missing values ==> fill them (SimpleImputer) ===> training data.
# All transformed training columns ===> combine them ===> check the data shape.

In [21]:

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [22]:
# Count the missing values in every column (summing the boolean mask down the rows).
df.isnull().sum(axis=0)

Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


In [23]:
from sklearn.model_selection import train_test_split


In [24]:
# Target: the 'has_covid' label column.
y = df['has_covid']
# Features: every column except the target.
x = df.drop(labels=['has_covid'], axis=1)

In [25]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Manually type output

In [27]:
# Preview the first four rows of the training features.
x_train.head(n=4)

Unnamed: 0,age,gender,fever,cough,city
55,81,Female,101.0,Mild,Mumbai
88,5,Female,100.0,Mild,Kolkata
26,19,Female,100.0,Mild,Kolkata
42,27,Male,100.0,Mild,Delhi


In [28]:
# Impute missing 'fever' values with the column mean.
# The imputer is fitted on the training split only, so the fill value is
# learned from training data.
si = SimpleImputer(strategy = "mean")
x_train_fever = si.fit_transform(x_train[['fever']])

# Apply the already-fitted imputer to the test data.
# Use transform (not fit_transform): re-fitting here would fill test rows with
# the test-set mean, leaking test statistics and making the train and test
# preprocessing inconsistent.
x_test_fever = si.transform(x_test[['fever']])

x_train_fever.shape

(80, 1)

In [29]:
# Ordinal Encoding --> Cough
# The category order is given explicitly: 'Mild' -> 0, 'Strong' -> 1.

oe = OrdinalEncoder(categories = [['Mild' , 'Strong']])
x_train_cough = oe.fit_transform(x_train[['cough']])

# Encode the test data with the encoder fitted on the training split.
# Use transform (not fit_transform) so the test set never re-fits a
# preprocessing step.
x_test_cough = oe.transform(x_test[['cough']])

x_train_cough.shape

(80, 1)

In [30]:
# OneHotEncoding --> Gender , city
# drop='first' removes one dummy column per feature to avoid redundant columns.
# sparse_output=False returns a dense array; it is the current name of the old
# `sparse` argument and matches the ColumnTransformer cell later in the notebook.

ohe = OneHotEncoder(drop = 'first' , sparse_output = False )
x_train_gender_city = ohe.fit_transform(x_train[['gender' , 'city']])

# Encode the test data with the encoder fitted on the training split.
# Use transform (not fit_transform): re-fitting on the smaller test split could
# learn a different set of categories, giving a different number or order of
# columns than the training matrix.
x_test_gender_city = ohe.transform(x_test[['gender' , 'city']])

x_train_gender_city.shape



(80, 4)

In [31]:
# Extracting Age: drop the columns handled by the other steps and keep what is
# left as a 2-D NumPy array.
x_train_age = x_train.drop(labels=['gender', 'fever', 'cough', 'city'], axis=1).values

# Same extraction for the test data.
x_test_age = x_test.drop(labels=['gender', 'fever', 'cough', 'city'], axis=1).values

In [32]:
# Stack the separately transformed pieces side by side (column-wise).
# Resulting column order: age, fever, gender/city one-hot columns, cough.
x_train_transformed = np.concatenate(
    [x_train_age, x_train_fever, x_train_gender_city, x_train_cough],
    axis=1,
)

In [33]:
# Shape (rows, columns) of the manually assembled training matrix.
x_train_transformed.shape

(80, 7)

In [34]:
# By the help of Column Transformer

In [38]:
from sklearn.compose import ColumnTransformer   # this is how to import ColumnTransformer

# Bundle every per-column preprocessing step into a single ColumnTransformer.
# Each entry is a tuple: (step name, transformer, list of columns it applies to).
transformer = ColumnTransformer(
    transformers=[
        # 'a': fill the missing values of the 'fever' column with SimpleImputer
        # (it can fill by mean, median or mode depending on its strategy).
        ('a', SimpleImputer(), ['fever']),
        # 'b': encode 'cough' as ordered integers, 'Mild' before 'Strong'.
        ('b', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # 'c': one-hot encode 'gender' and 'city' as a dense array, dropping the
        # first category of each column.
        ('c', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
    # remainder='passthrough' ==> all the remaining columns are kept unchanged.
    remainder='passthrough',
)

In [39]:
# Fit the ColumnTransformer on the training features, transform them, and show
# the shape of the result.
transformer.fit_transform(x_train).shape

(80, 7)