In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder
from sklearn.impute import SimpleImputer

In [3]:
df = pd.read_csv('covid_toy.csv')

In [4]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [6]:
X = df.iloc[:,:5]

In [7]:
X.sample(5)

Unnamed: 0,age,gender,fever,cough,city
44,20,Male,102.0,Strong,Delhi
5,84,Female,,Mild,Bangalore
54,60,Female,99.0,Mild,Mumbai
76,80,Male,100.0,Mild,Bangalore
88,5,Female,100.0,Mild,Kolkata


In [8]:
y = df.iloc[:,-1]
y.sample(5)

50    Yes
96    Yes
79    Yes
84     No
7     Yes
Name: has_covid, dtype: object

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=2)

In [10]:
X_train.sample(5)

Unnamed: 0,age,gender,fever,cough,city
91,38,Male,,Mild,Delhi
72,83,Female,101.0,Mild,Kolkata
84,69,Female,98.0,Strong,Mumbai
85,16,Female,103.0,Mild,Bangalore
63,10,Male,100.0,Mild,Bangalore


In [11]:
y_train.sample(5)

36     No
62    Yes
9      No
90     No
11    Yes
Name: has_covid, dtype: object

# Concatenate Method

In [12]:
# Simple Imputer

si = SimpleImputer()

X_train_fever = si.fit_transform(X_train[['fever']])

X_test_fever = si.fit_transform(X_test[['fever']])

In [13]:
X_train['cough'].unique()

array(['Strong', 'Mild'], dtype=object)

In [14]:
# Ordinal Encoder

oe = OrdinalEncoder(categories=[['Strong', 'Mild']])

X_train_cough = oe.fit_transform(X_train[['cough']])

X_test_cough = oe.fit_transform(X_test[['cough']])
X_train_cough.shape

(80, 1)

In [15]:
X_train['city'].unique()

array(['Bangalore', 'Mumbai', 'Delhi', 'Kolkata'], dtype=object)

In [16]:
X_train['gender'].unique()

array(['Female', 'Male'], dtype=object)

In [17]:
# One Hot Encoder -> gender,city

ohe = OneHotEncoder(drop ='first',sparse=False)

X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

X_test_gender_city = ohe.fit_transform(X_test[['gender','city']])
X_train_gender_city.shape

(80, 4)

In [18]:
# Dividing age from df

X_train_age = X_train.drop(columns=['gender', 'fever', 'cough', 'city']).values

X_test_age = X_test.drop(columns=['gender', 'fever', 'cough', 'city']).values
X_train_age.shape

(80, 1)

In [19]:
X_train_transformed = np.concatenate((X_train_age,X_train_fever,X_train_gender_city,X_train_cough),axis = 1)

X_test_transformed = np.concatenate((X_test_age,X_test_fever,X_test_gender_city,X_test_cough),axis = 1)

In [20]:
X_train_transformed.shape

(80, 7)

In [21]:
X_test_transformed.shape

(20, 7)

In [22]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [23]:
le.fit(y_train)

LabelEncoder()

In [24]:
le.classes_

array(['No', 'Yes'], dtype=object)

In [25]:
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [26]:
gbr_params = {'n_estimators': 500,
          'max_depth': 3,
          'min_samples_split': 5,
          'learning_rate': 1,
          'loss': 'squared_error'}

In [27]:
from sklearn.ensemble import GradientBoostingRegressor

In [28]:
gbr = GradientBoostingRegressor(**gbr_params)

In [29]:
y_pred = gbr.fit(X_train_transformed,y_train)

In [42]:
from sklearn.compose import ColumnTransformer
column_trans = ColumnTransformer(transformers=[
    ('tn1', SimpleImputer(),['fever']),
    ('tn2',OrdinalEncoder(categories=[['Strong', 'Mild']]),['cough']),
    ('tn3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
],remainder='passthrough')

In [43]:
transformer.fit_transform(X_train)
transformer.fit_transform(X_test)

array([[104.        ,   1.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  17.        ],
       [101.        ,   1.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  15.        ],
       [101.05555556,   0.        ,   1.        ,   0.        ,
          1.        ,   0.        ,  71.        ],
       [100.        ,   0.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  13.        ],
       [103.        ,   1.        ,   0.        ,   0.        ,
          1.        ,   0.        ,  69.        ],
       [ 98.        ,   1.        ,   0.        ,   1.        ,
          0.        ,   0.        ,  80.        ],
       [101.        ,   1.        ,   1.        ,   1.        ,
          0.        ,   0.        ,  42.        ],
       [102.        ,   0.        ,   0.        ,   1.        ,
          0.        ,   0.        ,  33.        ],
       [104.        ,   1.        ,   1.        ,   0.        ,
          1.    

In [37]:
from sklearn.pipeline import make_pipeline

In [45]:
pipe=make_pipeline(column_trans,gbr)

In [46]:
pipe.fit(X_train,y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tn1', SimpleImputer(),
                                                  ['fever']),
                                                 ('tn2',
                                                  OrdinalEncoder(categories=[['Strong',
                                                                              'Mild']]),
                                                  ['cough']),
                                                 ('tn3',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  ['gender', 'city'])])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(learning_rate=1, min_samples_split=5,
                                           n_estimators

In [47]:
y_pred=pipe.predict(X_test)

In [30]:
df.columns

Index(['age', 'gender', 'fever', 'cough', 'city', 'has_covid'], dtype='object')

In [51]:
df.sample()

Unnamed: 0,age,gender,fever,cough,city,has_covid
53,83,Male,98.0,Mild,Delhi,Yes


In [52]:
pipe.predict(pd.DataFrame(columns=['age', 'gender', 'fever', 'cough', 'city'],data=np.array([83, 'Male', 98.0, 'Mild','Delhi']).reshape(1,5)))

array([1.00000001])