In [45]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier


In [46]:
# Load the toy COVID dataset into a DataFrame and preview its first rows.
csv_path = '/content/covid_toy.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [47]:
# Count how many rows belong to each city.
city_column = df['city']
city_column.value_counts()

Unnamed: 0_level_0,count
city,Unnamed: 1_level_1
Kolkata,32
Bangalore,30
Delhi,22
Mumbai,16


In [48]:
# Number of missing values per column (isna is the canonical name of isnull).
df.isna().sum()

Unnamed: 0,0
age,0
gender,0
fever,10
cough,0
city,0
has_covid,0


# Plan
fever - SimpleImputer (fill in the missing values)

gender, city - one-hot encoding

cough - ordinal encoding

all features - scaling

In [49]:
# Separate the predictors from the `has_covid` target, then hold out 20% of
# the rows for testing. The fixed random_state keeps the split reproducible.
features = df.drop(columns=['has_covid'])
target = df['has_covid']
x_tr, x_te, y_tr, y_te = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [50]:
# Display the training features returned by train_test_split (the has_covid column was dropped before splitting).
x_tr

Unnamed: 0,age,gender,fever,cough,city
55,81,Female,101.0,Mild,Mumbai
88,5,Female,100.0,Mild,Kolkata
26,19,Female,100.0,Mild,Kolkata
42,27,Male,100.0,Mild,Delhi
69,73,Female,103.0,Mild,Delhi
...,...,...,...,...,...
60,24,Female,102.0,Strong,Bangalore
71,75,Female,104.0,Strong,Delhi
14,51,Male,104.0,Mild,Bangalore
92,82,Female,102.0,Strong,Kolkata


# Imputation

In [51]:
# Imputation step: missing `fever` values are replaced with the most frequent
# value; all remaining columns are passed through untouched.
trf1 = ColumnTransformer(
    transformers=[
        (
            'fever_imputer',
            SimpleImputer(strategy='most_frequent'),
            ['fever'],
        ),
    ],
    remainder='passthrough',
)


In [52]:
# Disabled variant of the fever imputer that selects the column by position ([2])
# instead of by name. It is only a string literal, so nothing is defined here.
'''trf1 = ColumnTransformer([
    ('fever_imputer', SimpleImputer(strategy='most_frequent'), [2])
], remainder='passthrough')
'''

"trf1 = ColumnTransformer([\n    ('fever_imputer', SimpleImputer(strategy='most_frequent'), [2])\n], remainder='passthrough')\n"

# Ordinal encoding

In [53]:
# Disabled variant of the cough encoder that selects the column by position ([3])
# instead of by name. It is only a string literal, so nothing is defined here.
'''trf2= ColumnTransformer([
    ('cough_ordinal',OrdinalEncoder(categories=[['Mild','Strong']]),[3])
],remainder='passthrough')'''

"trf2= ColumnTransformer([\n    ('cough_ordinal',OrdinalEncoder(categories=[['Mild','Strong']]),[3])\n],remainder='passthrough')"

In [54]:
# Ordinal step: `cough` is encoded with the explicit category order
# Mild, Strong; all remaining columns are passed through untouched.
trf2 = ColumnTransformer(
    transformers=[
        (
            'cough_ordinal',
            OrdinalEncoder(categories=[['Mild', 'Strong']]),
            ['cough'],
        ),
    ],
    remainder='passthrough',
)

# One-hot encoding

In [55]:
# Disabled variant of the one-hot step: a single encoder applied to the columns at
# positions [1, 4]. It is only a string literal, so nothing is defined here.
'''trf3 = ColumnTransformer([
    ('gender_city',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,4]),

], remainder='passthrough')'''


"trf3 = ColumnTransformer([\n    ('gender_city',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,4]),\n\n], remainder='passthrough')"

In [56]:
# One-hot step: `gender` and `city` each get their own dense one-hot encoder.
# Categories not seen during fit are ignored at transform time instead of
# raising; all remaining columns are passed through untouched.
trf3 = ColumnTransformer(
    transformers=[
        (
            'gender_onehot',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['gender'],
        ),
        (
            'city_onehot',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['city'],
        ),
    ],
    remainder='passthrough',
)


# Scaling

In [57]:
# Scaling step: min-max scale the columns at positions 0 through 8.
# No `remainder` is given, so the ColumnTransformer default applies.
trf4 = ColumnTransformer(
    transformers=[
        ('scaler', MinMaxScaler(), slice(0, 9)),
    ],
)

In [58]:
# Final estimator. The tree permutes features randomly at each split, so the
# seed is fixed (same value as the train/test split) to make fits reproducible.
trf5 = DecisionTreeClassifier(random_state=42)

# We use a single preprocessor, because it is better to select columns by name

In [59]:
# Single preprocessing stage that addresses every column by name:
#   - fever:  impute missing values (SimpleImputer with its default strategy)
#   - gender: dense one-hot encoding, unseen categories ignored
#   - city:   dense one-hot encoding, unseen categories ignored
#   - cough:  ordinal encoding (Mild, Strong); unseen values map to -1
# Columns not listed here are kept as-is via remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'fever_imputer',
            SimpleImputer(),
            ['fever'],
        ),
        (
            'gender_onehot',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['gender'],
        ),
        (
            'city_onehot',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            ['city'],
        ),
        (
            'cough_ordinal',
            OrdinalEncoder(
                categories=[['Mild', 'Strong']],
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
            ['cough'],
        ),
    ],
    remainder='passthrough',
)

# Pipeline

In [60]:
# End-to-end pipeline: preprocess the raw columns, min-max scale the resulting
# feature matrix, then fit a decision tree.
# The classifier is seeded (same value as the train/test split) because an
# unseeded tree can pick different splits on each fit, which makes the
# reported accuracy change between runs.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('scaler', MinMaxScaler()),  # scales the output of the preprocessor
        ('model', DecisionTreeClassifier(random_state=42)),
    ]
)

In [61]:
# Disabled earlier version of the pipeline that chained trf1 through trf5 as separate
# steps. It is only a string literal, so `pipe` is not reassigned by this cell.
'''pipe=Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
   ])'''

"pipe=Pipeline([\n    ('trf1',trf1),\n    ('trf2',trf2),\n    ('trf3',trf3),\n    ('trf4',trf4),\n    ('trf5',trf5)\n   ])"

In [62]:
# Fit every pipeline step on the training split.
pipe.fit(X=x_tr, y=y_tr)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [63]:
# Predict labels for the held-out test split with the fitted pipeline.
y_pred = pipe.predict(X=x_te)

In [64]:
# Fraction of test rows whose predicted label matches the true label.
# `accuracy_score` is imported in the notebook's top import cell.
accuracy_score(y_te, y_pred)

0.5

In [65]:
# Persist the fitted pipeline to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
# `pickle` is imported in the notebook's top import cell.
with open('covid.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)