Data => separate (categorical, numerical)

Categorical data => SimpleImputer fill => Encode

Numerical data => SimpleImputer fill => Standardize

Apply => Algorithm (LogisticRegression)

In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the toy COVID dataset. Set the COVID_TOY_CSV environment variable
# to point at a local copy of the file; when it is unset, the original
# hard-coded location is used so existing setups keep working.
DATA_PATH = os.environ.get("COVID_TOY_CSV", "D:\\DataSet\\covid_toy.csv")

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Split the frame into the label column (y) and the remaining feature
# columns (x), then hold out 20% of the rows for evaluation. The fixed
# random_state keeps the split the same on every run.
target_column = "has_covid"
y = df[target_column]
x = df.drop(target_column, axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Column-name registries filled by dtype_filter() and read by later cells.
numerical = []
categorical = []

def dtype_filter(data):
    """Record which columns of ``data`` are numerical and which are categorical.

    Columns whose dtype is ``int64`` or ``float64`` are treated as numerical;
    columns whose dtype is ``object`` are treated as categorical. Columns of
    any other dtype are not recorded in either list.

    The module-level lists ``numerical`` and ``categorical`` are refilled in
    place on every call, so calling the function again replaces the previous
    contents instead of appending duplicate column names.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    tuple of (list, list)
        ``(numerical, categorical)`` - the same module-level list objects,
        returned for convenience.
    """
    # Slice assignment mutates the existing list objects rather than rebinding
    # the names, so anything already holding a reference to these lists sees
    # the refreshed column names.
    numerical[:] = data.select_dtypes(include=["int64", "float64"]).columns
    categorical[:] = data.select_dtypes(include=["object"]).columns
    return numerical, categorical

# Classify the training columns, then show which names landed in each group.
dtype_filter(x_train)
for label, columns in (
    ("Numerical dtype : ", numerical),
    ("Categorical dtype : ", categorical),
):
    print(label, columns)

Numerical dtype :  ['age', 'fever']
Categorical dtype :  ['gender', 'cough', 'city']


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [7]:
# Numerical columns: fill missing values with the column mean, then standardize.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode (the encoder is configured to ignore unknown categories).
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Send each column group through its matching transformer.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical),
    ("cat", categorical_transformer, categorical),
])

# Chain preprocessing and the classifier so both are fitted together on the
# training split.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
])
clf.fit(x_train, y_train)

# Evaluate on the held-out split; the bare `acc` displays the score.
y_pred = clf.predict(x_test)
acc = accuracy_score(y_test, y_pred)
acc

0.45