In [1]:
import pandas as pd

Covid_DF = pd.read_csv(r"C:\Users\ACER\Desktop\Kranthi\DataScience_Desktop\MachineLearningFiles\covid_toy.csv")
Covid_DF.shape
# Yes or No - Classification problem

(100, 6)

In [3]:
# Check if there are nulls in any of the columns

Covid_DF.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [5]:
# Check info of the file.

Covid_DF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [7]:
Covid_DF.describe()

Unnamed: 0,age,fever
count,100.0,90.0
mean,44.22,100.844444
std,24.878931,2.054926
min,5.0,98.0
25%,20.0,99.0
50%,45.0,101.0
75%,66.5,102.75
max,84.0,104.0


In [9]:
# Descibe for non numerical columns

Covid_DF.describe(include='object')

Unnamed: 0,gender,cough,city,has_covid
count,100,100,100,100
unique,2,2,4,2
top,Female,Mild,Kolkata,No
freq,59,62,32,55


In [11]:
Covid_DF.duplicated()

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97    False
98    False
99    False
Length: 100, dtype: bool

In [13]:
# Check random records

Covid_DF.sample(5)

Unnamed: 0,age,gender,fever,cough,city,has_covid
82,24,Male,98.0,Mild,Kolkata,Yes
69,73,Female,103.0,Mild,Delhi,No
62,56,Female,104.0,Strong,Bangalore,Yes
23,80,Female,98.0,Mild,Delhi,Yes
51,11,Female,100.0,Strong,Kolkata,Yes


#### Column wise analysis

In [16]:
# Fever column

Covid_DF.fever.describe()

count     90.000000
mean     100.844444
std        2.054926
min       98.000000
25%       99.000000
50%      101.000000
75%      102.750000
max      104.000000
Name: fever, dtype: float64

In [18]:
Covid_DF.fever.value_counts()
# Data mostly seems equally distributed.

fever
101.0    17
98.0     17
104.0    14
100.0    13
99.0     10
102.0    10
103.0     9
Name: count, dtype: int64

In [20]:
Covid_DF.fever.isnull().sum()
# There are nulls in fever column
# Impute with Mean

10

In [22]:
Covid_DF.cough.value_counts()

cough
Mild      62
Strong    38
Name: count, dtype: int64

In [24]:
# Impute nulls in Fever column with Mean value of fever - SimpleImputer
# Scale Age and Fever - Standard Scaler / MinMaxScaler
# Encode Gender  - Nominal Categorical - OneHotEncoder
# Cough - OrdinalEncoder
# City  - Nominal Categorical - OneHotEncoder
# Has Covid - Nominal Categorical - Output label - LabelEncoder

In [26]:
# Prepare the train and test data

from sklearn.model_selection import train_test_split
x = Covid_DF.drop('has_covid',axis=1)
y = Covid_DF['has_covid']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [27]:
x_train.head(2)

Unnamed: 0,age,gender,fever,cough,city
14,51,Male,104.0,Mild,Bangalore
36,38,Female,101.0,Mild,Bangalore


In [28]:
# Import the required packages

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [32]:
ColumnTransformer_Obj = ColumnTransformer([
    ('Tr_SI_fever',SimpleImputer(),['fever']),
    ('Tr_OE_cough',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('Tr_OE_gender_city',OneHotEncoder(sparse_output=False,drop='first'),['gender','city'])
],remainder='passthrough')
# We are only giving the column names in the transformation
# At the time of fit, it checks for the column names.
# We can also give the index number 
# There will be change in column order after using column transformer, first will be fever, then cough, then gender, city and then remaining values.

In [34]:
x_train_transformed = ColumnTransformer_Obj.fit_transform(x_train)

In [36]:
Tr_OE_gender_cityObj = ColumnTransformer_Obj.named_transformers_['Tr_OE_gender_city']
Tr_OE_gender_cityObj.get_feature_names_out()

array(['gender_Male', 'city_Delhi', 'city_Kolkata', 'city_Mumbai'],
      dtype=object)

In [38]:
x_train_transformed_DF = pd.DataFrame(x_train_transformed, columns = ['fever','cough','gender_Male','city_Delhi','city_Kolkata','city_Mumbai','Age'])
x_train_transformed_DF.head(3)

Unnamed: 0,fever,cough,gender_Male,city_Delhi,city_Kolkata,city_Mumbai,Age
0,104.0,0.0,1.0,0.0,0.0,0.0,51.0
1,101.0,0.0,0.0,0.0,0.0,0.0,38.0
2,98.0,1.0,0.0,0.0,1.0,0.0,71.0


In [40]:
x_train_transformed[0:2]

array([[104.,   0.,   1.,   0.,   0.,   0.,  51.],
       [101.,   0.,   0.,   0.,   0.,   0.,  38.]])

In [42]:
x_test_transformed = ColumnTransformer_Obj.transform(x_test)

In [44]:
x_train_transformed[0:2]

array([[104.,   0.,   1.,   0.,   0.,   0.,  51.],
       [101.,   0.,   0.,   0.,   0.,   0.,  38.]])

In [46]:
ColumnTransformer_Obj.named_transformers_

{'Tr_SI_fever': SimpleImputer(),
 'Tr_OE_cough': OrdinalEncoder(categories=[['Mild', 'Strong']]),
 'Tr_OE_gender_city': OneHotEncoder(drop='first', sparse_output=False),
 'remainder': FunctionTransformer(accept_sparse=True, check_inverse=False,
                     feature_names_out='one-to-one')}

In [48]:
Tr_SI_feverObj = ColumnTransformer_Obj.named_transformers_['Tr_SI_fever']
Tr_SI_feverObj.get_params

<bound method BaseEstimator.get_params of SimpleImputer()>

In [50]:
Tr_OE_gender_cityObj = ColumnTransformer_Obj.named_transformers_['Tr_OE_gender_city']
Tr_OE_gender_cityObj.get_feature_names_out()

array(['gender_Male', 'city_Delhi', 'city_Kolkata', 'city_Mumbai'],
      dtype=object)

In [52]:
Covid_DF.city.value_counts()

city
Kolkata      32
Bangalore    30
Delhi        22
Mumbai       16
Name: count, dtype: int64

In [57]:
Covid_DF.head(2)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
