# Setup

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Get To Know Data

In [44]:
df = pd.read_csv('../data/covid_dataset.csv')

In [45]:
df.head()

Unnamed: 0,USMER,MEDICAL_UNIT,SEX,PATIENT_TYPE,DATE_DIED,INTUBED,PNEUMONIA,AGE,PREGNANT,DIABETES,...,ASTHMA,INMSUPR,HIPERTENSION,OTHER_DISEASE,CARDIOVASCULAR,OBESITY,RENAL_CHRONIC,TOBACCO,CLASIFFICATION_FINAL,ICU
0,2,1,1,1,03/05/2020,97,1,65,2,2,...,2,2,1,2,2,2,2,2,3,97
1,2,1,2,1,03/06/2020,97,1,72,97,2,...,2,2,1,2,2,1,1,2,5,97
2,2,1,2,2,09/06/2020,1,2,55,97,1,...,2,2,2,2,2,2,2,2,3,2
3,2,1,1,1,12/06/2020,97,2,53,2,2,...,2,2,2,2,2,2,2,2,7,97
4,2,1,2,1,21/06/2020,97,2,68,97,1,...,2,2,1,2,2,2,2,2,3,97


# Preprocessing

In [30]:
df.replace({97: pd.NA, 99: pd.NA}, inplace=True)

In [36]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns

for col in categorical_columns:
    # Check if the column contains missing values
    if df[col].isna().any():
        # Replace missing values with the mode (most frequent value)
        mode_value = df[col].mode()[0]
        df[col] = df[col].fillna(mode_value)  # Reassigning directly to avoid chained assignment warning

# Confirm the changes
print(df[categorical_columns].head())



    DATE_DIED  INTUBED  PNEUMONIA  AGE  PREGNANT  ICU
0  03/05/2020        2          1   65         2    2
1  03/06/2020        2          1   72         2    2
2  09/06/2020        1          2   55         2    2
3  12/06/2020        2          2   53         2    2
4  21/06/2020        2          2   68         2    2


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   USMER                 1048575 non-null  int64 
 1   MEDICAL_UNIT          1048575 non-null  int64 
 2   SEX                   1048575 non-null  int64 
 3   PATIENT_TYPE          1048575 non-null  int64 
 4   DATE_DIED             1048575 non-null  object
 5   INTUBED               1048575 non-null  int64 
 6   PNEUMONIA             1048575 non-null  int64 
 7   AGE                   1048575 non-null  int64 
 8   PREGNANT              1048575 non-null  int64 
 9   DIABETES              1048575 non-null  int64 
 10  COPD                  1048575 non-null  int64 
 11  ASTHMA                1048575 non-null  int64 
 12  INMSUPR               1048575 non-null  int64 
 13  HIPERTENSION          1048575 non-null  int64 
 14  OTHER_DISEASE         1048575 non-null  int64 
 15

In [38]:
age_bins = [0, 20, 40, 60, 80, 120] 
age_labels = ['<20', '20-40', '40-60', '60-80', '80+'] 

df['AGE_GROUP'] = pd.cut(df['AGE'], bins=age_bins, labels=age_labels, right=False)

print(df[['AGE', 'AGE_GROUP']].head())

   AGE AGE_GROUP
0   65     60-80
1   72     60-80
2   55     40-60
3   53     40-60
4   68     60-80


In [39]:
df['CLASIFFICATION_FINAL'] = df['CLASIFFICATION_FINAL'].apply(lambda x: 1 if x in [1, 2, 3] else 0)

print(df['CLASIFFICATION_FINAL'].value_counts())

CLASIFFICATION_FINAL
0    656596
1    391979
Name: count, dtype: int64


In [40]:
# check if there is nan values in the AGE_GROUP column
print(df['AGE_GROUP'].isnull().sum())
df['AGE_GROUP'].fillna(df['AGE_GROUP'].mode()[0], inplace=True)

6


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['AGE_GROUP'].fillna(df['AGE_GROUP'].mode()[0], inplace=True)


In [43]:
df.to_csv("../data/covid_preprocessed.csv")