In [None]:
# Downloading the dataset zip

In [None]:
# NOTE(review): `kaggle` is imported but not referenced by any visible cell;
# the download below goes through the Kaggle command-line tool instead.
import kaggle
# Download the archive for the playground-series-s3e22 competition
# (the extraction step expects playground-series-s3e22.zip in the working directory).
!kaggle competitions download -c playground-series-s3e22

In [None]:
# Extracting the zip file

In [None]:
from zipfile import ZipFile
# Unpack the downloaded competition archive into the current working directory.
file_path = 'playground-series-s3e22.zip'
# The handle is named `archive` (not `zip`) so the built-in zip() stays usable
# in every cell that runs after this one.
with ZipFile(file_path, 'r') as archive:
    archive.extractall()

In [None]:
# Importing Pandas and Numpy

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reading the training and test CSV files

In [None]:
# Load both competition splits from the extracted CSV files.
train_df, test_df = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [None]:
# Removing the singleton values from train_df which are not in test_df and also Encoding the outcome column

In [None]:
# Keep only the training rows that do NOT carry one of the three category levels
# being removed. Comparisons against missing values evaluate to True, so rows
# with NaN in these columns are kept.
keep_rows = (
    (train_df['peristalsis'] != 'distend_small')
    & (train_df['nasogastric_reflux'] != 'slight')
    & (train_df['rectal_exam_feces'] != 'serosanguious')
)
# .copy() detaches the filtered frame so the column assignment below writes to
# an independent DataFrame rather than to a slice of the raw data.
train_df = train_df[keep_rows].copy()

# Encode the target labels as integers. The result is assigned back to the
# column: a chained `train_df['outcome'].replace(..., inplace=True)` operates on
# a temporary Series and is not guaranteed to update train_df.
outcome_codes = {'died': 0, 'euthanized': 1, 'lived': 2}
# astype(int) pins an integer dtype for the target independent of how replace()
# infers the result dtype; it raises if an unexpected label is left unencoded.
train_df['outcome'] = train_df['outcome'].replace(outcome_codes).astype(int)
train_df.shape

In [6]:
# Creating the feature dataframes from the training and test dataframes by dropping the identifier columns (and the target column from the training data)

In [None]:
# Build the feature frames: identifiers are removed from both splits, and the
# target column is additionally removed from the training split.
identifier_columns = ['id', 'hospital_number']
df1 = train_df.drop(columns=identifier_columns + ['outcome'])
df2 = test_df.drop(columns=identifier_columns)

In [7]:
# Getting out all the necessary information from both the dataframes

In [None]:
# Summary of the training feature frame (columns, dtypes, non-null counts).
df1.info()

In [None]:
# Summary of the test feature frame (columns, dtypes, non-null counts).
df2.info()

In [8]:
# Finding out the categorical and numerical features

In [None]:
# Classify every column of the test feature frame (df2) by dtype:
# object-dtype columns count as categorical, all remaining columns as numerical.
is_object_column = {name: df2[name].dtype == 'object' for name in df2}
categorical_features = [name for name, is_obj in is_object_column.items() if is_obj]
numerical_features = [name for name, is_obj in is_object_column.items() if not is_obj]

In [9]:
# Checking the null values in both dataframes

In [None]:
# Number of missing values per column in the training features.
df1.isna().sum()

In [None]:
# Number of missing values per column in the test features.
df2.isna().sum()

In [10]:
# Filling the null values of categorical columns with the mode and of numerical columns with the median

In [None]:
# Impute missing values in the training features: the most frequent value for
# categorical columns, the median for every other column.
for col in df1.columns:
    if col in categorical_features:
        fill_value = df1[col].mode()[0]
    else:
        fill_value = df1[col].median()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df1[col]: the chained in-place form acts on a temporary Series and is not
    # guaranteed to write through to df1.
    df1[col] = df1[col].fillna(fill_value)

In [None]:
# Impute missing values in the test features: the most frequent value for
# categorical columns, the median for every other column.
# NOTE(review): the fill values are computed from the test frame itself, not
# from the training frame — confirm that this is the intended choice.
for col in df2.columns:
    if col in categorical_features:
        fill_value = df2[col].mode()[0]
    else:
        fill_value = df2[col].median()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df2[col]: the chained in-place form acts on a temporary Series and is not
    # guaranteed to write through to df2.
    df2[col] = df2[col].fillna(fill_value)

In [11]:
# Data Visualization and EDA

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Box plots of five numerical training features, drawn side by side on one axes.
boxplot_columns = ['rectal_temp', 'pulse', 'respiratory_rate', 'packed_cell_volume', 'total_protein']
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df1[boxplot_columns], ax=ax)
ax.set_title('Box Plot of Numerical Features')
plt.show()

In [None]:
# Overlaid histograms (20 bins, with KDE curves) of five numerical training features.
histogram_columns = ['rectal_temp', 'pulse', 'respiratory_rate', 'packed_cell_volume', 'total_protein']
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df1[histogram_columns], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Numerical Features')
plt.show()

In [12]:
# Checking unique values

In [None]:
# List the distinct values of each categorical column in the training features.
for feature in categorical_features:
    distinct_values = df1[feature].unique()
    print(feature, distinct_values)

In [None]:
# List the distinct values of each categorical column in the test features.
for feature in categorical_features:
    distinct_values = df2[feature].unique()
    print(feature, distinct_values)

In [13]:
# Category Mapping of all the categorical columns in both the dataframes 

In [None]:
# Integer code for every level of every categorical column.
# Outer keys are column names; each inner dict maps a raw string level to its code.
category_mapping = {
    'surgery': {'yes': 0, 'no': 1},
    'age': {'adult': 0, 'young': 1},
    'temp_of_extremities': {'cool': 0, 'cold': 1, 'normal': 2, 'warm': 3},
    'peripheral_pulse': {'reduced': 0, 'normal': 1, 'absent': 2, 'increased': 3},
    'mucous_membrane': {
        'dark_cyanotic': 0, 'pale_cyanotic': 1, 'pale_pink': 2,
        'normal_pink': 3, 'bright_pink': 4, 'bright_red': 5
    },
    'capillary_refill_time': {'more_3_sec': 0, 'less_3_sec': 1, '3': 2},
    # NOTE(review): 'slight' and 'moderate' both map to 5 — confirm that merging
    # these two levels is intentional and not a typo for 6.
    'pain': {'depressed': 0, 'mild_pain': 1, 'extreme_pain': 2, 'alert': 3, 'severe_pain': 4, 'slight': 5, 'moderate': 5},
    'peristalsis': {'absent': 0, 'hypomotile': 1, 'normal': 2, 'hypermotile': 3, 'distend_small': 4},
    'abdominal_distention': {'slight': 0, 'moderate': 1, 'none': 2, 'severe': 3},
    'nasogastric_tube': {'slight': 0, 'none': 1, 'significant': 2},
    'nasogastric_reflux': {'less_1_liter': 0, 'more_1_liter': 1, 'none': 2, 'slight': 3},
    'rectal_exam_feces': {'decreased': 0, 'absent': 1, 'normal': 2, 'increased': 3, 'serosanguious': 4},
    'abdomen': {'distend_small': 0, 'distend_large': 1, 'normal': 2, 'firm': 3, 'other': 4},
    'abdomo_appearance': {'serosanguious': 0, 'cloudy': 1, 'clear': 2},
    'surgical_lesion': {'yes': 0, 'no': 1},
    'cp_data': {'no': 0, 'yes': 1}
}


# Apply the same mapping to both feature frames, in place, so train and test
# share one encoding. Values without an entry in the mapping are left unchanged.
df1.replace(category_mapping, inplace=True)
df2.replace(category_mapping, inplace=True)

In [14]:
# Specifying the train and test feature sets (still DataFrames at this point)

In [None]:
# Model-input names for the encoded feature frames (same objects as df1/df2, no copy).
train_X, test_X = df1, df2

In [None]:
# (rows, columns) of the training features before scaling.
train_X.shape

In [None]:
# (rows, columns) of the test features before scaling.
test_X.shape

In [15]:
# Feature Scaling using Column Transformer and Standard Scaler

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Standardise the numerical columns and pass every other column through as-is.
# The transformer is fitted on the training features only; the test features
# are transformed with those fitted statistics.
scaling_step = ('StandardScaling', StandardScaler(), numerical_features)
ct = ColumnTransformer(transformers=[scaling_step], remainder='passthrough')
train_X = ct.fit_transform(train_X)
test_X = ct.transform(test_X)

In [None]:
# Target vector: the integer-encoded outcome column of the filtered training frame.
# df1 (and therefore train_X) was derived from this same filtered frame, so the
# rows line up one-to-one.
y = train_df['outcome']
y.shape

In [None]:
# (rows, columns) of the training features after scaling.
train_X.shape

In [None]:
# (rows, columns) of the test features after scaling.
test_X.shape

In [16]:
# Specifying all the models being used

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers, keyed by the display name used in the evaluation output.
# Every estimator that accepts a seed gets the same one so runs are comparable.
SEED = 42
N_ROUNDS = 100  # trees / boosting rounds for the forest and gradient-boosting models

models = dict([
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=SEED)),
    ('Decision Tree', DecisionTreeClassifier(random_state=SEED)),
    ('Random Forest', RandomForestClassifier(n_estimators=N_ROUNDS, random_state=SEED)),
    ('Gradient Boosting (XGBoost)', XGBClassifier(n_estimators=N_ROUNDS, random_state=SEED)),
    ('Gradient Boosting (LightGBM)', LGBMClassifier(n_estimators=N_ROUNDS, random_state=SEED)),
    ('Gradient Boosting (CatBoost)', CatBoostClassifier(iterations=N_ROUNDS, random_state=SEED, verbose=0)),
    ('Support Vector Machine (SVM)', SVC(kernel='rbf', C=1.0, random_state=SEED)),
    ('Naive Bayes', GaussianNB()),
    ('K-Nearest Neighbors (KNN)', KNeighborsClassifier(n_neighbors=5)),
    ('Ensemble', AdaBoostClassifier(n_estimators=50, random_state=SEED)),
])


In [17]:
# Training and Test set split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split repeatable.
holdout_split = train_test_split(train_X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

In [18]:
# Training and evaluation of the models

In [None]:
from sklearn.metrics import accuracy_score as acc_score, precision_score as prec_score, recall_score as rec_score, f1_score as f1

# Fit every candidate on the training split and score it on the hold-out split.
# model_list / accuracy_list collect one entry per model, in dictionary order,
# for the comparison table built afterwards.
model_list = []
accuracy_list = []

# Iterate over the (name, estimator) pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_test_pred = model.predict(X_test)

    accuracy = acc_score(y_test, y_test_pred)
    # Weighted averaging: per-class scores are combined in proportion to class support.
    precision = prec_score(y_test, y_test_pred, average='weighted')
    recall = rec_score(y_test, y_test_pred, average='weighted')
    f1_value = f1(y_test, y_test_pred, average='weighted')

    model_list.append(model_name)
    accuracy_list.append(accuracy)
    print(f"{model_name} has scores as: Accuracy - {accuracy}, Precision - {precision}, Recall - {recall}, F1 Score - {f1_value}")


In [None]:
# Comparison table of hold-out accuracy per model, best model first.
accuracy_table = pd.DataFrame({'Model Name': model_list, 'Accuracy': accuracy_list})
model_evaluation = accuracy_table.sort_values(by=['Accuracy'], ascending=False)
model_evaluation

In [19]:
# Refitting the chosen model on the full training data, predicting on the test data, and saving the predictions to a CSV file for submission.

In [None]:
# Refit the chosen model on ALL labelled rows (training + hold-out) for the submission.
# The estimator is configured exactly like the 'Random Forest' entry that was
# evaluated (100 trees, seed 42), so the submission is reproducible and matches
# the scored configuration.
best_model = RandomForestClassifier(n_estimators=100, random_state=42)
best_model.fit(train_X, y)

In [None]:
# Predict the test rows and translate the integer codes back to outcome labels.
predictions = best_model.predict(test_X)
prediction_mapping = {0: 'died', 1: 'euthanized', 2: 'lived'}
predicted_labels = [prediction_mapping[pred] for pred in predictions]

# Take the ids from the test file itself instead of regenerating them from a
# hard-coded starting number; test_X keeps the row order of test_df, so ids and
# predictions line up positionally. to_numpy() drops the index so the frame is
# assembled purely by position.
# NOTE(review): the header is written as 'Outcome' — confirm it matches the
# column name in the competition's sample submission.
submission_df = pd.DataFrame({
    'id': test_df['id'].to_numpy(),
    'Outcome': predicted_labels,
})
submission_df.to_csv('submissions.csv', index=False)