In [1]:
import torch
import torch.nn as nn
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from datetime import  datetime

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.model_selection import  train_test_split
from sklearn.metrics import  f1_score
from torch.utils.data import  Dataset

In [2]:
TRAIN_PATH = 'dataset/train.csv'
TEST_PATH = 'dataset/test.csv'
# PRETRAINED = r'D:\Kaggle\HorseHealth\models\horse_health_model_03_11_2023.pth'
PRETRAINED = ''

In [3]:
df = pd.read_csv(TRAIN_PATH, index_col='id')
test_df = pd.read_csv(TEST_PATH, index_col='id')
df.head()

Unnamed: 0_level_0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [4]:
CAT_COLS = [col for col in df.columns if df[col].dtype == 'object']
NUM_COLS = [col for col in df.columns if col not in CAT_COLS]

df[CAT_COLS] = df[CAT_COLS].fillna("no_value")
test_df[CAT_COLS] = df[CAT_COLS].fillna("no value")

df["hospital_number"] = df["hospital_number"].astype("str")
test_df["hospital_number"] = test_df["hospital_number"].astype("str")

In [5]:
CAT_COLS = [col for col in df.columns if df[col].dtype == 'object']
NUM_COLS = [col for col in df.columns if col not in CAT_COLS]

for col in NUM_COLS:
    df[col] = df[col].apply(lambda x: np.cbrt(x))
    test_df[col] = test_df[col].apply(lambda x: np.cbrt(x))

In [6]:
y = df["outcome"]
X = df.copy().drop("outcome", axis=1)

In [7]:
CAT_COLS = [col for col in X.columns if df[col].dtype == 'object']
NUM_COLS = [col for col in X.columns if col not in CAT_COLS]

In [8]:
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="infrequent_if_exist")
X_cat = onehot_encoder.fit_transform(X[CAT_COLS])
X = np.concatenate([X[NUM_COLS], X_cat], axis=1)
X_test_cat = onehot_encoder.transform(test_df[CAT_COLS])
X_test = np.concatenate([test_df[NUM_COLS], X_test_cat], axis=1)

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [9]:
scaler = RobustScaler()
X = scaler.fit_transform(X)
X_test = scaler.fit_transform(X_test)

In [10]:
X = torch.from_numpy(X).float()
X_test_df = torch.from_numpy(X_test).float()
y = torch.from_numpy(y).long()

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [12]:
model = nn.Sequential(
    nn.Linear(in_features = 337, out_features = 512),
    nn.Mish(),
    nn.Linear(in_features = 512, out_features = 256),
    nn.BatchNorm1d(num_features=256),
    nn.Dropout(p=0.5),

    nn.Linear(in_features = 256, out_features = 512),
    nn.Mish(),
    nn.Linear(in_features = 512, out_features = 256),
    nn.BatchNorm1d(num_features=256),
    nn.Dropout(p=0.5),

    nn.Linear(in_features = 256, out_features = 3)
)

cur_time = datetime.now().today().strftime("%d_%m_%Y")
MODEL_SAVE_PATH = f"models\horse_health_model_{cur_time}.pth"

In [13]:
if PRETRAINED:
    model.load_state_dict(torch.load(PRETRAINED), strict=False)

In [14]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

def f1_fn(y_logits, y_true):
    y_pred = torch.argmax(y_logits, dim=1).detach().cpu().numpy()
    y_true = y_true.detach().cpu().numpy()
    f1 = f1_score(y_true, y_pred, average='micro')
    return f1

epochs = 200
epoch_count = []
train_loss_value = []
test_loss_value = []
for epoch in range(epochs):
    model.train()
    y_logits = model(X_train).squeeze()

    loss = loss_fn(y_logits, y_train)
    acc = f1_fn(y_logits, y_train)
    train_loss_value.append(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()                     
    model.eval()
    with torch.inference_mode():
        test_logits = model(X_test).squeeze()
        test_pred = torch.round(torch.sigmoid(test_logits))
        test_loss = loss_fn(test_logits, y_test)
        test_acc = f1_fn(test_pred, y_test)
        test_loss_value.append(test_loss)
    if epoch % 10 == 0:
        print
        print(f"Epoch {epoch}: Train_loss: {loss:.5f}, Test_loss: {test_loss:.5f} | Train_f1: {acc:.5f}, Test_f1: {test_acc:.5f}")

Epoch 0: Train_loss: 1.34605, Test_loss: 1.06057 | Train_f1: 0.31053, Test_f1: 0.48387
Epoch 10: Train_loss: 0.76524, Test_loss: 0.98884 | Train_f1: 0.69577, Test_f1: 0.58065
Epoch 20: Train_loss: 0.58825, Test_loss: 0.85826 | Train_f1: 0.78668, Test_f1: 0.57258
Epoch 30: Train_loss: 0.48894, Test_loss: 0.84844 | Train_f1: 0.82538, Test_f1: 0.62097
Epoch 40: Train_loss: 0.40095, Test_loss: 1.01266 | Train_f1: 0.84788, Test_f1: 0.57258
Epoch 50: Train_loss: 0.31306, Test_loss: 1.32102 | Train_f1: 0.87399, Test_f1: 0.58065
Epoch 60: Train_loss: 0.26439, Test_loss: 1.71319 | Train_f1: 0.90009, Test_f1: 0.55645
Epoch 70: Train_loss: 0.21487, Test_loss: 2.01834 | Train_f1: 0.91449, Test_f1: 0.58065
Epoch 80: Train_loss: 0.16796, Test_loss: 2.23723 | Train_f1: 0.93699, Test_f1: 0.58065
Epoch 90: Train_loss: 0.11095, Test_loss: 2.29612 | Train_f1: 0.95410, Test_f1: 0.56452
Epoch 100: Train_loss: 0.08416, Test_loss: 2.49353 | Train_f1: 0.97300, Test_f1: 0.57258
Epoch 110: Train_loss: 0.06043, 

In [15]:
torch.save(obj = model.state_dict(), f=MODEL_SAVE_PATH)

In [16]:
model.eval()

with torch.no_grad():
    y_test_logits = model(X_test_df)

y_test_probabilities = torch.softmax(y_test_logits, dim=1)
_, y_test_pred = torch.max(y_test_probabilities, 1)


In [17]:
label_encoder.classes_

array(['died', 'euthanized', 'lived'], dtype=object)

In [18]:
submission = pd.DataFrame()
submission["id"] = test_df.index
submission["outcome"] = label_encoder.inverse_transform(y_test_pred)

In [19]:
submission.to_csv("submission.csv", index=False)