# Код для повторения на Kaggle

In [1]:
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim

from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"Random seed set as {seed}")


set_seed(42)

Random seed set as 42


# Загрузка датасета

In [3]:
df = pd.read_csv("/kaggle/input/df-train/train.csv")

df.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,35.0,175.0,75.0,86.5,1.2,1.2,1.0,1.0,127.0,...,58.0,108.0,15.6,1.0,0.9,17.0,14.0,21.0,0.0,0.0
1,1,45.0,155.0,60.0,82.0,1.2,1.0,1.0,1.0,129.0,...,50.0,110.0,14.0,1.0,0.7,22.0,18.0,14.0,0.0,0.0
2,2,35.0,175.0,60.0,74.0,1.2,1.2,1.0,1.0,100.0,...,58.0,116.0,14.8,1.0,0.9,20.0,15.0,16.0,0.0,1.0
3,3,60.0,160.0,55.0,74.0,1.2,1.5,1.0,1.0,139.0,...,73.0,95.0,15.1,1.0,0.7,47.0,31.0,15.0,0.0,0.0
4,4,40.0,160.0,55.0,71.0,0.9,1.2,1.0,1.0,100.0,...,66.0,103.0,13.1,1.0,0.6,24.0,21.0,13.0,0.0,0.0


In [4]:
df.shape

(15000, 24)

# Подготовка данных

In [5]:
# Размеры выборок
train_size = 0.6
val_size = 0.2
test_size = 0.2

# Разделение признаков и целевой переменной
X = df.drop(columns=["smoking"])
y = df["smoking"]

# Разбивка
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=train_size, random_state=12, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=(test_size / (test_size + val_size)),
    random_state=12,
    stratify=y_temp,
)

# Вывод результата
print(f"Всего ({1.0*100}%): {df.shape[0]}")
print(f"Тренировачный ({train_size*100}%): {X_train.shape[0]}")
print(f"Валидационный ({val_size*100}%): {X_val.shape[0]}")
print(f"Тестовый ({test_size*100}%): {X_test.shape[0]}")

Всего (100.0%): 15000
Тренировачный (60.0%): 9000
Валидационный (20.0%): 3000
Тестовый (20.0%): 3000


In [6]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.Tensor(X.values)
        self.y = torch.from_numpy(y.values).reshape(-1, 1).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, index):
        return (self.X[index], self.y[index])

In [7]:
train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_val, y_val)
test_dataset = CustomDataset(X_test, y_test)

In [8]:
batch_size = 16 # Лучшая сходимость по сравнение с 32 и 64

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Нейросетевая модель

In [9]:
# Пробовал разные комбинации по количество слоев и признаков, не удается выбить больше
class CustomModel(nn.Module):
    def __init__(self, inputs_size, out_size):
        super(CustomModel, self).__init__()
        
        self.linear_1 = nn.Linear(inputs_size, 500)
        self.linear_2 = nn.Linear(500, 1000)
        self.linear_3 = nn.Linear(1000, 500)
        self.linear_4 = nn.Linear(500, out_size)
        
        self.bn_1 = nn.BatchNorm1d(500)
        self.bn_2 = nn.BatchNorm1d(1000)
        self.bn_3 = nn.BatchNorm1d(500)
        
        self.dropout = nn.Dropout(p=0.3) # выше и ниже дает хуже сходимость
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.linear_1(x)
        x = self.bn_1(x)
        x = self.relu(x)
        
        x = self.dropout(x)
        x = self.linear_2(x)
        x = self.bn_2(x)
        x = self.relu(x)
        
        x = self.dropout(x)
        x = self.linear_3(x)
        x = self.bn_3(x)
        x = self.relu(x)
        
        x = self.dropout(x)
        x = self.linear_4(x)
        x = self.sigmoid(x)
        
        return x

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [11]:
model = CustomModel(inputs_size=23, out_size=1)

# Код обучения и оценки модели

In [12]:
def train_loop(model, train_dataloader, val_dataloader, epochs=20, lr=0.001): # данные lr дает лучше сходимость
    model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr) # SGD не дает пробить 0,75
    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        # Обучающая часть цикла
        for batch in (
            tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs} [Training]")
            if ((epoch + 1) % 100 == 0 or epoch == 0)
            else train_dataloader
        ):
            X_batch, y_batch = batch
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * X_batch.size(0)
        train_loss /= len(train_dataloader.dataset)

        # Валидационная часть цикла
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for batch in (
                tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{epochs} [Training]")
                if ((epoch + 1) % 100 == 0 or epoch == 0)
                else val_dataloader
            ):
                X_batch, y_batch = batch
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                y_pred = model(X_batch)
                loss = criterion(y_pred, y_batch)
                val_loss += loss.item() * X_batch.size(0)
        val_loss /= len(val_dataloader.dataset)
        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(
                f"Epoch [{epoch+1}/{epochs}] - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}"
            )
    print("Обучение завершено!")

In [13]:
def evaluate_model(model, test_dataloader):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            predicted = (outputs > 0.5).float().cpu().numpy()
            all_predictions.extend(predicted)
            all_labels.extend(labels.numpy())

    return np.array(all_predictions), np.array(all_labels)

# Обучение модели и проверка качества

In [14]:
%%time
train_loop(model, train_dataloader, val_dataloader, 200)

Epoch 1/200 [Training]:   0%|          | 0/563 [00:00<?, ?it/s]

Epoch 1/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch [1/200] - Train Loss: 0.5558 - Val Loss: 0.4901
Epoch [5/200] - Train Loss: 0.4870 - Val Loss: 0.4459
Epoch [10/200] - Train Loss: 0.4809 - Val Loss: 0.4685
Epoch [15/200] - Train Loss: 0.4685 - Val Loss: 0.4433
Epoch [20/200] - Train Loss: 0.4518 - Val Loss: 0.4263
Epoch [25/200] - Train Loss: 0.4467 - Val Loss: 0.5419
Epoch [30/200] - Train Loss: 0.4443 - Val Loss: 0.4368
Epoch [35/200] - Train Loss: 0.4415 - Val Loss: 0.4228
Epoch [40/200] - Train Loss: 0.4365 - Val Loss: 0.4127
Epoch [45/200] - Train Loss: 0.4376 - Val Loss: 0.4139
Epoch [50/200] - Train Loss: 0.4352 - Val Loss: 0.4332
Epoch [55/200] - Train Loss: 0.4304 - Val Loss: 0.4345
Epoch [60/200] - Train Loss: 0.4361 - Val Loss: 0.4135
Epoch [65/200] - Train Loss: 0.4277 - Val Loss: 0.4277
Epoch [70/200] - Train Loss: 0.4297 - Val Loss: 0.4153
Epoch [75/200] - Train Loss: 0.4281 - Val Loss: 0.4208
Epoch [80/200] - Train Loss: 0.4311 - Val Loss: 0.4260
Epoch [85/200] - Train Loss: 0.4280 - Val Loss: 0.4145
Epoch [90/20

Epoch 100/200 [Training]:   0%|          | 0/563 [00:00<?, ?it/s]

Epoch 100/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch [100/200] - Train Loss: 0.4236 - Val Loss: 0.4179
Epoch [105/200] - Train Loss: 0.4233 - Val Loss: 0.4297
Epoch [110/200] - Train Loss: 0.4224 - Val Loss: 0.4144
Epoch [115/200] - Train Loss: 0.4244 - Val Loss: 0.4139
Epoch [120/200] - Train Loss: 0.4207 - Val Loss: 0.4148
Epoch [125/200] - Train Loss: 0.4213 - Val Loss: 0.4159
Epoch [130/200] - Train Loss: 0.4217 - Val Loss: 0.4204
Epoch [135/200] - Train Loss: 0.4203 - Val Loss: 0.4177
Epoch [140/200] - Train Loss: 0.4191 - Val Loss: 0.4159
Epoch [145/200] - Train Loss: 0.4186 - Val Loss: 0.4146
Epoch [150/200] - Train Loss: 0.4195 - Val Loss: 0.4139
Epoch [155/200] - Train Loss: 0.4191 - Val Loss: 0.4172
Epoch [160/200] - Train Loss: 0.4143 - Val Loss: 0.4127
Epoch [165/200] - Train Loss: 0.4192 - Val Loss: 0.4109
Epoch [170/200] - Train Loss: 0.4173 - Val Loss: 0.4149
Epoch [175/200] - Train Loss: 0.4125 - Val Loss: 0.4113
Epoch [180/200] - Train Loss: 0.4185 - Val Loss: 0.4158
Epoch [185/200] - Train Loss: 0.4127 - Val Loss:

Epoch 200/200 [Training]:   0%|          | 0/563 [00:00<?, ?it/s]

Epoch 200/200 [Training]:   0%|          | 0/188 [00:00<?, ?it/s]

Epoch [200/200] - Train Loss: 0.4176 - Val Loss: 0.4228
Обучение завершено!
CPU times: user 5min 27s, sys: 4.43 s, total: 5min 32s
Wall time: 5min 37s


In [15]:
y_pred, y_true = evaluate_model(model, test_dataloader)

print(
    classification_report(
        y_true,
        y_pred,
    )
)

              precision    recall  f1-score   support

         0.0       0.81      0.84      0.83      1889
         1.0       0.71      0.68      0.69      1111

    accuracy                           0.78      3000
   macro avg       0.76      0.76      0.76      3000
weighted avg       0.78      0.78      0.78      3000



In [16]:
auc_score = roc_auc_score(y_true, y_pred)

auc_score

0.7569885628054599

# Подготовка ответа

In [17]:
df_test = pd.read_csv("/kaggle/input/df-test/test.csv")

df_test.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,15000,25.0,165.0,65.0,85.0,1.2,1.2,1.0,1.0,128.0,...,92.0,41.0,132.0,15.0,1.0,1.1,34.0,23.0,14.0,0.0
1,15001,45.0,165.0,60.0,74.0,1.5,1.0,1.0,1.0,104.0,...,124.0,54.0,129.0,11.3,1.0,0.7,20.0,17.0,11.0,0.0
2,15002,65.0,155.0,55.0,72.0,0.8,0.6,1.0,1.0,130.0,...,103.0,76.0,128.0,14.4,1.0,0.8,38.0,18.0,24.0,1.0
3,15003,30.0,170.0,85.0,88.0,0.7,0.9,1.0,1.0,119.0,...,212.0,44.0,117.0,14.8,1.0,1.1,26.0,38.0,19.0,0.0
4,15004,40.0,155.0,50.0,70.0,0.9,0.8,1.0,1.0,102.0,...,87.0,68.0,130.0,13.3,1.0,0.9,18.0,12.0,14.0,0.0


In [18]:
class CustomDatasetTest(Dataset):
    def __init__(self, X):
        self.X = torch.Tensor(X.values)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, index):
        return self.X[index]

In [19]:
answer_dataset = CustomDatasetTest(df_test)

answer_dataloader = DataLoader(answer_dataset, batch_size=batch_size, shuffle=False)

In [20]:
def evaluate_model_answer(model, test_dataloader):
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs in test_dataloader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            predicted = (outputs > 0.5).float().cpu().numpy()
            all_predictions.extend(predicted)

    return np.array(all_predictions), np.array(all_labels)

In [21]:
y_pred_answer, _ = evaluate_model_answer(model, answer_dataloader)

In [22]:
df_test["smoking"] = y_pred_answer

In [23]:
df_test[["id", "smoking"]].to_csv("answer.csv", index=False)