<a href="https://colab.research.google.com/github/ilfattvru/ML-DL/blob/main/Titanic_kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Конкурс на Kagle
Ссылка: https://www.kaggle.com/competitions/titanic/overview

В работе использованы библиотеки pytorch, sklearn, numpy, pandas.

In [8]:
# !pip install kaggle
!pip install torch
# !kaggle competitions download titanic



In [52]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df['Age'] = train_df['Age'].fillna(train_df['Age'].median())
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].median())
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
train_df['Embarked'] = train_df['Embarked'].fillna('S')

label_encoder = LabelEncoder()
train_df['Sex'] = label_encoder.fit_transform(train_df['Sex'])
test_df['Sex'] = label_encoder.transform(test_df['Sex'])

train_df = pd.get_dummies(train_df, columns=['Embarked'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['Embarked'], drop_first=True)

train_df = train_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

X = train_df.drop('Survived', axis=1)
y = train_df['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train.values.astype('float32'))
y_train_tensor = torch.tensor(y_train.values.astype('float32')).unsqueeze(1)
X_val_tensor = torch.tensor(X_val.values.astype('float32'))
y_val_tensor = torch.tensor(y_val.values.astype('float32')).unsqueeze(1)


train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


class TitanicNN(nn.Module):
    def __init__(self):
        super(TitanicNN, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.sigmoid(self.fc3(x))
        return x

model = TitanicNN()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.008)

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Validation step
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

        print(f'Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}')
        model.train()

def evaluate_model(model, val_loader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            preds = (outputs >= 0.5).float()
            predictions.extend(preds.numpy())
            true_labels.extend(labels.numpy())
    predictions = np.array(predictions)
    true_labels = np.array(true_labels)

    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    roc_auc = roc_auc_score(true_labels, predictions)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")

train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50)

evaluate_model(model, val_loader)

Epoch 1/50, Loss: 0.9493, Val Loss: 0.6509
Epoch 2/50, Loss: 0.6553, Val Loss: 0.6006
Epoch 3/50, Loss: 0.6452, Val Loss: 0.6108
Epoch 4/50, Loss: 0.6282, Val Loss: 0.5757
Epoch 5/50, Loss: 0.6093, Val Loss: 0.5643
Epoch 6/50, Loss: 0.6113, Val Loss: 0.5713
Epoch 7/50, Loss: 0.5954, Val Loss: 0.5605
Epoch 8/50, Loss: 0.5860, Val Loss: 0.5746
Epoch 9/50, Loss: 0.6192, Val Loss: 0.6229
Epoch 10/50, Loss: 0.6200, Val Loss: 0.5520
Epoch 11/50, Loss: 0.5769, Val Loss: 0.5337
Epoch 12/50, Loss: 0.5883, Val Loss: 0.5203
Epoch 13/50, Loss: 0.5484, Val Loss: 0.5549
Epoch 14/50, Loss: 0.5684, Val Loss: 0.4942
Epoch 15/50, Loss: 0.5786, Val Loss: 0.5372
Epoch 16/50, Loss: 0.5542, Val Loss: 0.5031
Epoch 17/50, Loss: 0.5474, Val Loss: 0.4874
Epoch 18/50, Loss: 0.5178, Val Loss: 0.4962
Epoch 19/50, Loss: 0.5240, Val Loss: 0.4883
Epoch 20/50, Loss: 0.5143, Val Loss: 0.5004
Epoch 21/50, Loss: 0.5226, Val Loss: 0.4970
Epoch 22/50, Loss: 0.5109, Val Loss: 0.4796
Epoch 23/50, Loss: 0.5033, Val Loss: 0.48

Наибольшего результата roc-auc удалось добиться 0.8, хотя оно скачет от 0.74 до 0.8.

Сформируем файл для загрузки на сайт.

In [44]:
def generate_submission_file(model, test_df, original_test_df, output_file='submission.csv'):
    model.eval()
    test_tensor = torch.tensor(test_df.values.astype('float32'))

    with torch.no_grad():
        predictions = model(test_tensor)
        predictions = (predictions >= 0.5).float().numpy()

    submission_df = pd.DataFrame({
        'PassengerId': original_test_df['PassengerId'],
        'Survived': predictions.flatten().astype(int)
    })

    submission_df.to_csv(output_file, index=False)

original_test_df = pd.read_csv('test.csv')

generate_submission_file(model, test_df, original_test_df, output_file='submission.csv')

Submission file saved as submission.csv
