In [111]:
import argparse

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, confusion_matrix
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [112]:
# Mount Google Drive at /content/drive (Colab-only helper module).
# The default CSV paths used further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [113]:
def load_data(train_csv, val_csv, test_csv):
    """Read the three CSV splits and return standardized feature arrays.

    Args:
        train_csv: Path to the training CSV. Must contain the feature
            columns and the ``order0`` target column.
        val_csv: Path to the validation CSV, with the same feature columns
            and the ``order0`` target column.
        test_csv: Path to the test CSV. Only the feature columns are read.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test)`` of NumPy arrays.
        The feature arrays are standardized with statistics computed on the
        training split only.
    """
    # Load the data from the paths supplied by the caller (previously these
    # arguments were ignored in favour of hard-coded Drive paths).
    train_data = pd.read_csv(train_csv)
    val_data = pd.read_csv(val_csv)
    test_data = pd.read_csv(test_csv)

    # Select the features and the target variable: every training column
    # whose name starts with 'y' is a feature; the label column is 'order0'.
    features = [col for col in train_data.columns if col.startswith('y')]
    target = 'order0'

    X_train = train_data[features].values
    y_train = train_data[target].values
    X_val = val_data[features].values
    y_val = val_data[target].values
    X_test = test_data[features].values

    # Normalize the data: fit the scaler on the training split only, then
    # reuse the same statistics for validation and test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return X_train, y_train, X_val, y_val, X_test


In [114]:
# PyTorch Dataset wrapper
class StarDataset(Dataset):
    """In-memory dataset of feature rows with optional integer labels.

    Features are stored as a ``float32`` tensor. When ``targets`` is given it
    is stored as a ``long`` tensor and items are ``(features, target)`` pairs;
    otherwise items are bare feature tensors.
    """

    def __init__(self, features, targets=None):
        self.features = torch.tensor(features, dtype=torch.float32)
        if targets is None:
            self.targets = None
        else:
            self.targets = torch.tensor(targets, dtype=torch.long)

    def __len__(self):
        # One sample per row of the feature tensor.
        return len(self.features)

    def __getitem__(self, idx):
        row = self.features[idx]
        if self.targets is None:
            # Unlabelled mode: only the features are returned.
            return row
        return row, self.targets[idx]


In [115]:
# Model definition
class MLP(nn.Module):
    """Fully connected classifier: input -> 128 -> 64 -> ``num_classes``.

    ReLU follows each of the two hidden layers. The final linear layer's
    output is returned as-is (no softmax is applied inside the module).
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        wide, narrow = 128, 64
        # Layers are created in forward order so parameter initialization
        # consumes the RNG in the same sequence as the layer stack.
        stack = [
            nn.Linear(input_size, wide),
            nn.ReLU(),
            nn.Linear(wide, narrow),
            nn.ReLU(),
            nn.Linear(narrow, num_classes),
        ]
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        scores = self.model(x)
        return scores

In [116]:
def init_model(input_size, lr, num_classes=3):
    """Build the MLP together with its loss function and optimizer.

    Args:
        input_size: Number of input features per sample.
        lr: Learning rate passed to the Adam optimizer.
        num_classes: Number of output classes. Defaults to 3, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        Tuple ``(model, criterion, optimizer)``: an ``MLP`` instance, an
        ``nn.CrossEntropyLoss`` and an ``optim.Adam`` over the model's
        parameters.
    """
    model = MLP(input_size, num_classes=num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    return model, criterion, optimizer


In [117]:
def evaluate(model, X, y, batch_size=64):
    """Score ``model`` on a labelled dataset.

    The model is switched to eval mode and run without gradient tracking.
    The predicted class for each sample is the arg-max over dimension 1 of
    the model output.

    Args:
        model: Module called as ``model(X_batch)``.
        X: Feature array accepted by ``StarDataset``.
        y: Label array accepted by ``StarDataset``.
        batch_size: Number of samples per inference batch. Defaults to 64,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(predictions, accuracy, conf_matrix)``: the list of predicted
        class indices in dataset order, the accuracy score, and the confusion
        matrix of targets versus predictions.
    """
    model.eval()
    dataset = StarDataset(X, y)
    # shuffle=False keeps predictions aligned with the input row order.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    predictions = []
    targets = []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            outputs = model(X_batch)
            preds = torch.argmax(outputs, dim=1).numpy()
            predictions.extend(preds)
            targets.extend(y_batch.numpy())

    accuracy = accuracy_score(targets, predictions)
    conf_matrix = confusion_matrix(targets, predictions)
    return predictions, accuracy, conf_matrix

In [118]:
# Model training
def train(model, criterion, optimizer, X_train, y_train, X_val, y_val, epochs, batch_size):
    """Train ``model`` and log mean train loss and validation accuracy per epoch.

    Args:
        model: Module to optimize; it is updated in place.
        criterion: Loss called as ``criterion(outputs, y_batch)``.
        optimizer: Optimizer bound to the model's parameters.
        X_train: Training features accepted by ``StarDataset``.
        y_train: Training labels accepted by ``StarDataset``.
        X_val: Validation features, passed to ``evaluate``.
        y_val: Validation labels, passed to ``evaluate``.
        epochs: Number of full passes over the training data.
        batch_size: Mini-batch size for the training loader.

    Returns:
        The same ``model`` object after training.
    """
    train_dataset = StarDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Validation is delegated to evaluate(), which builds its own loader, so
    # no separate validation loader is constructed here.

    # Avoid dividing by zero when the training set yields no batches.
    num_batches = max(len(train_loader), 1)

    for epoch in range(epochs):
        model.train()  # evaluate() leaves the model in eval mode
        train_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        _, val_acc, _ = evaluate(model, X_val, y_val)
        # Mean of the per-batch losses accumulated during this epoch.
        mean_loss = train_loss / num_batches
        print(f"Epoch {epoch + 1}/{epochs}, "
              f"train loss: {mean_loss:.4f}, "
              f"val accuracy: {val_acc:.4f}")

    return model

In [119]:
def main(args):
    """Run the full pipeline: load data, train, predict on test, write CSV.

    Args:
        args: Namespace providing ``train_csv``, ``val_csv``, ``test_csv``,
            ``out_csv``, ``lr``, ``batch_size`` and ``num_epoches``.
    """
    X_train, y_train, X_val, y_val, X_test = load_data(
        args.train_csv, args.val_csv, args.test_csv
    )

    # The number of feature columns determines the model's input width.
    n_features = X_train.shape[1]
    model, criterion, optimizer = init_model(n_features, args.lr)

    model = train(
        model, criterion, optimizer,
        X_train, y_train, X_val, y_val,
        args.num_epoches, args.batch_size,
    )

    # Inference on the unlabelled test split: the dataset yields bare
    # feature tensors, so each batch is passed straight to the model.
    model.eval()
    test_loader = DataLoader(StarDataset(X_test), batch_size=args.batch_size, shuffle=False)

    test_predictions = []
    with torch.no_grad():
        for X_batch in test_loader:
            batch_classes = torch.argmax(model(X_batch), dim=1)
            test_predictions.extend(batch_classes.numpy())

    # Save predictions
    pd.DataFrame({'target': test_predictions}).to_csv(args.out_csv, index=False)
    print(f"Predictions saved to {args.out_csv}")

In [120]:

if __name__ == '__main__':
    # Entry point: build the CLI, parse it, and run the pipeline.
    parser = argparse.ArgumentParser()

    # Define the arguments
    parser.add_argument('--train_csv', default='/content/drive/MyDrive/ML_2024/hw_1/train.csv')
    parser.add_argument('--val_csv', default='/content/drive/MyDrive/ML_2024/hw_1/val.csv')
    parser.add_argument('--test_csv', default='/content/drive/MyDrive/ML_2024/hw_1/test.csv')
    parser.add_argument('--out_csv', default='/content/drive/MyDrive/ML_2024/hw_1/submission.csv')
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--batch_size', type=int, default=1024)
    parser.add_argument('--num_epoches', type=int, default=10)

    # Parse the arguments. parse_known_args() is used instead of parse_args()
    # so that argv entries this parser does not define (a notebook kernel
    # launcher typically passes "-f <connection file>") are ignored instead
    # of aborting the cell with an "unrecognized arguments" error.
    args, _unknown = parser.parse_known_args()
    main(args)

Epoch 1/10,val accuracy: 0.7171
Epoch 2/10,val accuracy: 0.7425
Epoch 3/10,val accuracy: 0.7913
Epoch 4/10,val accuracy: 0.7975
Epoch 5/10,val accuracy: 0.8062
Epoch 6/10,val accuracy: 0.8093
Epoch 7/10,val accuracy: 0.8137
Epoch 8/10,val accuracy: 0.8140
Epoch 9/10,val accuracy: 0.8195
Epoch 10/10,val accuracy: 0.8268
Predictions saved to /content/drive/MyDrive/ML_2024/hw_1/submission.csv
