In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
import pandas as pd
from tqdm import tqdm
import os

# True when the KAGGLE_KERNEL_RUN_TYPE environment variable is present.
IS_KAGGLE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None
# Mode switch: True selects the inference branch, False the training branch.
IS_INFERENCE = True
# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Directory that holds the model artifacts used by this notebook: the
# normalisation statistics (train_mean.npy / train_std.npy), the model
# checkpoint (linear_regression_nn.pth) and the training log CSV.
BASE_MODEL_DIR = '/kaggle/input/nn-linear-regression-l2-models/linear_regression_nn_l2'

In [3]:
def _match_shape(tensor, reference):
    """Reshape `tensor` to `reference`'s shape when they differ only by size-1 dims."""
    if tensor.shape != reference.shape and tensor.squeeze().shape == reference.squeeze().shape:
        return tensor.reshape(reference.shape)
    return tensor


def weighted_r2_score(y_true, y_pred, weights):
    """
    Calculate the weighted R² score (zero-mean form).

    R² = 1 - Σ wᵢ(yᵢ - ŷᵢ)² / Σ wᵢyᵢ²

    Inputs whose shapes differ from ``y_true`` only by singleton dimensions
    (e.g. ``(N,)`` vs ``(N, 1)``) are reshaped to ``y_true``'s shape first, so
    they are combined element-wise instead of silently broadcasting to an
    ``(N, N)`` matrix. Any other shape combination is left to normal
    broadcasting, exactly as before.

    Args:
        y_true: Tensor of targets.
        y_pred: Tensor of predictions.
        weights: Tensor of sample weights.

    Returns:
        The score as a Python float. The result is not finite when the
        weighted sum of squared targets is zero.
    """
    # The score is a metric only; no gradient needs to flow through it.
    with torch.no_grad():
        y_pred = _match_shape(y_pred, y_true)
        weights = _match_shape(weights, y_true)

        numerator = torch.sum(weights * (y_true - y_pred) ** 2)
        denominator = torch.sum(weights * y_true ** 2)
        r2 = 1 - numerator / denominator
    return r2.item()


In [4]:
class StockDataset(Dataset):
    """Dataset of standardised `feature_00`..`feature_78` columns with weights.

    Building a training dataset (``is_test=False``) computes the per-feature
    mean and standard deviation, stores them on the class and writes them to
    ``BASE_MODEL_DIR``. Building a test dataset (``is_test=True``) reads those
    two files back and applies the same standardisation.

    Each item is a ``(features, target, weight)`` triple; test datasets have
    no target and return a zero placeholder in its place.
    """

    # Normalisation statistics shared by all instances (set in __init__).
    train_mean = None
    train_std = None

    def __init__(self, df, is_test=False):
        """
        Args:
            df: DataFrame exposing ``select(...).fill_null(...).to_numpy()``
                (a polars DataFrame in this notebook). It must contain the 79
                feature columns and ``weight``, plus ``responder_6`` when
                ``is_test`` is False.
            is_test: When True, load saved statistics instead of computing
                and saving them, and do not read a target column.
        """
        feature_columns = [f"feature_{i:02d}" for i in range(79)]
        features = df.select(feature_columns).fill_null(0).to_numpy()

        if not is_test:
            # Training data: compute the statistics and keep them on the class.
            StockDataset.train_mean = np.mean(features, axis=0)
            StockDataset.train_std = np.std(features, axis=0)

            # Persist them so that inference can apply the same scaling.
            np.save(f'{BASE_MODEL_DIR}/train_mean.npy', StockDataset.train_mean)
            np.save(f'{BASE_MODEL_DIR}/train_std.npy', StockDataset.train_std)
        else:
            # Inference: load the statistics saved at training time.
            StockDataset.train_mean = np.load(f'{BASE_MODEL_DIR}/train_mean.npy')
            StockDataset.train_std = np.load(f'{BASE_MODEL_DIR}/train_std.npy')

        # A feature with zero standard deviation would divide by zero and turn
        # into NaN/inf; scale such columns by 1 instead (the stored statistics
        # themselves are left untouched).
        safe_std = np.where(StockDataset.train_std == 0, 1.0, StockDataset.train_std)

        # Standardise. The result is a fresh array, so as_tensor can wrap it
        # without an additional copy.
        standardised = (features - StockDataset.train_mean) / safe_std
        self.features = torch.as_tensor(standardised, dtype=torch.float32)

        # torch.tensor copies its input, which avoids wrapping arrays that the
        # DataFrame library may hand back as read-only.
        self.weights = torch.tensor(df.select(["weight"]).to_numpy(), dtype=torch.float32)

        if not is_test:
            self.target = torch.tensor(df.select(["responder_6"]).to_numpy(), dtype=torch.float32)

    def __len__(self):
        """Return the number of rows."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(features, target, weight)`` for row ``idx``.

        For datasets built with ``is_test=True`` the target is ``torch.zeros(1)``.
        """
        if hasattr(self, 'target'):
            return self.features[idx], self.target[idx], self.weights[idx]
        return self.features[idx], torch.zeros(1), self.weights[idx]


In [5]:
class WeightedMSELoss(nn.Module):
    """Sample-weighted mean squared error plus an L2 penalty on the model parameters."""

    def __init__(self, l2_lambda=0.01):
        """
        Args:
            l2_lambda: Multiplier applied to the L2 penalty term.
        """
        super().__init__()
        self.l2_lambda = l2_lambda

    def forward(self, pred, target, weights, model):
        """Return ``mean(weights * (pred - target)^2) + l2_lambda * Σ p²``.

        The penalty sums the squares of every tensor yielded by
        ``model.parameters()``.
        """
        # Data term: element-wise weighted squared error, averaged.
        squared_error = (pred - target) ** 2
        data_term = (weights * squared_error).mean()

        # Penalty term: accumulated squared magnitude of all parameters.
        penalty = 0
        for param in model.parameters():
            penalty = penalty + param.pow(2).sum()

        return data_term + self.l2_lambda * penalty

In [6]:
class LinearRegression(nn.Module):
    """A single linear layer mapping `input_dim` features to one output value."""

    def __init__(self, input_dim):
        """
        Args:
            input_dim: Number of input features per sample.
        """
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer to `x` and return the result."""
        prediction = self.linear(x)
        return prediction

In [7]:
import matplotlib.pyplot as plt
def _draw_training_curves(ax_loss, ax_r2, losses, r2_scores):
    """Redraw the per-epoch loss and R² curves on the two given axes."""
    panels = (
        (ax_loss, losses, 'b-', 'Training Loss', 'Loss'),
        (ax_r2, r2_scores, 'r-', 'R² Score', 'R²'),
    )
    for ax, history, line_style, title, y_label in panels:
        ax.clear()
        ax.plot(history, line_style)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.grid(True)


def train_model(model, train_loader, criterion, optimizer, num_epochs):
    """Train `model` for `num_epochs` epochs and log loss / weighted R².

    After every epoch the loss and R² curves are redrawn and saved to
    ``training_plots/epoch_XXX.png``. When training finishes, the per-epoch
    metrics are written to ``{BASE_MODEL_DIR}/training_results.csv``.

    Args:
        model: Module to optimise; it is also passed to `criterion`.
        train_loader: Iterable of ``(features, targets, weights)`` batches
            that supports ``len()``.
        criterion: Callable invoked as
            ``criterion(outputs, targets, weights, model)`` returning the loss.
        optimizer: Optimiser stepping the model's parameters.
        num_epochs: Number of passes over `train_loader`.
    """
    print("Training started...")

    # Per-epoch history.
    losses = []
    r2_scores = []

    # Directory for the per-epoch plot images.
    os.makedirs('training_plots', exist_ok=True)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        all_targets = []
        all_outputs = []
        all_weights = []

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}')

        for batch_features, batch_targets, batch_weights in progress_bar:
            batch_features = batch_features.to(device)
            batch_targets = batch_targets.to(device)
            batch_weights = batch_weights.to(device)

            outputs = model(batch_features)
            loss = criterion(outputs, batch_targets, batch_weights, model)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss

            # Keep only detached CPU copies for the epoch metric. Storing the
            # raw outputs would retain every batch's autograd graph (and its
            # device memory) until the end of the epoch.
            all_targets.append(batch_targets.detach().cpu())
            all_outputs.append(outputs.detach().cpu())
            all_weights.append(batch_weights.detach().cpu())

            progress_bar.set_postfix({'batch_loss': f'{batch_loss:.4f}'})

        # Epoch metrics.
        avg_loss = total_loss / len(train_loader)
        y_true = torch.cat(all_targets)
        y_pred = torch.cat(all_outputs)
        weights = torch.cat(all_weights)
        r2 = weighted_r2_score(y_true, y_pred, weights)

        losses.append(avg_loss)
        r2_scores.append(r2)

        _draw_training_curves(ax1, ax2, losses, r2_scores)
        fig.tight_layout()

        # Save the curves as they stand after this epoch.
        fig.savefig(f'training_plots/epoch_{epoch+1:03d}.png')

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, R² Score: {r2:.4f}')
        print('-' * 50)

    # Persist the full history as CSV.
    results_df = pd.DataFrame({
        'epoch': range(1, num_epochs + 1),
        'loss': losses,
        'r2_score': r2_scores
    })
    results_df.to_csv(f'{BASE_MODEL_DIR}/training_results.csv', index=False)

    print("Training completed. Results saved to 'training_results.csv'")
    print("Training plots saved in 'training_plots' directory")

In [8]:
# Most recent non-None `lags` frame passed to predict().
lags_: pl.DataFrame | None = None

def predict(test: pl.DataFrame, lags: pl.DataFrame | None = None) -> pl.DataFrame:
    """Competition prediction function.

    Standardises the rows of `test` with the saved training statistics, runs
    the module-level `model` on them and returns the predictions clipped to
    [-5, 5].

    Args:
        test: Frame with `row_id`, `weight` and the feature columns read by
            `StockDataset`.
        lags: Optional lags frame; when given it is stored in the global
            `lags_`. It is not otherwise used here.

    Returns:
        A polars DataFrame with columns `row_id` and `responder_6`, one row
        per row of `test`.
    """
    global lags_

    # Keep the original row ids so the output lines up with the input.
    row_ids = test['row_id'].to_numpy()

    if lags is not None:
        lags_ = lags

    test_dataset = StockDataset(test, is_test=True)
    test_loader = DataLoader(
        test_dataset,
        batch_size=4096,
        # Pinned host memory only helps host-to-GPU copies; requesting it
        # without CUDA just triggers a warning.
        pin_memory=device.type == 'cuda'
    )

    # `model` is the module-level network; it is only read here.
    model.eval()
    batch_outputs = []

    with torch.no_grad():
        for features, _, _ in tqdm(test_loader, desc="Predicting"):
            features = features.to(device)
            batch_outputs.append(model(features).cpu())

    if batch_outputs:
        predictions = torch.cat(batch_outputs).numpy()
    else:
        # torch.cat rejects an empty list; an empty `test` yields no rows.
        predictions = np.empty((0, 1), dtype=np.float32)
    predictions = np.clip(predictions, -5, 5)

    result = pl.DataFrame({
        'row_id': row_ids,
        'responder_6': predictions.flatten()  # 2-D (N, 1) array to 1-D
    })

    print("Prediction completed.")
    print(result)
    print(result.shape)

    # Validation checks
    assert isinstance(result, (pl.DataFrame, pd.DataFrame))
    assert result.columns == ['row_id', 'responder_6']
    assert len(result) == len(test)

    return result

In [9]:
# Number of feature columns fed to the model (feature_00 .. feature_78).
input_dim = 79

if not IS_INFERENCE:
    # Training mode.
    # StockDataset, train_model and torch.save all write into BASE_MODEL_DIR,
    # so make sure it exists before anything is saved.
    os.makedirs(BASE_MODEL_DIR, exist_ok=True)

    # Load the data and build the dataset / loader.
    train = pl.read_parquet('kaggle/data/train.parquet')
    dataset = StockDataset(train)
    train_loader = DataLoader(
        dataset,
        batch_size=4096,
        # Pinned memory is only useful when batches are copied to a GPU.
        pin_memory=device.type == 'cuda'
    )

    # Model, loss (with L2 penalty strength) and optimiser.
    model = LinearRegression(input_dim).to(device)
    criterion = WeightedMSELoss(l2_lambda=0.01)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)

    # Train.
    num_epochs = 5
    train_model(model, train_loader, criterion, optimizer, num_epochs)

    # Save the checkpoint.
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, f'{BASE_MODEL_DIR}/linear_regression_nn.pth')
    print(f"Model saved to '{BASE_MODEL_DIR}/linear_regression_nn.pth'")

else:
    # Inference mode: rebuild the model and load the saved weights.
    model = LinearRegression(input_dim).to(device)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    checkpoint = torch.load(
        f'{BASE_MODEL_DIR}/linear_regression_nn.pth',
        map_location=device,
        weights_only=True,
    )
    model.load_state_dict(checkpoint['model_state_dict'])

    if IS_KAGGLE:
        import kaggle_evaluation.jane_street_inference_server

        inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

        if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
            inference_server.serve()
        else:
            inference_server.run_local_gateway(
                (
                    '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
                    '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
                )
            )
    else:
        # Outside Kaggle: run one prediction pass over the local sample files.
        test = pl.read_parquet('kaggle/data/test.parquet')
        lags = pl.read_parquet('kaggle/data/lags.parquet')
        predict(test, lags)

  self.weights = torch.FloatTensor(df.select(["weight"]).to_numpy())
Predicting: 100%|██████████| 1/1 [00:00<00:00,  7.25it/s]

Prediction completed.
shape: (39, 2)
┌────────┬─────────────┐
│ row_id ┆ responder_6 │
│ ---    ┆ ---         │
│ i64    ┆ f32         │
╞════════╪═════════════╡
│ 0      ┆ 0.020922    │
│ 1      ┆ 0.020922    │
│ 2      ┆ 0.020922    │
│ 3      ┆ 0.020922    │
│ 4      ┆ 0.020922    │
│ …      ┆ …           │
│ 34     ┆ 0.020922    │
│ 35     ┆ 0.020922    │
│ 36     ┆ 0.020922    │
│ 37     ┆ 0.020922    │
│ 38     ┆ 0.020922    │
└────────┴─────────────┘
(39, 2)



