In [36]:
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch

In [37]:
class CNN1DModel(nn.Module):
    """1D-CNN regressor operating on a flat vector of numerical features.

    Forward pipeline:
      1. ``expand``: LayerNorm -> Dropout -> Linear -> ReLU, widening the
         input to ``hidden_size`` activations.
      2. The widened vector is viewed as
         ``(batch, hidden_size // 16, seq_length)`` so it can be convolved.
      3. ``conv1`` / ``conv2``: Conv1d -> ReLU -> Dropout; ``conv2`` ends with
         an adaptive average pool to a fixed length of 16.
      4. ``flatten`` + ``dense``: two linear layers producing ``n_target``
         values.
      5. ``5 * tanh(.)`` squashes every output into the open interval (-5, 5).

    Args:
        num_numerical_features: size of the last dimension of the input.
        hidden_size: width of the expand layer. Must be at least 16 and
            divisible by ``hidden_size // 16`` (every multiple of 16 is).
        n_target: number of regression outputs.
        channel_1: output channels of the first convolution.
        channel_2: output channels of the second convolution.
        kernel_size: kernel size shared by both convolutions.
        dropout_rate: dropout probability used throughout the network.

    Raises:
        ValueError: if ``hidden_size`` cannot be reshaped into
            ``(hidden_size // 16, seq_length)`` without a remainder.
    """

    def __init__(self,
                 num_numerical_features: int,
                 hidden_size: int = 1024,
                 n_target: int = 1,
                 channel_1: int = 64,
                 channel_2: int = 128,
                 kernel_size: int = 5,
                 dropout_rate: float = 0.2):
        super().__init__()

        # Fail fast: forward() views the expanded vector as
        # (hidden_size // 16, seq_length); a hidden_size that does not split
        # evenly would otherwise only blow up on the first forward pass.
        conv_in_channels = hidden_size // 16
        if conv_in_channels < 1 or hidden_size % conv_in_channels != 0:
            raise ValueError(
                f"hidden_size={hidden_size} is invalid: it must be >= 16 and "
                f"divisible by hidden_size // 16 (any multiple of 16 works)"
            )

        self.hidden_size = hidden_size  # needed by forward() for the reshape

        # 1. Expand stage: dense layer
        self.expand = nn.Sequential(
            nn.LayerNorm(num_numerical_features),
            nn.Dropout(dropout_rate),
            nn.Linear(num_numerical_features, hidden_size),
            nn.ReLU()
        )

        # 2. Conv block 1 ("same"-style padding for odd kernel sizes)
        self.conv1 = nn.Sequential(
            nn.Conv1d(
                in_channels=conv_in_channels,
                out_channels=channel_1,
                kernel_size=kernel_size,
                stride=1,
                padding=kernel_size // 2
            ),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        # 3. Conv block 2, pooled to a fixed length of 16 so the dense head
        #    has a known input size regardless of the sequence length.
        self.conv2 = nn.Sequential(
            nn.Conv1d(
                in_channels=channel_1,
                out_channels=channel_2,
                kernel_size=kernel_size,
                stride=1,
                padding=kernel_size // 2
            ),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.AdaptiveAvgPool1d(output_size=16)
        )

        # 4. Flatten and dense head
        self.flatten = nn.Flatten()
        self.dense = nn.Sequential(
            nn.Linear(channel_2 * 16, 640),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(640, n_target)
        )

        # Output squashing (scaled by 5 in forward())
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Run the network on ``x`` of shape ``(batch, num_numerical_features)``.

        Returns:
            Tensor of shape ``(batch, n_target)`` with values in (-5, 5).
        """
        # 1. Expand stage
        x = self.expand(x)

        # 2. Reshape to (batch, channels, seq_length) for Conv1d
        batch_size = x.size(0)
        in_channels = self.hidden_size // 16
        seq_length = x.size(1) // in_channels
        x = x.view(batch_size, in_channels, seq_length)

        # 3. Conv block 1
        x = self.conv1(x)

        # 4. Conv block 2
        x = self.conv2(x)

        # 5. Flatten and dense head, then bound the output to (-5, 5)
        x = self.flatten(x)
        x = self.dense(x)
        x = 5 * self.tanh(x)

        return x

In [38]:
def weighted_mse_loss(y_true, y_pred, weights):
    """
    Weighted mean squared error over all samples and targets.

    Each squared error is multiplied element-wise by ``weights`` (standard
    broadcasting applies) and the plain mean of the products is returned;
    the result is NOT normalised by the sum of the weights.

    Args:
        y_true: target values (batch_size, n_targets)
        y_pred: predicted values (batch_size, n_targets)
        weights: weights broadcastable against the squared error

    Returns:
        scalar tensor holding the weighted MSE
    """
    squared_error = torch.square(y_true - y_pred)
    weighted_error = weights * squared_error
    return torch.mean(weighted_error)

def weighted_r2_score(y_true, y_pred, weights):
    """
    Multi-target weighted R2 score.

    For every target column the score is
    ``1 - sum(w * (y - y_hat)^2) / sum(w * (y - weighted_mean(y))^2)``;
    the per-target scores are then averaged with equal weight.

    Args:
        y_true: target tensor (batch_size, n_targets)
        y_pred: prediction tensor (batch_size, n_targets)
        weights: weight tensor; accepted layouts are (batch_size,),
            (batch_size, 1) (one weight per sample, shared by all targets)
            and (batch_size, n_targets) (one weight per sample and target)

    Returns:
        weighted R2 score (numpy scalar). The value is nan/inf when a target
        has zero weighted variance in the batch, because the denominator is 0.
    """
    # Ensure inputs are on CPU and converted to numpy
    y_true = y_true.detach().cpu().numpy()
    y_pred = y_pred.detach().cpu().numpy()
    weights = weights.detach().cpu().numpy()

    # Per-sample weights given as a flat vector: add the target axis so they
    # broadcast across columns instead of being matched against them.
    if weights.ndim == 1 and y_true.ndim == 2:
        weights = weights[:, None]

    # Broadcast rather than np.repeat: repeat is only correct for
    # (batch, 1) weights and inflates (batch, n_targets) weights to
    # (batch, n_targets**2), which then fails to align with y_true.
    weights = np.broadcast_to(weights, y_true.shape)

    # Calculate weighted means for each target
    weighted_mean = np.average(y_true, weights=weights, axis=0)

    # Calculate total sum of squares
    total_ss = np.sum(weights * (y_true - weighted_mean) ** 2, axis=0)

    # Calculate residual sum of squares
    residual_ss = np.sum(weights * (y_true - y_pred) ** 2, axis=0)

    # Calculate R2 score for each target
    r2_scores = 1 - (residual_ss / total_ss)

    # Return mean R2 score across all targets
    return np.mean(r2_scores)

In [54]:
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

class CustomDataset(Dataset):
    """Tensor-backed dataset built once from a pandas DataFrame.

    All columns are converted to tensors in ``__init__``; ``__getitem__``
    only indexes into them and returns a dict per sample.
    """

    # (attribute name on the instance, DataFrame column / output key)
    _LONG_FIELDS = (
        ('symbol', 'symbol_id'),
        ('feature_09', 'feature_09'),
        ('feature_10', 'feature_10'),
        ('feature_11', 'feature_11'),
        ('time', 'time_id'),
    )

    def __init__(self, data, numerical_columns, target_columns, weight_columns=None):
        """
        Args:
            data: pandas DataFrame containing all features; must also hold
                the columns 'symbol_id', 'feature_09', 'feature_10',
                'feature_11' and 'time_id'
            numerical_columns: list of column names for numerical features
            target_columns: list of target column names
            weight_columns: list of weight column names (optional); when
                omitted every weight is 1 with the same shape as the targets
        """
        self.numerical_features = torch.FloatTensor(data[numerical_columns].values)

        # Integer-typed side columns, stored as LongTensors
        for attr_name, column in self._LONG_FIELDS:
            setattr(self, attr_name, torch.LongTensor(data[column].values))

        # Multi-target handling
        self.targets = torch.FloatTensor(data[target_columns].values)

        # Weight handling (optional)
        self.weights = (
            torch.FloatTensor(data[weight_columns].values)
            if weight_columns
            else torch.ones_like(self.targets)
        )

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        sample = {'numerical_features': self.numerical_features[idx]}
        for attr_name, key in self._LONG_FIELDS:
            sample[key] = getattr(self, attr_name)[idx]
        sample['targets'] = self.targets[idx]
        sample['weights'] = self.weights[idx]
        return sample

def create_data_loaders(train_data, valid_data, numerical_columns,
                        target_columns, weight_columns=None,
                        batch_size=256, num_workers=1):
    """
    Build the training and validation DataLoaders.

    Both frames are wrapped in a ``CustomDataset``; the training loader
    shuffles, the validation loader keeps the row order. Both loaders
    request pinned memory.

    Args:
        train_data: DataFrame holding the training rows
        valid_data: DataFrame holding the validation rows
        numerical_columns: list of numerical feature column names
        target_columns: list of target column names
        weight_columns: list of weight column names (optional)
        batch_size: batch size used by both loaders
        num_workers: number of worker processes used for data loading

    Returns:
        tuple ``(train_loader, valid_loader)``
    """
    # Wrap both frames first (training frame first, as before)
    train_dataset, valid_dataset = (
        CustomDataset(frame, numerical_columns, target_columns, weight_columns)
        for frame in (train_data, valid_data)
    )

    # Options common to both loaders; only `shuffle` differs
    shared_options = dict(
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=True,
    )
    train_loader = DataLoader(train_dataset, shuffle=True, **shared_options)
    valid_loader = DataLoader(valid_dataset, shuffle=False, **shared_options)

    return train_loader, valid_loader

def train_epoch(model, train_loader, optimizer, device):
    """Run one optimisation pass over ``train_loader``.

    Only the 'numerical_features', 'targets' and 'weights' entries of each
    batch are used; other keys the dataset may provide are ignored and no
    longer copied to the device.

    Args:
        model: network called as ``model(numerical_features)``
        train_loader: iterable of dict batches that also supports ``len()``
        optimizer: optimizer stepping the parameters of ``model``
        device: device the batch tensors are moved to

    Returns:
        tuple ``(mean_loss, mean_r2)``: per-batch weighted MSE and weighted R2
        averaged over the number of batches (every batch counts equally).

    Raises:
        ZeroDivisionError: if ``train_loader`` yields no batches.
    """
    model.train()
    total_loss = 0
    total_r2 = 0
    num_batches = len(train_loader)

    for batch_idx, batch in enumerate(tqdm(train_loader, desc="Training Batches")):
        # Move only the tensors that are actually consumed to the device
        numerical_features = batch['numerical_features'].to(device)
        targets = batch['targets'].to(device)
        weights = batch['weights'].to(device)

        # Forward pass
        outputs = model(numerical_features)

        # Loss and R2 score for this batch
        loss = weighted_mse_loss(targets, outputs, weights)
        r2 = weighted_r2_score(targets, outputs, weights)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_r2 += r2

        # Report running averages every 1000 batches
        if (batch_idx + 1) % 1000 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            avg_r2 = total_r2 / (batch_idx + 1)
            print(f'Batch [{batch_idx+1}/{num_batches}] Loss: {avg_loss:.4f}, R2: {avg_r2:.4f}')

    return total_loss / num_batches, total_r2 / num_batches

def validate(model, valid_loader, device):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Only the 'numerical_features', 'targets' and 'weights' entries of each
    batch are used. Running averages are printed every 1000 batches, from
    inside the loop.

    Args:
        model: network called as ``model(numerical_features)``
        valid_loader: iterable of dict batches that also supports ``len()``
        device: device the batch tensors are moved to

    Returns:
        tuple ``(mean_loss, mean_r2)``: per-batch weighted MSE and weighted R2
        averaged over the number of batches (every batch counts equally).

    Raises:
        ZeroDivisionError: if ``valid_loader`` yields no batches.
    """
    model.eval()
    total_loss = 0
    total_r2 = 0
    num_batches = len(valid_loader)

    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(valid_loader, desc="Validation Batches")):
            # Move only the tensors that are actually consumed to the device
            numerical_features = batch['numerical_features'].to(device)
            targets = batch['targets'].to(device)
            weights = batch['weights'].to(device)

            outputs = model(numerical_features)

            loss = weighted_mse_loss(targets, outputs, weights)
            r2 = weighted_r2_score(targets, outputs, weights)

            total_loss += loss.item()
            total_r2 += r2

            # Progress report belongs inside the loop: placed after it, it ran
            # at most once and referenced batch_idx even when no batch existed.
            if (batch_idx + 1) % 1000 == 0:
                avg_loss = total_loss / (batch_idx + 1)
                avg_r2 = total_r2 / (batch_idx + 1)
                print(f'Batch [{batch_idx+1}/{num_batches}] Loss: {avg_loss:.4f}, R2: {avg_r2:.4f}')

    return total_loss / num_batches, total_r2 / num_batches

In [40]:
import polars as pl
# Materialise the pre-processed training and validation splits as polars
# DataFrames (lazy scan followed by an eager collect), training split first.
train, valid = (
    pl.scan_parquet(parquet_path).collect()
    for parquet_path in (
        "/kaggle/input/js24-preprocessing-create-lags/training.parquet",
        "/kaggle/input/js24-preprocessing-create-lags/validation.parquet",
    )
)

In [41]:
import joblib

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Raw feature columns feature_00 .. feature_78 and the responder to predict
feature_train_list = [f"feature_{idx:02d}" for idx in range(79)]
target_col = "responder_6"

# Lag-1 responder columns responder_0_lag_1 .. responder_8_lag_1 are appended
# to the raw features to form the full training feature list
responder_lag_cols = [f"responder_{idx}_lag_1" for idx in range(9)]
feature_train = feature_train_list + responder_lag_cols

# Columns listed as categorical, and everything else (continuous)
feature_cat = ["feature_09", "feature_10", "feature_11"]
feature_cont = [name for name in feature_train if name not in feature_cat]
std_feature = [name for name in feature_train_list if name not in feature_cat] + responder_lag_cols

# Pre-computed statistics used for standardisation.
# NOTE: joblib.load unpickles the file, so only load artefacts from a trusted source.
data_stats = joblib.load("/kaggle/input/jane-street-data-preprocessing/data_stats.pkl")
means, stds = data_stats['mean'], data_stats['std']

def standardize(df, feature_cols, means, stds):
    """Return ``df`` with each column in ``feature_cols`` replaced by its z-score.

    Every listed column ``c`` becomes ``(c - means[c]) / stds[c]`` and keeps
    its name; columns not listed are left as they are.

    Args:
        df: frame exposing ``with_columns`` (the notebook passes polars DataFrames)
        feature_cols: names of the columns to rescale
        means: mapping from column name to the mean to subtract
        stds: mapping from column name to the standard deviation to divide by
    """
    scaled_exprs = []
    for name in feature_cols:
        centred = pl.col(name) - means[name]
        scaled_exprs.append((centred / stds[name]).alias(name))
    return df.with_columns(scaled_exprs)

# Model inputs: every training feature plus the three id columns.
# NOTE(review): this list also contains the categorical features and the id
# columns, so they are z-scored as well - confirm that this is intended.
numerical_columns = [*feature_train, 'date_id', 'symbol_id', 'time_id']
target_columns = ['responder_6']  # targets to predict
weight_columns = ['weight']  # weight column(s) applied to the targets (optional)

# Standardise one split at a time and hand the result over to pandas.
# NOTE(review): `train` / `valid` are overwritten in place, so this cell is
# not meant to be re-run without reloading the parquet files first.
train = standardize(train, numerical_columns, means, stds)
train = train.to_pandas()
valid = standardize(valid, numerical_columns, means, stds)
valid = valid.to_pandas()


In [42]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU;
# the bare expression on the last line displays the chosen device.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

In [47]:
# Discard every row that still contains at least one missing value
# (one split at a time, so the old frame can be released before the next).
train = train.dropna(axis=0, how='any')
valid = valid.dropna(axis=0, how='any')

In [55]:
# Build the training / validation DataLoaders from the cleaned frames
BATCH_SIZE = 2048 * 4  # rows per batch
train_loader, valid_loader = create_data_loaders(
    train,
    valid,
    numerical_columns,
    target_columns,
    weight_columns=weight_columns,
    batch_size=BATCH_SIZE,
    num_workers=0,
)

In [52]:
# Instantiate the network: one input per numerical column, one output per
# target column, then move it to the selected device
model = CNN1DModel(
    n_target=len(target_columns),
    num_numerical_features=len(numerical_columns),
)
model = model.to(device)

In [53]:
 # 학습 루프 수정
num_epochs = 100
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-4,
    weight_decay=5e-4,
)

# One training pass followed by one validation pass per epoch, then a
# four-line summary of the epoch
for epoch in range(num_epochs):
    train_loss, train_r2 = train_epoch(model, train_loader, optimizer, device)
    valid_loss, valid_r2 = validate(model, valid_loader, device)

    print(
        f'Epoch {epoch+1}/{num_epochs}:',
        f'Train Loss: {train_loss:.4f}, Train R2: {train_r2:.4f}',
        f'Valid Loss: {valid_loss:.4f}, Valid R2: {valid_r2:.4f}',
        '-' * 50,
        sep='\n',
    )

Training Batches:   0%|          | 0/4212 [00:00<?, ?it/s]