# Kaggle Backpack Prediction Challenge

Competition page: https://www.kaggle.com/competitions/playground-series-s5e2

In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline    import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import os 

In [25]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on device: {device}")

# Load the training data from a path relative to the current working directory.
file_path = os.path.join("dataset", "train.csv")
print(
    f"Attempting to load dataset from: {file_path}"
)
raw_df = pd.read_csv(file_path)

# Clean-up order matters: `id` has to be removed *before* drop_duplicates().
# With its default subset=None, drop_duplicates() compares every column, so as
# long as a per-row identifier is still present no two rows can compare equal
# and the call removes nothing.
# NOTE(review): assumes `id` is a unique row identifier — confirm against the CSV.
df = raw_df.dropna().drop(columns="id").drop_duplicates()

# One-hot encode the object-dtype columns; only the listed columns are expanded,
# every other column is passed through unchanged.
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols)
df.info()

Training on device: cpu
Attempting to load dataset from: dataset\train.csv
<class 'pandas.core.frame.DataFrame'>
Index: 246686 entries, 0 to 299999
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Compartments            246686 non-null  float64
 1   Weight Capacity (kg)    246686 non-null  float64
 2   Price                   246686 non-null  float64
 3   Brand_Adidas            246686 non-null  bool   
 4   Brand_Jansport          246686 non-null  bool   
 5   Brand_Nike              246686 non-null  bool   
 6   Brand_Puma              246686 non-null  bool   
 7   Brand_Under Armour      246686 non-null  bool   
 8   Material_Canvas         246686 non-null  bool   
 9   Material_Leather        246686 non-null  bool   
 10  Material_Nylon          246686 non-null  bool   
 11  Material_Polyester      246686 non-null  bool   
 12  Size_Large              246686 non-null  bool   
 13  Size

In [26]:
# Separate the regression target (Price) from the feature columns.
X = df.drop("Price", axis=1)
y = df["Price"]

# Single seed shared by the data split and the weight initialisation so that a
# fresh "Restart & Run All" reproduces the same split AND the same starting model.
SEED = 42

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, shuffle=True
)

# torch.nn.Linear draws its initial weights from torch's global RNG; seed it
# before the layers are constructed, otherwise every kernel restart starts
# training from a different random model even though the split is fixed.
torch.manual_seed(SEED)

# Fully connected regressor: n_features -> 64 -> 32 -> 1, ReLU between layers.
# (Despite its name, `pipeline` is a torch module, not an sklearn Pipeline.)
pipeline = torch.nn.Sequential(
    torch.nn.Linear(X_train.shape[1], 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, 1)
).to(device)


In [None]:
# 1. Training split as float32 NumPy arrays. The target is reshaped into a
#    single column so it lines up with the one-unit output layer of the network.
X_train_arr = X_train.to_numpy(dtype=np.float32)
y_train_arr = y_train.to_numpy(dtype=np.float32).reshape(-1, 1)

# Wrap the arrays as torch tensors that live on the selected device.
X_train_tensor = torch.as_tensor(X_train_arr, device=device)
y_train_tensor = torch.as_tensor(y_train_arr, device=device)

# 2. Dataset and DataLoader: shuffled mini-batches of 32 samples.
train_ds = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

# 3. Loss function (mean squared error, for regression) and optimizer.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(pipeline.parameters(), lr=1e-3)

# 4. Training loop
n_epochs = 50
for epoch in range(1, n_epochs + 1):
    pipeline.train()
    epoch_loss = 0.0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()                # 4.1 reset gradients from the previous step
        y_pred = pipeline(X_batch)           # 4.2 forward pass
        loss = criterion(y_pred, y_batch)    # 4.3 compute the loss
        loss.backward()                      # 4.4 backward pass
        optimizer.step()                     # 4.5 update the weights

        # The criterion yields a per-batch mean; weight it by the batch size so
        # the epoch value below is a per-sample average even if the final batch
        # is smaller than the others.
        epoch_loss += X_batch.shape[0] * loss.item()

    epoch_loss = epoch_loss / len(train_ds)
    # Report the first epoch and then every fifth one.
    if epoch == 1 or epoch % 5 == 0:
        print(f"Epoch {epoch:2d}/{n_epochs} — Loss: {epoch_loss:.4f}")

# 5. Save the trained weights (optional)
torch.save(pipeline.state_dict(), "model.pth")

Epoch  1/50 — Loss: 1596.4153
