# Data Exploration for the Kaggle Backpack Challenge

In [1]:
import os
from dotenv import load_dotenv
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import pandas as pd

In [4]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()
kaggle_data_dir = os.getenv("KAGGLE_DATA_DIR")
# Fail here with an actionable message; otherwise a missing variable only
# surfaces later as an opaque TypeError from os.path.join(None, ...).
if kaggle_data_dir is None:
    raise RuntimeError(
        "KAGGLE_DATA_DIR is not set; define it in the environment or in a .env file."
    )

In [12]:
# Competition files live in a sub-folder of the configured Kaggle data root.
data_path = os.path.join(kaggle_data_dir, '05.02-backpack')

# Read the training CSV and discard every row that has a missing value.
df = (
    pd.read_csv(os.path.join(data_path, 'train.csv'))
    .dropna()
)
df.shape

(246686, 11)

In [6]:
class KaggleBackpackDataset(Dataset):
    """Backpack CSV exposed as ``(id, features, target)`` tensors, one item per row.

    Rows containing any missing value are dropped on load, so those rows are
    absent from the dataset (and from anything predicted from it).

    Attributes:
        categorical_vars: names of the columns that are one-hot encoded.
        df: the loaded frame after ``dropna()``.
        id: int tensor built from the first CSV column, shape ``(N,)``.
        features: float32 tensor, shape ``(N, D)``: the two positional columns
            4 and 9 followed by one one-hot block per categorical column.
        target: float32 tensor, shape ``(N, 1)``: the ``Price`` column when the
            CSV has one, otherwise zeros.

    NOTE(review): category vocabularies are derived from the file being loaded,
    so two different CSVs can yield a different one-hot width or column order.
    Confirm train and test files contain the same category values.
    """

    def __init__(self, train_csv):
        """Load ``train_csv`` (path to a labeled or unlabeled CSV) and build the tensors."""
        self.categorical_vars = ["Brand", "Material", "Size", "Laptop Compartment", "Waterproof", "Style", "Color"]
        self.df = pd.read_csv(train_csv).dropna()
        self.id = torch.tensor(self.df.iloc[:, 0].values, dtype=torch.int)

        # Columns 4 and 9 are selected by position; presumably these are the two
        # numeric columns of the competition schema -- TODO confirm against the CSV.
        feature_blocks = [torch.tensor(self.df.iloc[:, [4, 9]].values, dtype=torch.float32)]

        for cat in self.categorical_vars:
            categorical = pd.Categorical(self.df[cat])
            codes = torch.tensor(categorical.codes, dtype=torch.long)
            one_hot = F.one_hot(codes, num_classes=len(categorical.categories))
            # F.one_hot yields int64; cast explicitly instead of relying on
            # implicit type promotion when concatenating with float features.
            feature_blocks.append(one_hot.to(torch.float32))

        self.features = torch.cat(feature_blocks, dim=1)

        if "Price" in self.df.columns:
            # Ensure (N, 1) shape so targets line up with the model's (N, 1) output.
            self.target = torch.tensor(self.df["Price"].to_numpy(), dtype=torch.float32).unsqueeze(1)
        else:
            # Unlabeled file: placeholder zeros with the same (N, 1) shape as the
            # labeled case, so consumers see one consistent target shape.
            self.target = torch.zeros((self.features.shape[0], 1), dtype=torch.float32)

    def __len__(self):
        """Number of rows kept after dropping incomplete ones."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(id, features, target)`` tensors for row ``idx``."""
        return self.id[idx], self.features[idx], self.target[idx]

In [7]:
class BackpackPricePredictor(nn.Module):
    """Fully connected regressor: ``input_dim`` -> 64 -> 32 -> 1.

    ReLU follows each of the two hidden layers; the single output unit is left
    linear because the network predicts an unbounded real value.
    """

    def __init__(self, input_dim=27):
        """Create the three linear layers; ``input_dim`` is the feature width per sample."""
        super().__init__()
        wide, narrow = 64, 32
        self.fc1 = nn.Linear(input_dim, wide)
        self.fc2 = nn.Linear(wide, narrow)
        # Regression head: one output value per sample.
        self.fc3 = nn.Linear(narrow, 1)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` predictions."""
        first_hidden = F.relu(self.fc1(x))
        second_hidden = F.relu(self.fc2(first_hidden))
        # No activation on the output layer.
        return self.fc3(second_hidden)

In [10]:
# Load dataset
dataset = KaggleBackpackDataset(os.path.join(data_path, 'train.csv'))
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize model.
# Size the input layer from the data instead of relying on the model's default
# of 27, which only works while the one-hot encoding yields exactly 27 columns.
model = BackpackPricePredictor(input_dim=dataset.features.shape[1])
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
epochs = 20
model.train()  # Be explicit about the mode in case the cell is re-run after evaluation
for epoch in range(epochs):
    epoch_loss = 0
    # Sample ids are yielded by the dataset but are not needed for training.
    for _, batch_features, batch_targets in dataloader:
        optimizer.zero_grad()
        predictions = model(batch_features)
        loss = criterion(predictions, batch_targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Reported value is the mean of the per-batch mean losses.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(dataloader):.4f}")

Epoch 1/20, Loss: 1555.4445
Epoch 2/20, Loss: 1529.7090
Epoch 3/20, Loss: 1524.6694
Epoch 4/20, Loss: 1523.6303
Epoch 5/20, Loss: 1523.6691
Epoch 6/20, Loss: 1524.1299
Epoch 7/20, Loss: 1521.9901
Epoch 8/20, Loss: 1522.8204
Epoch 9/20, Loss: 1522.1984
Epoch 10/20, Loss: 1521.9411
Epoch 11/20, Loss: 1521.8343
Epoch 12/20, Loss: 1521.0439
Epoch 13/20, Loss: 1521.1049
Epoch 14/20, Loss: 1521.3615
Epoch 15/20, Loss: 1520.4470
Epoch 16/20, Loss: 1520.0286
Epoch 17/20, Loss: 1520.3987
Epoch 18/20, Loss: 1519.7741
Epoch 19/20, Loss: 1519.7583
Epoch 20/20, Loss: 1519.5202


In [11]:
# Test on a sample
test_id, test_sample, test_target = dataset[0]
test_sample = test_sample.unsqueeze(0)  # Add batch dimension

predicted_price = model(test_sample).item()
print(f"Predicted Price: {predicted_price}, Actual Price: {test_target.item()}")

Predicted Price: 78.79701232910156, Actual Price: 112.15875244140625


In [83]:
# Define the validation function
def validate_model(model, test_csv, batch_size=32):
    """Run ``model`` over the rows of ``test_csv`` and collect its predictions.

    Despite the name, no loss or metric is computed: the function only
    produces predictions. The CSV is loaded through ``KaggleBackpackDataset``,
    so rows with missing values are dropped and get no prediction.

    Args:
        model: trained module mapping a feature batch to one value per row.
            It is switched to eval mode and left in that mode.
        test_csv: path of the CSV file to score.
        batch_size: number of rows scored per forward pass.

    Returns:
        pandas.DataFrame with columns ``id`` and ``Price``, one row per scored
        sample, in file order. Nothing is written to disk; callers that need
        a file can call ``to_csv`` on the result.
    """
    test_dataset = KaggleBackpackDataset(test_csv)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model.eval()  # Set model to evaluation mode

    id_batches = []
    price_batches = []
    with torch.no_grad():  # Disable gradient computation
        # The dataset's third item (target) is not used when predicting.
        for batch_ids, batch_features, _ in test_loader:
            id_batches.append(batch_ids)
            # Flatten (batch, 1) outputs to (batch,) so each row maps to one scalar.
            price_batches.append(model(batch_features).reshape(-1))

    if not id_batches:
        # Nothing to score: keep the expected schema rather than failing in torch.cat.
        return pd.DataFrame(columns=["id", "Price"])

    # Convert once per column instead of calling .item() for every row.
    return pd.DataFrame(
        {
            "id": torch.cat(id_batches).tolist(),
            "Price": torch.cat(price_batches).tolist(),
        }
    )
# Score the CSV at ../data/test.csv with the `model` defined earlier in this
# notebook, then preview the first rows of the resulting id/Price table.
df_predictions = validate_model(model=model, test_csv="../data/test.csv")
df_predictions.head()

Unnamed: 0,id,Price
0,300000,82.199867
1,300001,81.78006
2,300002,81.446991
3,300003,81.971878
4,300005,79.9795
