🧠 Step 1: Imports

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder

📂 Step 2: Folder Path and CSV List

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
folder_path = "./nfl-big-data-bowl-2026-prediction/train"
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

🧩 Step 3: Custom Dataset Class

In [4]:
df = pd.read_csv("nfl-big-data-bowl-2026-prediction/train/input_2023_w01.csv")
df.head()

Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x,y,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y
0,2023090700,101,False,54527,1,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.33,36.94,0.09,0.39,322.4,238.24,21,63.259998,-0.22
1,2023090700,101,False,54527,2,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.33,36.94,0.04,0.61,200.89,236.05,21,63.259998,-0.22
2,2023090700,101,False,54527,3,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.33,36.93,0.12,0.73,147.55,240.6,21,63.259998,-0.22
3,2023090700,101,False,54527,4,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.35,36.92,0.23,0.81,131.4,244.25,21,63.259998,-0.22
4,2023090700,101,False,54527,5,right,42,Bryan Cook,6-1,210,1999-09-07,FS,Defense,Defensive Coverage,52.37,36.9,0.35,0.82,123.26,244.25,21,63.259998,-0.22


In [5]:
len(csv_files)

36

In [6]:
class NFLCsvDataset(Dataset):
    def __init__(self, folder_path):
        self.folder_path = folder_path

        # -----------------------------
        # 1️⃣ Only include input CSV files
        # -----------------------------
        self.csv_files = sorted([
            f for f in os.listdir(folder_path)
            if f.startswith("input_2023_w") and f.endswith(".csv")
        ])
        if not self.csv_files:
            raise FileNotFoundError(f"No input_2023_w*.csv files found in {folder_path}")

        self.data = []

        for file in self.csv_files:
            file_path = os.path.join(folder_path, file)
            print(f"\n📂 Loading: {file_path}")
            df = pd.read_csv(file_path)
            print(f"➡️  Original shape: {df.shape}")

            # ==============================================
            # 🧠 STEP 1: Define numeric and categorical columns
            # ==============================================
            numeric_cols = [
                "game_id", "play_id", "nfl_id", "frame_id",
                "absolute_yardline_number", "player_weight",
                "x", "y", "s", "a", "o", "dir",
                "num_frames_output", "ball_land_x", "ball_land_y"
            ]

            cat_cols = [
                "player_position", "player_side", 
                "player_role", "play_direction"
            ]

            # ----------------------------------------------
            # 🧩 STEP 2: Convert numeric columns safely
            # ----------------------------------------------
            existing_numeric_cols = [c for c in numeric_cols if c in df.columns]
            for col in existing_numeric_cols:
                df[col] = pd.to_numeric(df[col], errors='coerce')

            # ----------------------------------------------
            # 🧩 STEP 3: One-hot encode categorical columns
            # ----------------------------------------------
            for col in cat_cols:
                if col in df.columns:
                    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
                    transformed = ohe.fit_transform(df[[col]])
                    ohe_cols = [f"{col}_{cls}" for cls in ohe.categories_[0]]
                    df_ohe = pd.DataFrame(transformed, columns=ohe_cols, index=df.index)
                    df = pd.concat([df.drop(columns=[col]), df_ohe], axis=1)

            # ----------------------------------------------
            # 🧩 STEP 4: Boolean and special conversions
            # ----------------------------------------------
            if "player_to_predict" in df.columns:
                df["player_to_predict"] = (
                    df["player_to_predict"].astype(str)
                    .map({"True": 1, "False": 0, "1": 1, "0": 0})
                    .fillna(0)
                    .astype(int)
                )

            if "player_height" in df.columns:
                def height_to_inches(h):
                    try:
                        ft, inch = h.split('-')
                        return int(ft) * 12 + int(inch)
                    except Exception:
                        return np.nan
                df["player_height_in"] = df["player_height"].apply(height_to_inches)

            # ----------------------------------------------
            # 🧩 STEP 5: Keep only numeric columns
            # ----------------------------------------------
            df = df.select_dtypes(include=[np.number])
            df = df.dropna(how="all")

            if df.empty:
                print(f"⚠️ Skipped {file} (no usable numeric data).")
                continue

            print(f"✅ Processed shape: {df.shape}")
            self.data.append(df)

        # ==============================================
        # 🧩 STEP 6: Combine all CSVs
        # ==============================================
        if not self.data:
            raise ValueError("No valid data could be loaded from folder.")

        self.data = pd.concat(self.data, ignore_index=True)
        print(f"\n✅ Combined dataset shape: {self.data.shape}")
        self.data.to_csv('prepro_input.csv', index=False)
        print("Single File Made Successfully.")

    

    #     # ==============================================
    #     # 🧩 STEP 6: Define features and targets
    #     # ==============================================
    #     target_cols = ["x", "y"]  

    #     feature_cols = [i for i in range(self.data.shape[1]) if i not in target_cols]
    #     X_np = self.data.iloc[:, feature_cols].values
    #     y_np = self.data.iloc[:, target_cols].values

    #     # ==============================================
    #     # 🧩 STEP 7: Convert to Tensors
    #     # ==============================================
    #     self.X = torch.tensor(X_np, dtype=torch.float32)
    #     self.y = torch.tensor(y_np, dtype=torch.float32)

    # def __len__(self):
    #     return len(self.X)

    # def __getitem__(self, idx):
    #     return self.X[idx], self.y[idx]


In [7]:
dataset = NFLCsvDataset(folder_path)


📂 Loading: ./nfl-big-data-bowl-2026-prediction/train/input_2023_w01.csv
➡️  Original shape: (285714, 23)
✅ Processed shape: (285714, 40)

📂 Loading: ./nfl-big-data-bowl-2026-prediction/train/input_2023_w02.csv
➡️  Original shape: (288586, 23)
✅ Processed shape: (288586, 41)

📂 Loading: ./nfl-big-data-bowl-2026-prediction/train/input_2023_w03.csv
➡️  Original shape: (297757, 23)
✅ Processed shape: (297757, 40)

📂 Loading: ./nfl-big-data-bowl-2026-prediction/train/input_2023_w04.csv
➡️  Original shape: (272475, 23)
✅ Processed shape: (272475, 40)

📂 Loading: ./nfl-big-data-bowl-2026-prediction/train/input_2023_w05.csv
➡️  Original shape: (254779, 23)
✅ Processed shape: (254779, 40)

📂 Loading: ./nfl-big-data-bowl-2026-prediction/train/input_2023_w06.csv
➡️  Original shape: (270676, 23)
✅ Processed shape: (270676, 40)

📂 Loading: ./nfl-big-data-bowl-2026-prediction/train/input_2023_w07.csv
➡️  Original shape: (233597, 23)
✅ Processed shape: (233597, 40)

📂 Loading: ./nfl-big-data-bowl-20

In [None]:
input_df = pd.read_csv("prepro_input.csv")

In [None]:
# Convert to a NumPy array
data_np = df.values  # or df.to_numpy()

# Convert to PyTorch tensor
data_tensor = torch.tensor(data_np, dtype=torch.float32)

# Check tensor info
print("Tensor shape:", data_tensor.shape)
print(data_tensor[:5])  # show first 5 rows

TypeError: object of type 'NFLCsvDataset' has no len()

⚙️ Step 4: Create Dataset and DataLoader

In [None]:
# # Create dataset
# dataset = NFLCsvDataset(folder_path)

# # Create DataLoader for batching
# dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# # Example: iterate through one batch
# for X_batch, y_batch in dataloader:
#     print("Batch X:", X_batch.shape)
#     print("Batch y:", y_batch.shape)
#     break


🔥 Step 5: Use in a Regression Model (Example)

In [None]:
# import torch.nn as nn

# class RegressionModel(nn.Module):
#     def __init__(self, input_dim):
#         super().__init__()
#         self.net = nn.Sequential(
#             nn.Linear(input_dim, 64),
#             nn.ReLU(),
#             nn.Linear(64, 1)
#         )

#     def forward(self, x):
#         return self.net(x)

# # Initialize model
# input_dim = dataset.X.shape[1]
# model = RegressionModel(input_dim)
# print(model)


🧮 Step 6: Training Loop (optional)

In [None]:
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# criterion = nn.MSELoss()

# for epoch in range(5):
#     for X_batch, y_batch in dataloader:
#         optimizer.zero_grad()
#         outputs = model(X_batch)
#         loss = criterion(outputs, y_batch)
#         loss.backward()
#         optimizer.step()
#     print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
