In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset
import os

In [4]:
game_df = pd.read_csv(os.path.join('csv','game.csv'))
# player_df = pd.read_csv('player.csv')
# other_stats_df = pd.read_csv('other_stats.csv')
game_features = ['season_id', 'team_id_home', 'team_abbreviation_home', 'pts_home', 'team_id_away', 'team_abbreviation_away', 'pts_away']
game_df = game_df[game_features]
game_df.fillna(game_df.mean(), inplace=True)


  game_df.fillna(game_df.mean(), inplace=True)


In [5]:
encoder = LabelEncoder()
game_df['team_abbreviation_home'] = encoder.fit_transform(game_df['team_abbreviation_home'])
game_df['team_abbreviation_away'] = encoder.fit_transform(game_df['team_abbreviation_away'])


In [6]:
scaler = StandardScaler()
game_df[['pts_home', 'pts_away']] = scaler.fit_transform(game_df[['pts_home', 'pts_away']])


In [7]:
train_df, test_df = train_test_split(game_df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2


In [8]:
class SportsDataset(Dataset):
    def __init__(self, data):
        self.X = data[['team_abbreviation_home', 'team_abbreviation_away', 'pts_home', 'pts_away']].values
        self.y = data['season_id'].values  # This should be the target variable you are interested in

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.int64)


In [9]:
train_dataset = SportsDataset(train_df)
val_dataset = SportsDataset(val_df)
test_dataset = SportsDataset(test_df)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


In [16]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the MLP model
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        out = self.layer1(x)
        out = self.relu(out)
        out = self.layer2(out)
        return out

# Assuming the inputs from game features are all numeric after encoding and scaling
input_size = len(train_dataset[0][0])  # This should match the number of features
hidden_size = 50  # Example size, can be adjusted
num_classes = len(torch.unique(train_dataset[:][1]))  # Adjust based on your target

model = MLP(input_size, hidden_size, num_classes)



In [18]:
from sklearn.preprocessing import LabelEncoder

# Assuming `season_id` is your target and exists in your train, val, and test sets
all_labels = np.concatenate([train_df['season_id'], val_df['season_id'], test_df['season_id']])
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

train_df['season_id'] = label_encoder.transform(train_df['season_id'])
val_df['season_id'] = label_encoder.transform(val_df['season_id'])
test_df['season_id'] = label_encoder.transform(test_df['season_id'])


In [19]:
num_classes = len(label_encoder.classes_)  # Adjust this based on remapped labels
model = MLP(input_size, hidden_size, num_classes)


In [20]:
# Checking the data loader output
for data, target in train_loader:
    print("Data shape:", data.shape, "Target shape:", target.shape)
    print("Target sample:", target[:5])
    break


Data shape: torch.Size([64, 4]) Target shape: torch.Size([64])
Target sample: tensor([22003, 22008, 21995, 21995, 31963])


In [22]:
def train_model(model, train_loader, val_loader, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
        
        # Evaluate validation accuracy after each epoch
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for data, target in val_loader:
                output = model(data)
                _, predicted = torch.max(output.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()
            print(f'Epoch {epoch+1}: Validation Accuracy: {100 * correct / total}%')


In [17]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Use CrossEntropyLoss for classification problems
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, val_loader, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for data, target in train_loader:
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')

        # Validation phase
        model.eval()
        with torch.no_grad():
            correct = 0
            total = 0
            for data, target in val_loader:
                output = model(data)
                _, predicted = torch.max(output.data, 1)
                total += target.size(0)
                correct += (predicted == target).sum().item()

            print(f'Validation Accuracy: {100 * correct / total}%')

# Set the number of epochs based on how training progresses, start small to prevent long crashes
num_epochs = 5
train_model(model, train_loader, val_loader, num_epochs)


IndexError: Target 22003 is out of bounds.

BASE MODEL CONSTRUCTION:

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the model architecture
class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # Input layer to hidden layer
        self.relu = nn.ReLU()                          # Activation function
        self.fc2 = nn.Linear(hidden_size, num_classes) # Hidden layer to output layer

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Model Hyperparameters
input_size = 4  # Based on the number of features used from the dataset
hidden_size = 64  # A moderate size for hidden layer
num_classes = len(game_df['season_id'].unique())  # Based on unique season IDs

model = SimpleMLP(input_size, hidden_size, num_classes)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()  # Combines softmax
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with a learning rate of 0.001

# Training Loop
num_epochs = 50
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(features)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# Validate the model
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    correct = 0
    total = 0
    for features, labels in val_loader:
        outputs = model(features)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print(f'Validation Accuracy of the model on the validation set: {100 * correct / total}%')




IndexError: Target 22009 is out of bounds.

IndexError: Target 21982 is out of bounds.

Ver 2

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# class SimpleMLP(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(SimpleMLP, self).__init__()
#         self.fc1 = nn.Linear(input_dim, hidden_dim)  # Input to hidden layer
#         self.fc2 = nn.Linear(hidden_dim, output_dim)  # Hidden to output layer
    
#     def forward(self, x):
#         x = F.relu(self.fc1(x))  # Activation function for hidden layer
#         x = self.fc2(x)  # No activation for output layer, assuming regression or raw output needed
#         return x


In [10]:
# Parameters
input_dim = 4  # Adjust based on the number of input features
hidden_dim = 32  # Modest number of hidden nodes
output_dim = 1  # Adjust based on your specific prediction task (regression or classification)


# Model initialization
model = SimpleMLP(input_dim, hidden_dim, output_dim)

# Loss and Optimizer
criterion = nn.MSELoss()  # Mean Squared Error Loss for regression tasks
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer


In [20]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50):
    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        for batch_idx, (features, targets) in enumerate(train_loader):
            features = features.float()  # Ensure features are in float
            targets = targets.long()  # Convert targets to float if using MSE or similar

            optimizer.zero_grad()  # Clear gradients for this training step
            outputs = model(features)  # Forward pass

            # Ensure outputs and targets are the correct shape
            outputs = outputs.squeeze()  # Remove any extra dimensions from outputs
            loss = criterion(outputs, targets)  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Apply gradients
            
            if batch_idx % 100 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
        
        # Validation loss
        model.eval()  # Set the model to evaluation mode
        with torch.no_grad():
            valid_loss = sum(criterion(model(features.float()).squeeze(), targets.float()) for features, targets in val_loader) / len(val_loader)
        print(f'Validation Loss after Epoch [{epoch+1}/{epochs}]: {valid_loss:.4f}')


In [None]:
# # Training loop
# def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50):
#     for epoch in range(epochs):
#         model.train()  # Set the model to training mode
#         for batch_idx, (features, targets) in enumerate(train_loader):
#             optimizer.zero_grad()  # Clear gradients for this training step
#             outputs = model(features)  # Forward pass
#             loss = criterion(outputs, targets)  # Compute loss
#             loss.backward()  # Backpropagation
#             optimizer.step()  # Apply gradients
            
#             if batch_idx % 100 == 0:
#                 print(f'Epoch [{epoch+1}/{epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
        
#         # Validation loss
#         model.eval()  # Set the model to evaluation mode
#         with torch.no_grad():
#             valid_loss = sum(criterion(model(features), targets) for features, targets in val_loader) / len(val_loader)
#         print(f'Validation Loss after Epoch [{epoch+1}/{epochs}]: {valid_loss:.4f}')

# # Call to train the model
# train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50)


ver 2

In [11]:
class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # Input layer to hidden layer
        self.relu = nn.ReLU()                         # Activation function
        self.fc2 = nn.Linear(hidden_size, num_classes) # Hidden layer to output layer

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out


In [12]:
input_size = 4  # Number of input features
hidden_size = 16  # Number of hidden units
num_classes = 10  # Example: number of season types or classifications

model = SimpleMLP(input_size, hidden_size, num_classes)


In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


In [18]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()  # Set the model to training mode
    for epoch in range(num_epochs):
        for i, (inputs, labels) in enumerate(train_loader):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (i+1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


In [None]:
#train_model(model, train_loader, criterion, optimizer, epochs=10)
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=10)


In [None]:
class EnhancedMLP(nn.Module):
    def __init__(self, input_size, num_classes):
        super(EnhancedMLP, self).__init__()
        self.layer1 = nn.Linear(input_size, 128)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(128, 64)
        self.relu2 = nn.ReLU()
        self.layer3 = nn.Linear(64, 32)
        self.relu3 = nn.ReLU()
        self.fc = nn.Linear(32, num_classes)

    def forward(self, x):
        out = self.relu1(self.layer1(x))
        out = self.relu2(self.layer2(out))
        out = self.relu3(self.layer3(out))
        out = self.fc(out)
        return out


In [None]:
input_size = 4  # Adjust this based on the number of features your dataset has
num_classes = 10  # Adjust based on the number of output classes

model = EnhancedMLP(input_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [11]:
def train_and_validate(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}')

        # Validation phase
        model.eval()
        total = 0
        correct = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        print(f'Validation Accuracy: {100 * correct / total:.2f}%')

    return model


In [None]:
trained_model = train_and_validate(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)


VER 2

In [None]:
# Define columns to load for each dataframe to minimize memory usage
# game_cols = ['game_id', 'team_id_home', 'team_id_away', 'team_abbreviation_home', 'team_abbreviation_away', 'pts_home', 'season_type', 'pts_away']
# player_cols = ['id', 'full_name']  # Adjust based on actual usage needs
# other_stats_cols = ['game_id', 'team_id_home', 'team_id_away', 'pts_paint_home', 'pts_paint_away', 'total_turnovers_home', 'total_turnovers_away']
# line_score_cols = ['game_id', 'team_id_home', 'team_id_away', 'pts_home', 'pts_away']
# game_features = ['season_id', 'team_id_home', 'team_abbreviation_home', 'pts_home', 'team_id_away', 'team_abbreviation_away', 'pts_away']
# game_df = game_df[game_features]


# # Load datasets with selected columns
# game_df = pd.read_csv('game.csv', usecols=game_cols)
# player_df = pd.read_csv('player.csv', usecols=player_cols)
# other_stats_df = pd.read_csv('other_stats.csv', usecols=other_stats_cols)
# line_score_df = pd.read_csv('line_score.csv', usecols=line_score_cols)
# Load datasets
#game_df = pd.read_csv(os.path.join('csv','game.csv'), usecols=game_cols)
# player_df = pd.read_csv(os.path.join('csv','player.csv'), usecols=player_cols)
# other_stats_df = pd.read_csv(os.path.join('csv','other_stats.csv'), usecols=other_stats_cols)
# line_score_df = pd.read_csv(os.path.join('csv','line_score.csv'), usecols=line_score_cols)

# # Merge datasets based on 'game_id' and 'team_id_home'/'team_id_away'
# merged_df = pd.merge(game_df, other_stats_df, on=["game_id", "team_id_home", "team_id_away"])
# merged_df = pd.merge(merged_df, line_score_df, on=["game_id", "team_id_home", "team_id_away"])


In [None]:
# # Assuming 'pts_home_x' refers to the game file and 'pts_home_y' refers to the line score file, and they should be identical
# assert (merged_df['pts_home_x'] == merged_df['pts_home_y']).all(), "Point columns differ and need inspection."

# # Drop the redundant column if they are identical
# merged_df.drop('pts_home_y', axis=1, inplace=True)
# merged_df.rename(columns={'pts_home_x': 'pts_home'}, inplace=True)

In [None]:
# Fill numerical columns with the mean
numerical_cols = merged_df.select_dtypes(include=[np.number]).columns
merged_df[numerical_cols] = merged_df[numerical_cols].fillna(merged_df[numerical_cols].mean())

# Fill categorical columns with the mode
categorical_cols = merged_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    merged_df[col] = merged_df[col].fillna(merged_df[col].mode()[0])


In [None]:
# # Define columns to be encoded and scaled
# categorical_features = ['team_abbreviation_home', 'team_abbreviation_away', 'season_type']
# numerical_features = list(numerical_cols)
# numerical_features.remove('pts_home')  # Remove target variable from scaling

# # Create a transformer for scaling and encoding
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_features),
#         ('cat', OneHotEncoder(), categorical_features)
#     ])

# # Configure pipeline with preprocessing and a dummy estimator
# pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# # Fit and transform the data
# X_transformed = pipe.fit_transform(merged_df.drop('pts_home', axis=1))
# y = merged_df['pts_home'].values


In [None]:
# Define features for encoding and scaling
categorical_features = ['team_abbreviation_home', 'team_abbreviation_away', 'season_type']
numerical_features = [col for col in numerical_cols if col != 'pts_home']  # exclude target variable from features

# Create the transformer and pipeline for handling categorical and numerical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=True), categorical_features),  # Maintain sparse format to save memory
    ], 
    sparse_threshold=0.3  # This ensures the output will be sparse if less than 30% of the data is nonzero
)

pipe = Pipeline(steps=[('preprocessor', preprocessor)])
X_transformed = pipe.fit_transform(merged_df.drop('pts_home', axis=1))
y = merged_df['pts_home'].values


In [None]:
# Ensure the data is in dense format if necessary and convert to PyTorch tensors
if isinstance(X_transformed, np.ndarray):
    X_tensor = torch.tensor(X_transformed.astype(np.float32))
else:
    # Only convert to dense if it is necessary (sparse matrices can be handled directly depending on the model)
    X_tensor = torch.tensor(X_transformed.toarray().astype(np.float32))

y_tensor = torch.tensor(y.astype(np.float32))

# Create a TensorDataset
dataset = TensorDataset(X_tensor, y_tensor)

# Split the dataset into training, validation, and test sets
train_dataset, temp_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
valid_dataset, test_dataset = train_test_split(temp_dataset, test_size=0.5, random_state=42)

# Create DataLoaders with a smaller batch size
batch_size = 32  # Reduced batch size from 64 to 32 to manage memory usage better
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# # Convert the sparse matrix to a dense matrix/array before converting to PyTorch tensors
# X_dense = X_transformed.todense() if hasattr(X_transformed, 'todense') else X_transformed
# X_tensor = torch.tensor(X_dense.astype(np.float32))
# y_tensor = torch.tensor(y.astype(np.float32))

# # Create a TensorDataset
# dataset = TensorDataset(X_tensor, y_tensor)

# # Split the dataset into training (80%), validation (10%), and test (10%) sets
# train_dataset, temp_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
# valid_dataset, test_dataset = train_test_split(temp_dataset, test_size=0.5, random_state=42)

# # Create DataLoaders
# batch_size = 64
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Convert arrays to PyTorch tensors
X_tensor = torch.tensor(X_transformed.astype(np.float32))
y_tensor = torch.tensor(y.astype(np.float32))

# Create a TensorDataset
dataset = TensorDataset(X_tensor, y_tensor)

# Split the dataset into training (80%), validation (10%), and test (10%) sets
train_dataset, temp_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
valid_dataset, test_dataset = train_test_split(temp_dataset, test_size=0.5, random_state=42)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
