In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import torch
from torch.utils.data import TensorDataset, DataLoader
import os

# Define a function to reduce DataFrame memory footprint
def reduce_mem_usage(df):
    """ Iterate through all the columns of a DataFrame and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage of dataframe is {start_mem:.2f} MB')
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.api.types.is_float_dtype(col_type):
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
            elif pd.api.types.is_integer_dtype(col_type):
                if c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Memory usage after optimization is: {end_mem:.2f} MB')
    print(f'Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%')
    
    return df

# Load and optimize datasets
game_df = reduce_mem_usage(pd.read_csv(os.path.join('csv','game.csv')))
other_stats_df = reduce_mem_usage(pd.read_csv(os.path.join('csv','other_stats.csv')))
line_score_df = reduce_mem_usage(pd.read_csv(os.path.join('csv','line_score.csv')))


Memory usage of dataframe is 27.57 MB
Memory usage after optimization is: 12.87 MB
Decreased by 53.3%
Memory usage of dataframe is 5.61 MB
Memory usage after optimization is: 2.49 MB
Decreased by 55.6%
Memory usage of dataframe is 19.05 MB
Memory usage after optimization is: 8.73 MB
Decreased by 54.2%


In [2]:
# # Load datasets
# game_df = pd.read_csv(os.path.join('csv','game.csv'))
# player_df = pd.read_csv(os.path.join('csv','player.csv'))
# other_stats_df = pd.read_csv(os.path.join('csv','other_stats.csv'))
# line_score_df = pd.read_csv(os.path.join('csv','line_score.csv'))
# Merge datasets based on 'game_id', 'team_id_home', and 'team_id_away'
merged_df = pd.merge(game_df, other_stats_df, on=["game_id", "team_id_home", "team_id_away"])
merged_df = pd.merge(merged_df, line_score_df, on=["game_id", "team_id_home", "team_id_away"])

# Check and clean duplicated columns
merged_df = merged_df.loc[:,~merged_df.columns.duplicated()]


In [3]:
# Assuming 'pts_home_x' refers to the game file and 'pts_home_y' refers to the line score file, and they should be identical
assert (merged_df['pts_home_x'] == merged_df['pts_home_y']).all(), "Point columns differ and need inspection."

# Drop the redundant column if they are identical
merged_df.drop('pts_home_y', axis=1, inplace=True)
merged_df.rename(columns={'pts_home_x': 'pts_home'}, inplace=True)

In [4]:
# Fill numerical columns with the mean
numerical_cols = merged_df.select_dtypes(include=[np.number]).columns
merged_df[numerical_cols] = merged_df[numerical_cols].fillna(merged_df[numerical_cols].mean())

# Fill categorical columns with the mode
categorical_cols = merged_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    merged_df[col] = merged_df[col].fillna(merged_df[col].mode()[0])

In [5]:
# Define columns to be encoded and scaled
categorical_features = ['team_abbreviation_home', 'team_abbreviation_away', 'season_type']
numerical_features = list(numerical_cols)
numerical_features.remove('pts_home')  # Remove target variable from scaling

# Create a transformer for scaling and encoding
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Configure pipeline with preprocessing and a dummy estimator
pipe = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit and transform the data
X_transformed = pipe.fit_transform(merged_df.drop('pts_home', axis=1))
y = merged_df['pts_home'].values


In [None]:
# Convert arrays to PyTorch tensors
X_tensor = torch.tensor(X_transformed.astype(np.float32))
y_tensor = torch.tensor(y.astype(np.float32))

# Create a TensorDataset
dataset = TensorDataset(X_tensor, y_tensor)

# Split the dataset into training (80%), validation (10%), and test (10%) sets
train_dataset, temp_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
valid_dataset, test_dataset = train_test_split(temp_dataset, test_size=0.5, random_state=42)

# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)