In [1]:
import os
import pickle
from warnings import simplefilter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping

simplefilter(action='ignore', category=FutureWarning)
sns.set_theme()

In [2]:
# Refactored preprocessing class for ablation

class Preprocessing():
    """Build a labelled, flattened training table from a play-by-play DataFrame.

    Positive samples (class 1) are the ``WINDOW_SIZE`` rows that precede the
    first play of every detected scoring run; negative samples (class 0) are
    ``WINDOW_SIZE``-row segments cut from the rows left over once those
    windows are removed. Each sample is flattened row by row, so the column
    order is: every feature of row 1, then every feature of row 2, and so on.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns listed in ``self.factors`` plus the text
        columns 'HomePlay' and 'AwayPlay'. The frame is modified in place by
        ``encoders()`` (the factor columns are replaced by integer codes).
        Window slicing uses label-based ``.loc`` arithmetic, so a default
        0..n-1 integer index is assumed.
    random_state : int, numpy Generator/RandomState or None, optional
        Forwarded to ``DataFrame.sample`` when down-sampling the negative
        class. ``None`` (default) keeps the sampling unseeded.
    """

    RUN_LENGTH = 4             # scoring plays in a row that count as a run
    WINDOW_SIZE = 10           # rows per flattened sample
    NO_RUN_SAMPLE_FRAC = 0.14  # share of no-run segments kept as negatives

    def __init__(self, data, random_state=None):
        self.data = data
        self.random_state = random_state

        # Fitted LabelEncoder per factor, filled by encoders()
        self.label_encoders = {}
        self._encoded = False

        # Feature selection
        self.factors = ['Quarter','ShotType','ShotDist','FoulType',
                'TimeoutTeam','EnterGame','LeaveGame', 'Shooter',
                'Rebounder', 'Blocker','Fouler',
                'Fouled','ReboundType','ViolationPlayer', 'ViolationType',
                'FreeThrowShooter','TurnoverPlayer','TurnoverType','TurnoverCause',
                'TurnoverCauser']

    # Label encoding
    def encoders(self):
        """Replace every factor column with integer codes and return ``self.data``.

        The encoders are fitted only once per instance. Re-fitting on the
        already encoded integers would leave the data unchanged but overwrite
        the stored encoders with trivial int -> int mappings, which is what
        happened when ``preprocess()`` was called repeatedly.
        """
        if self._encoded:
            return self.data
        for factor in self.factors:
            encoder = LabelEncoder()
            self.data[factor] = encoder.fit_transform(self.data[factor])
            self.label_encoders[factor] = encoder
            # Still published as a `<factor>_le` global so that any code
            # relying on the previous behaviour keeps working.
            globals()[f'{factor}_le'] = encoder
        self._encoded = True
        return self.data

    def _find_runs(self, team_col, opponent_col):
        """Return lists of ``RUN_LENGTH`` row labels forming runs for ``team_col``.

        A row extends the current run when its ``team_col`` text contains
        'makes'. When ``team_col`` holds no text on a row, a 'makes' in
        ``opponent_col`` resets the run. Rows whose ``team_col`` text does not
        contain 'makes' leave the run untouched.
        """
        runs = []
        current = []
        team_plays = self.data[team_col].tolist()
        opponent_plays = self.data[opponent_col].tolist()
        for idx, team_play, opponent_play in zip(self.data.index, team_plays, opponent_plays):
            # isinstance() instead of `is not np.nan`: the identity test fails
            # for None and for NaN objects other than the numpy singleton.
            if isinstance(team_play, str):
                if 'makes' in team_play:
                    current.append(idx)
            elif isinstance(opponent_play, str) and 'makes' in opponent_play:
                current.clear()
            if len(current) == self.RUN_LENGTH:
                runs.append(current.copy())
                current.clear()
        return runs

    # Creating arrays of runs
    def home_runner(self):
        """Detect runs scored through 'HomePlay'; stores and returns ``home_runs``."""
        self.home_runs = self._find_runs('HomePlay', 'AwayPlay')
        return self.home_runs

    def away_runner(self):
        """Detect runs scored through 'AwayPlay'; stores and returns ``away_runs``."""
        self.away_runs = self._find_runs('AwayPlay', 'HomePlay')
        return self.away_runs

    def all_runner(self):
        """Concatenate home runs followed by away runs into ``all_runs``."""
        self.all_runs = []
        self.all_runs.extend(self.home_runs)
        self.all_runs.extend(self.away_runs)
        return self.all_runs

    def _select_features(self, removed_feature=None):
        """Return the factor list, without ``removed_feature`` when one is given."""
        if removed_feature:
            return [f for f in self.factors if f != removed_feature]
        return self.factors

    def _flat_columns(self, features):
        """Column names of a flattened window: '<feature><row number>' ... 'class'."""
        names = [
            f'{col}{step}'
            for step in range(1, self.WINDOW_SIZE + 1)
            for col in features
        ]
        names.append('class')
        return names

    # Flattening runs
    def runs_iter(self, removed_feature=None):
        """Build ``runs_df``: one class-1 row per run, from the window before it.

        Runs that start fewer than ``WINDOW_SIZE`` rows into the data cannot
        supply a full window and are skipped. Previously they produced ragged
        rows that were only removed later by ``dropna()`` in ``final()``.
        With no usable run the result is an empty frame with the expected
        columns instead of an error.
        """
        features = self._select_features(removed_feature)
        fact_cols = self._flat_columns(features)
        expected_size = self.WINDOW_SIZE * len(features)

        rows = []
        for run in self.all_runs:
            start = run[0]
            # .loc slicing is label based and inclusive on both ends
            window = self.data.loc[start - self.WINDOW_SIZE:start - 1, features].values.ravel()
            if window.size != expected_size:
                continue
            rows.append(np.append(window, 1))
        # Single construction instead of pd.concat inside the loop
        self.runs_df = pd.DataFrame(rows, columns=fact_cols)
        return self.runs_df

    # Function to remove runs from original Dataframe
    def no_runs_preprocessing(self):
        """Drop the pre-run windows and cut the remaining rows into segments.

        For every run, the ``WINDOW_SIZE`` rows before it and its first play
        are removed. The remaining rows are divided into
        ``len // WINDOW_SIZE`` nearly equal parts (the same partition
        ``np.array_split`` produces) and only parts of exactly
        ``WINDOW_SIZE`` rows are kept. Returns an empty list when fewer than
        ``WINDOW_SIZE`` rows remain.
        """
        excluded = []
        for run in self.all_runs:
            first_play = run[0]
            excluded.extend(range(first_play - self.WINDOW_SIZE, first_play + 1))
        self.no_runs_df = self.data[~self.data.index.isin(excluded)].reset_index(drop=True)

        segments = len(self.no_runs_df) // self.WINDOW_SIZE
        if segments == 0:
            # np.array_split raises for zero sections
            self.no_runs_split = []
            return self.no_runs_split

        # Split positions rather than the frame itself: calling
        # np.array_split on a DataFrame goes through a deprecated pandas path.
        position_chunks = np.array_split(np.arange(len(self.no_runs_df)), segments)
        self.no_runs_split = [
            self.no_runs_df.iloc[chunk[0]:chunk[-1] + 1]
            for chunk in position_chunks
            if len(chunk) == self.WINDOW_SIZE
        ]
        return self.no_runs_split

    # Flattening no runs
    def no_runs_optimized(self, removed_feature=None):
        """Flatten the no-run segments into class-0 rows and down-sample them.

        Overwrites ``no_runs_df`` with a ``NO_RUN_SAMPLE_FRAC`` sample of the
        flattened segments, drawn with ``self.random_state``.
        """
        features = self._select_features(removed_feature)
        fact_cols = self._flat_columns(features)

        rows = [
            np.append(segment.loc[:, features].values.ravel(), int(0))
            for segment in self.no_runs_split
        ]
        flattened = pd.DataFrame(rows, columns=fact_cols)
        self.no_runs_df = flattened.sample(
            frac=self.NO_RUN_SAMPLE_FRAC, random_state=self.random_state
        )
        return self.no_runs_df

    # Preparing final Dataframe for training
    def final(self, removed_feature=None):
        """Assemble, scale and export the training table.

        Sets ``final_df``, ``scaler``, ``values`` (features scaled to 0-255)
        and ``labels``, and writes 'data.csv' and 'labels.csv' to the current
        working directory.
        """
        self.runs_iter(removed_feature)
        self.no_runs_optimized(removed_feature)
        self.final_df = pd.concat([self.runs_df,self.no_runs_df],ignore_index=True).dropna().astype(int)
        self.scaler = MinMaxScaler((0,255))
        self.values = pd.DataFrame(self.scaler.fit_transform(self.final_df.iloc[:,:-1]))
        self.labels = self.final_df.iloc[:,-1]
        self.values.to_csv('data.csv')
        self.labels.to_csv('labels.csv')

    # Run everything
    def preprocess(self, removed_feature=None):
        """Run the whole pipeline, optionally leaving ``removed_feature`` out."""
        self.encoders()
        self.home_runner()
        self.away_runner()
        self.all_runner()
        self.no_runs_preprocessing()
        self.final(removed_feature)


In [None]:


# Concat datasets and preprocess
SEED = 42             # re-applied before every ablation so runs are comparable
WINDOW_SIZE = 10      # rows per flattened sample produced by Preprocessing
METRIC_NAMES = ('loss', 'accuracy', 'precision', 'recall', 'f1_score')
METRIC_DECIMALS = 4   # precision kept when summarising metrics

data_dir = 'data'

# os.listdir returns entries in arbitrary order; sort so the concatenated
# row order (and therefore every row label used downstream) is reproducible.
files = sorted(name for name in os.listdir(data_dir) if name.endswith('.csv'))
if not files:
    raise FileNotFoundError(f"No .csv files found in '{data_dir}'")

# One concat over all frames instead of growing `data` inside a loop
data = pd.concat(
    [pd.read_csv(os.path.join(data_dir, name)) for name in files],
    ignore_index=True,
)

preprocessor = Preprocessing(data)


def build_model(input_shape):
    """Return the compiled two-class CNN used for every ablation run."""
    model = keras.models.Sequential([
        keras.layers.Input(shape=input_shape),
        keras.layers.Conv2D(50, (3, 3), strides=1, padding="same", activation="relu"),
        keras.layers.MaxPool2D((2, 2), strides=2, padding="same"),
        keras.layers.Conv2D(25, (3, 3), strides=1, padding='same', activation='relu'),
        keras.layers.MaxPool2D((2, 2), strides=2, padding="same"),
        keras.layers.Flatten(),
        keras.layers.Dense(units=512, activation="relu"),
        keras.layers.Dense(units=2, activation="softmax"),
    ])
    model.compile(loss="binary_crossentropy", metrics=['accuracy','precision','recall','f1_score'], optimizer='rmsprop')
    return model


def summarize_history(history, prefix=''):
    """Average each metric over the trained epochs.

    `prefix` is '' for training metrics and 'val_' for validation metrics.
    Values are rounded to METRIC_DECIMALS places; rounding to zero decimals,
    as before, collapsed every metric to 0.0 or 1.0.
    """
    return {
        name: float(np.round(np.mean(history.history[prefix + name]), METRIC_DECIMALS))
        for name in METRIC_NAMES
    }


# Initialize ablation study results
train_results = {}
val_results = {}

# Run ablation study
for feature in preprocessor.factors:
    print(f"Running ablation study by removing feature: {feature}")

    # Free the previous iteration's graph and reset every RNG (Python, NumPy,
    # TensorFlow) so the negative-class sampling, the weight initialisation
    # and the batch order are the same for each removed feature.
    keras.backend.clear_session()
    keras.utils.set_random_seed(SEED)

    # Preprocess data without the selected feature
    preprocessor.preprocess(removed_feature=feature)

    # Use the in-memory result of preprocess() rather than re-reading the CSVs
    flat = preprocessor.values.to_numpy()
    n_features = flat.shape[1] // WINDOW_SIZE

    # Each flattened row is laid out row by row (all features of row 1, then
    # row 2, ...), so it must be unpacked as (rows, features). The transpose
    # gives the (features, rows) image the model expects without mixing
    # values from different features on one axis. The trailing axis is the
    # single image channel.
    X = flat.reshape(-1, WINDOW_SIZE, n_features).transpose(0, 2, 1)
    X = X[..., np.newaxis] / 255
    y = keras.utils.to_categorical(preprocessor.labels.to_numpy(), 2)

    # validation_split takes the LAST 20% of the rows, and the table is
    # ordered class 1 first, class 0 last; shuffle once so both classes
    # appear in the validation set.
    order = np.random.default_rng(SEED).permutation(len(X))
    X, y = X[order], y[order]

    # Model training
    early_stop = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)
    model = build_model((n_features, WINDOW_SIZE, 1))
    history = model.fit(X, y, epochs=20, verbose=1, validation_split=0.2, callbacks=[early_stop])

    # Save results
    train_results[feature] = summarize_history(history)
    val_results[feature] = summarize_history(history, prefix='val_')

    # Plot results for each ablation
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

    loss_ax.plot(history.history['loss'])
    loss_ax.plot(history.history['val_loss'])
    loss_ax.set_title(f'Validation Loss (Removing {feature})')
    loss_ax.legend(['train','validation'],loc='upper right')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')

    acc_ax.plot(history.history['accuracy'])
    acc_ax.plot(history.history['val_accuracy'])
    acc_ax.set_title(f'Validation Accuracy (Removing {feature})')
    acc_ax.legend(['train','validation'],loc='upper right')
    acc_ax.set_xlabel('Epochs')
    acc_ax.set_ylabel('Accuracy')

    fig.tight_layout()
    plt.show()
    plt.close(fig)  # one figure per feature; release it once displayed

# Save pickle
with open('ablation_train_results.pkl', 'wb') as f:
    pickle.dump(train_results, f)

with open('ablation_val_results.pkl', 'wb') as f:
    pickle.dump(val_results, f)

In [None]:
# Reload the training-metric summary that the ablation cell pickled to
# 'ablation_train_results.pkl' and show it as the cell output.
# pickle.load can execute arbitrary code: only open files you wrote yourself.
with open('ablation_train_results.pkl', 'rb') as f:
    results = pickle.load(f)

# Bare expression on the last line so the notebook displays the dict
results