In [227]:
from pathlib import Path

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

In [228]:
# Load the merged, cleaned dataset and recode the "playoff" target from the
# letters Y/N to the integers 1/0 in a single chained expression.
# (For quick experiments the smaller "../data/clean/teams.csv" file was used
# in place of merged.csv.)
df = (
    pd.read_csv("../data/clean/merged.csv")
    .assign(playoff=lambda frame: frame["playoff"].map({"Y": 1, "N": 0}))
)

### Label Encoding

In [229]:
from sklearn.preprocessing import LabelEncoder

def encode_df(df):
    """Integer-encode every object or datetime64[ns] column of ``df``.

    Each matching column is overwritten with the output of
    ``LabelEncoder.fit_transform`` for that column. The frame is modified
    in place and the same object is returned for convenience.
    """
    encoder = LabelEncoder()

    # Decide which columns to encode from the dtypes as they are on entry.
    columns_to_encode = [
        name
        for name, dtype in df.dtypes.items()
        if dtype == 'object' or dtype == 'datetime64[ns]'
    ]

    # The encoder is refitted for every column, so codes are per-column.
    for name in columns_to_encode:
        df[name] = encoder.fit_transform(df[name])

    return df

# Integer-encode the object/datetime columns of the loaded frame (encode_df
# modifies it in place and returns it), then preview the first rows.
df = encode_df(df)
df.head()


Unnamed: 0,year,award_points,last_3_years_sum,all_time_sum,tmID,pl_efficiency,confID,playoff,tm_efficiency,avg_height,avg_weight,avg_pl_efficiency
0,1,0.0,0,0,6,11.97,1,1,2837.48,72.333333,156.666667,25.1625
1,2,6.0,6,6,6,19.88,1,1,-5048.57,72.5,162.5,18.025
2,3,0.0,6,6,6,16.6,1,1,-2174.01,73.5,157.0,16.5075
3,4,0.0,6,6,6,18.13,1,1,34146.05,73.5,157.0,14.904
4,6,0.0,0,6,6,15.1,1,1,-9824.61,72.333333,160.333333,15.17


In [230]:
def calculate_class_distribution_similarity(df, y_train, y_test):
    """Print the class balance of the data and score the train/test mismatch.

    Three normalised class distributions are printed: the ``playoff`` column
    of ``df``, ``y_train`` and ``y_test``. The returned score is the sum,
    over all classes, of the absolute difference between the train and test
    proportions. 0.0 means both splits have the same balance; 2.0 means they
    share no class at all, so lower values mean more similar splits.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; must contain a ``playoff`` column.
    y_train, y_test : pandas.Series
        Target labels of the training and test splits.

    Returns
    -------
    float
        Sum of absolute per-class proportion differences.
    """
    # Step 1: class distribution in the original dataset
    original_class_distribution = df['playoff'].value_counts(normalize=True)
    print("Original Class Distribution:")
    print(original_class_distribution)

    # Step 2: class distribution in the training and test sets
    train_class_distribution = y_train.value_counts(normalize=True)
    test_class_distribution = y_test.value_counts(normalize=True)

    print("\nTraining Set Class Distribution:")
    print(train_class_distribution)
    print("\nTest Set Class Distribution:")
    print(test_class_distribution)

    # Step 3: compare the two splits. A class that is absent from one split
    # must count as proportion 0 there; a plain `train - test` would align
    # the indexes, produce NaN for that class and `.sum()` would then skip
    # it, understating the mismatch.
    class_distribution_similarity = (
        train_class_distribution
        .sub(test_class_distribution, fill_value=0)
        .abs()
        .sum()
    )
    print("\nClass Distribution Similarity Score:", class_distribution_similarity)

    return class_distribution_similarity

# Chronological train/test split; incomplete rows are dropped, not imputed.
def train_model(df, year):
    """Split ``df`` into earlier seasons (train) and season ``year`` (test).

    Despite its name this function does not fit a model. Rows containing any
    missing value are first removed from ``df`` *in place*, so the caller's
    frame is modified. The ``playoff`` column is the target; every other
    column is returned as a feature.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where the training part holds
        rows with ``year`` strictly below the requested one and the test part
        holds rows with exactly that ``year``.
    """
    # Side effect the caller can observe: ``df`` itself loses its NaN rows.
    df.dropna(inplace=True)

    def _features_and_target(frame):
        # Everything except the target column is treated as a feature.
        return frame.drop(columns="playoff"), frame["playoff"]

    X_train, y_train = _features_and_target(df.loc[df['year'] < year])
    X_test, y_test = _features_and_target(df.loc[df['year'] == year])

    print(f"\nTrain/Test size for year={year}:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    return X_train, y_train, X_test, y_test

# Build the prediction table for one season: every input column of that
# season plus `prediction` (hard label) and `prediction_proba` (score).
def create_predictions(df, year, model, distribution_similarity=True):
    """Fit ``model`` on the seasons before ``year`` and score season ``year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset with at least ``year`` and ``playoff`` columns. It is
        not modified; the function works on its own copy.
    year : int
        Season to predict. Rows with a smaller ``year`` are used for training.
    model : estimator
        Object providing ``fit``, ``predict`` and ``predict_proba``. It is
        refitted by this call.
    distribution_similarity : bool, default True
        When true, also print the class balance of the train/test split.

    Returns
    -------
    pandas.DataFrame
        The complete (NaN-free) rows of season ``year`` with two extra
        columns: ``prediction`` (cast to int) and ``prediction_proba``
        (second column of ``predict_proba``, i.e. class 1 for 0/1 labels).
    """
    # Private, NaN-free copy. Dropping incomplete rows here, instead of
    # relying on train_model removing them from its argument, guarantees that
    # the season rows selected below line up one-to-one with X_test.
    new_df = df.dropna().copy()

    X_train, y_train, X_test, y_test = train_model(new_df, year)

    # Optionally report how similar the class balance of the two splits is
    if distribution_similarity:
        calculate_class_distribution_similarity(new_df, y_train, y_test)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)  # Predicted labels
    y_pred_proba = model.predict_proba(X_test)  # Per-class probabilities

    # Calculate the AUC from the score of the second class column
    auc = roc_auc_score(y_test, y_pred_proba[:, 1])
    print(f"\nAUC for year={year}:", auc)

    # Take an explicit copy of the season's rows before adding columns, so
    # the assignments never target a slice of another frame.
    season_df = new_df.loc[new_df['year'] == year].copy()
    season_df['prediction'] = y_pred
    season_df['prediction_proba'] = y_pred_proba[:, 1]

    # Convert the predictions to integers
    season_df['prediction'] = season_df['prediction'].astype(int)

    return season_df



# A single classifier instance, refitted for every season that is predicted.
model = LogisticRegression(max_iter=10000)

# Predictions start one season after the earliest one in the data and run
# through the latest season.
min_year, max_year = df['year'].min() + 1, df['year'].max()

print("Min year:", min_year, "\nMax year:", max_year)


def create_final_predictions(df, year):
    """Pick the four highest-scoring teams of each conference for ``year``.

    Rows of season ``year`` are ranked by ``prediction_proba`` (highest
    first), reduced to the best-ranked row per ``tmID``, and the first four
    rows of every ``confID`` group are kept. The ``year``, ``playoff`` and
    ``confID`` columns are removed from the result.

    The input frame is left untouched; a new DataFrame is returned with the
    remaining columns in their original order and rows ordered by descending
    ``prediction_proba``.
    """
    season_rows = df.loc[df['year'] == year]
    ranked = season_rows.sort_values(by='prediction_proba', ascending=False)

    # Keep only the best-ranked row of each team so no team is picked twice
    one_row_per_team = ranked.drop_duplicates(subset='tmID')

    # Best 4 teams of each conference
    top_four = one_row_per_team.groupby('confID').head(4)

    # Non-mutating drop: `top_four` is derived from a filtered frame, so an
    # in-place drop on it can trigger pandas' SettingWithCopyWarning.
    return top_four.drop(columns=['year', 'playoff', 'confID'])

# Walk forward one season at a time: fit on all earlier seasons, keep the top
# four teams per conference and write one CSV per predicted season.
predictions_dir = Path("../data/predictions")
# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before the first write.
predictions_dir.mkdir(parents=True, exist_ok=True)

# Columns allowed in the exported files.
# NOTE(review): create_final_predictions already removes 'year', 'playoff'
# and 'confID', so only tmID / prediction / prediction_proba are actually
# written. Confirm whether the other three were meant to be exported.
output_columns = ['tmID', 'confID', 'year', 'playoff', 'prediction', 'prediction_proba']

for i in range(min_year, max_year + 1):
    new_df = create_predictions(df, i, model, distribution_similarity=False)
    new_df = create_final_predictions(new_df, i)
    # Drop every column that is not in the allow-list (non-mutating)
    new_df = new_df.drop(columns=new_df.columns.difference(output_columns))
    new_df.to_csv(predictions_dir / f"predictions_{i}.csv", index=False)



Min year: 2 
Max year: 10

Train/Test size for year=2: (21, 11) (26, 11) (21,) (26,)

AUC for year=2: 0.5

Train/Test size for year=3: (47, 11) (34, 11) (47,) (34,)

AUC for year=3: 0.9792387543252594

Train/Test size for year=4: (81, 11) (36, 11) (81,) (36,)

AUC for year=4: 0.4307692307692308

Train/Test size for year=5: (117, 11) (35, 11) (117,) (35,)

AUC for year=5: 0.5641025641025641

Train/Test size for year=6: (152, 11) (37, 11) (152,) (37,)

AUC for year=6: 0.9761904761904762

Train/Test size for year=7: (189, 11) (34, 11) (189,) (34,)

AUC for year=7: 0.5178571428571428

Train/Test size for year=8: (223, 11) (33, 11) (223,) (33,)

AUC for year=8: 0.6942148760330578

Train/Test size for year=9: (256, 11) (34, 11) (256,) (34,)

AUC for year=9: 0.49242424242424243

Train/Test size for year=10: (290, 11) (38, 11) (290,) (38,)

AUC for year=10: 0.9078341013824885
