In [42]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.metrics import accuracy_score,precision_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

In [43]:
# Load the merged dataset and recode the target in one chain.
# NOTE(review): the displayed head contains an "Unnamed: 0" column, which looks
# like a saved row index being read back as a regular column (and therefore
# ending up among the model features) - confirm whether index_col=0 is wanted.
#df = pd.read_csv("../data/clean/teams.csv") # TESTING
df = (
    pd.read_csv("../data/clean/merged.csv")
    # "playoff" becomes binary (Y: 1, N: 0); any other value maps to NaN
    .assign(playoff=lambda frame: frame["playoff"].map({"Y": 1, "N": 0}))
)
df.head()

Unnamed: 0,year,tmID,confID,rank,playoff,firstRound,semis,finals,o_fgm,o_fga,...,avg_player_weight,avg_player_efficiency,avg_player_award_points,avg_player_last_3_years_sum_awards,avg_player_all_time_sum_awards,avg_player_age,last_year_avg_player_efficiency,stint_coach,award_points_coach,mean_wins_coach
0,9,ATL,EA,7,0,0,0,0,895,2258,...,173.333333,0.243333,0.0,3.5,3.5,24.833333,0.284286,0,0.0,0.133333
1,10,ATL,EA,2,1,L,0,0,1089,2428,...,173.583333,0.238333,0.0,3.5,3.5,25.25,0.256364,0,1.0,1.0
2,1,CHA,EA,8,0,0,0,0,812,1903,...,169.181818,0.253636,0.0,0.0,0.0,26.636364,0.242222,0,0.0,0.333333
3,2,CHA,EA,4,1,W,W,L,746,1780,...,175.166667,0.231667,0.0,0.0,0.0,26.166667,0.282857,0,0.0,1.222222
4,3,CHA,EA,2,1,L,0,0,770,1790,...,166.666667,0.209333,0.0,0.0,0.0,25.133333,0.220909,0,0.0,1.125


### Label Encoding

In [44]:
from sklearn.preprocessing import LabelEncoder

def encode_df(df):
    """Label-encode every object/datetime column and return the team-code lookup.

    Every column whose dtype compares equal to ``'object'`` or
    ``'datetime64[ns]'`` is replaced by the integer codes produced by
    ``LabelEncoder.fit_transform``. The encoding is applied to a copy, so the
    frame passed in is not modified; use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    tuple[pandas.DataFrame, dict]
        The encoded copy of ``df`` and a ``{original tmID label: code}``
        dictionary with plain ``int`` codes. The dictionary is empty when
        there is no ``'tmID'`` column of an encodable dtype (for example when
        the frame has already been encoded).
    """
    encoded = df.copy()
    le = LabelEncoder()
    tmID_mapping = {}

    for col, col_type in encoded.dtypes.items():
        if col_type == 'object' or col_type == 'datetime64[ns]':
            encoded[col] = le.fit_transform(encoded[col])
            if col == 'tmID':
                # A label's code is its position in le.classes_; store plain
                # ints so the mapping prints the same on every numpy version.
                tmID_mapping = {label: int(code) for code, label in enumerate(le.classes_)}

    return encoded, tmID_mapping

# Encode the dataframe and keep the tmID label -> code mapping.
# `df` is rebound to the encoded frame, so re-running this cell encodes an
# already-numeric frame and returns an empty mapping (encode_df only fills the
# mapping while 'tmID' is still an object column).
df, tmID_mapping = encode_df(df)


# Not the last expression of the cell, so this preview is not displayed.
df.head( 10)
print("tmID Mapping:", tmID_mapping)


tmID Mapping: {'ATL': 0, 'CHA': 1, 'CHI': 2, 'CLE': 3, 'CON': 4, 'DET': 5, 'HOU': 6, 'IND': 7, 'LAS': 8, 'MIA': 9, 'MIN': 10, 'NYL': 11, 'ORL': 12, 'PHO': 13, 'POR': 14, 'SAC': 15, 'SAS': 16, 'SEA': 17, 'UTA': 18, 'WAS': 19}


In [45]:
def calculate_class_distribution_similarity(df, y_train, y_test):
    """Print the class balance of the full, train and test data and compare them.

    The returned "similarity score" is the sum of absolute differences between
    the train and test class proportions (an L1 distance): 0.0 means identical
    class balance, larger values mean the splits differ more.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; only its ``'playoff'`` column is read.
    y_train, y_test : pandas.Series
        Target values of the training and test splits.

    Returns
    -------
    float
        Sum over all classes of ``|train proportion - test proportion|``.
    """
    # Step 1: Calculate class distribution in the original dataset
    original_class_distribution = df['playoff'].value_counts(normalize=True)
    print("Original Class Distribution:")
    print(original_class_distribution)

    # Step 2: Calculate class distribution in the training and test sets
    train_class_distribution = y_train.value_counts(normalize=True)
    test_class_distribution = y_test.value_counts(normalize=True)

    print("\nTraining Set Class Distribution:")
    print(train_class_distribution)
    print("\nTest Set Class Distribution:")
    print(test_class_distribution)

    # Step 3: Compare class distributions.
    # A class that is absent from one split must count as proportion 0 there.
    # A plain `train - test` aligns on the index and yields NaN for such a
    # class, which .sum() then skips, understating the difference.
    class_distribution_similarity = (
        train_class_distribution
        .sub(test_class_distribution, fill_value=0)
        .abs()
        .sum()
    )
    print("\nClass Distribution Similarity Score:", class_distribution_similarity)

    return class_distribution_similarity

# Chronological train/test split; incomplete rows are dropped (no imputation is done)
def train_model(df, year):
    """Split ``df`` into the seasons before ``year`` (train) and season ``year`` (test).

    Despite its name, this function does not fit a model; it only prepares the
    feature/target splits.

    Side effect: rows containing any missing value are removed from ``df``
    *in place*, so the caller's frame shrinks as well.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where the ``X_*`` frames hold
        every column except ``"playoff"`` and the ``y_*`` series hold the
        ``"playoff"`` column.
    """
    # In-place on purpose: the caller's frame must lose the same rows
    df.dropna(inplace=True)

    season = df['year']
    past_rows, current_rows = df[season < year], df[season == year]

    target = "playoff"
    X_train, y_train = past_rows.drop(columns=target), past_rows[target]
    X_test, y_test = current_rows.drop(columns=target), current_rows[target]

    print(f"\nTrain/Test size for year={year}:", X_train.shape, X_test.shape, y_train.shape, y_test.shape)

    return X_train, y_train, X_test, y_test

# Build the per-season prediction table: the test-season rows plus the predicted
# class ('prediction') and the predicted probability ('prediction_proba').
def create_predictions(df, year, model, distribution_similarity=True):
    """Fit ``model`` on the seasons before ``year`` and score season ``year``.

    Prints AUC, accuracy and precision for the scored season.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded dataset with ``'year'`` and ``'playoff'`` columns. It is copied
        first, so the caller's frame is not modified.
    year : int
        Season to predict; all rows with a smaller ``'year'`` are used for fitting.
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is fitted
        in place, so the same instance is re-fitted on every call.
    distribution_similarity : bool, default True
        When true, also print the train/test class-balance comparison.

    Returns
    -------
    pandas.DataFrame
        The rows of season ``year`` that were scored, with two extra columns:
        ``'prediction'`` (int) and ``'prediction_proba'`` (second column of
        ``predict_proba``).
    """
    # Work on a copy: train_model drops incomplete rows from the frame it receives.
    new_df = df.copy()

    X_train, y_train, X_test, y_test = train_model(new_df, year)

    # NOTE(review): the features are every column except 'playoff'. The dataset
    # preview shows same-row columns such as rank / firstRound / semis / finals;
    # if those describe the season being predicted they leak the target - confirm.

    # Check if the class distribution is similar between the training and test sets
    if distribution_similarity:
        calculate_class_distribution_similarity(new_df, y_train, y_test)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)  # Predicted class labels
    y_pred_proba = model.predict_proba(X_test)  # One probability column per class

    # Column 1 is the second class of the fitted model; presumably playoff == 1
    # (assumes both classes occur in the training seasons).
    positive_proba = y_pred_proba[:, 1]

    # Calculate the AUC
    auc = roc_auc_score(y_test, positive_proba)
    print(f"\nAUC for year={year}:", auc)

    # Calculate Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for year={year}: {accuracy}")

    # Calculate Precision
    precision = precision_score(y_test, y_pred)
    print(f"Precision for year={year}: {precision}")

    # Take exactly the rows that were scored, selected by their index labels
    # (assumes unique index labels), and copy them. This keeps predictions
    # aligned with their rows without relying on train_model's in-place row
    # removal, and avoids assigning into a filtered slice.
    season_df = new_df.loc[X_test.index].copy()
    season_df['prediction'] = pd.Series(y_pred, index=X_test.index).astype(int)
    season_df['prediction_proba'] = positive_proba

    return season_df



# Classifier passed to create_predictions; max_iter is raised to 10000.
model = LogisticRegression(max_iter=10000)

# Start one season after the earliest one: train_model fits on rows with
# 'year' < the predicted year, so the first season would have no training rows.
min_year = df['year'].min() + 1
max_year = df['year'].max()

print("Min year:", min_year, "\nMax year:", max_year)


def create_final_predictions(df, year):
    """Pick the four most likely playoff teams of each conference for ``year``.

    Rows of the requested season are ranked by ``'prediction_proba'``
    (highest first), reduced to one row per ``'tmID'``, and the first four
    rows of every ``'confID'`` group are kept.

    Returns
    -------
    pandas.DataFrame
        The selected rows without the ``'year'``, ``'playoff'`` and
        ``'confID'`` columns.
    """
    season_rows = df[df['year'] == year]
    ranked = season_rows.sort_values(by='prediction_proba', ascending=False)

    # Ensure each team appears once, keeping its highest-probability row
    one_row_per_team = ranked.drop_duplicates(subset='tmID')

    # Best 4 teams per conference
    top_four = one_row_per_team.groupby('confID').head(4)

    # Bookkeeping columns are not part of the final output
    return top_four.drop(columns=['year', 'playoff', 'confID'])

# For every season from min_year to max_year: fit on the earlier seasons,
# score that season, keep the top 4 teams per conference, and write one CSV.
# The same `model` instance is passed (and re-fitted) on every iteration.
for i in range(min_year, max_year + 1):
    new_df = create_predictions(df, i, model, distribution_similarity=False)
    new_df = create_final_predictions(new_df, i)
    # Keep only the whitelisted columns. create_final_predictions has already
    # dropped 'year', 'playoff' and 'confID', so in practice only 'tmID',
    # 'prediction' and 'prediction_proba' remain.
    new_df.drop(new_df.columns.difference(['tmID', 'confID', 'year', 'playoff', 'prediction', 'prediction_proba']), axis=1, inplace=True)
    new_df.to_csv(f"../data/predictions/predictions_{i}.csv", index=False)



Min year: 2 
Max year: 10

Train/Test size for year=2: (17, 60) (18, 60) (17,) (18,)

AUC for year=2: 0.95
Accuracy for year=2: 0.8888888888888888
Precision for year=2: 0.8333333333333334

Train/Test size for year=3: (35, 60) (19, 60) (35,) (19,)

AUC for year=3: 0.9204545454545454
Accuracy for year=3: 0.8421052631578947
Precision for year=3: 0.7777777777777778

Train/Test size for year=4: (54, 60) (16, 60) (54,) (16,)

AUC for year=4: 0.8095238095238095
Accuracy for year=4: 0.6875
Precision for year=4: 0.7

Train/Test size for year=5: (70, 60) (17, 60) (70,) (17,)

AUC for year=5: 0.9090909090909091
Accuracy for year=5: 0.7647058823529411
Precision for year=5: 0.8888888888888888

Train/Test size for year=6: (87, 60) (15, 60) (87,) (15,)

AUC for year=6: 0.8518518518518519
Accuracy for year=6: 0.7333333333333333
Precision for year=6: 0.7777777777777778

Train/Test size for year=7: (102, 60) (15, 60) (102,) (15,)

AUC for year=7: 0.9464285714285715
Accuracy for year=7: 0.8
Precision for