In [50]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression


import warnings
warnings.filterwarnings("ignore")


# Load the merged dataset and encode the "playoff" flag numerically:
# "Y" becomes 1 and "N" becomes 0; any value outside that mapping
# (including a missing one) ends up as NaN.
df = pd.read_csv("../data/clean/merged.csv").assign(
    playoff=lambda frame: frame["playoff"].map({"Y": 1, "N": 0})
)
df.head()

Unnamed: 0,tmID,year,confID,playoff,last_year_rank,last_year_o_fga,last_year_o_fta,last_year_o_3pa,last_year_o_reb,last_year_o_asts,...,avg_team_last_year_allPF,avg_team_last_year_allFGA,avg_team_last_year_allTR,avg_team_last_year_allTRA,avg_team_last_year_allMinutes,num_players_joined,num_players_left,num_players_changed_team,award_points_coach,last_year_mean_wins_coach
0,ATL,9,EA,0.0,3.5,2079.8,640.9,502.7,1077.6,520.7,...,55.259615,166.201122,17.840144,49.144631,510.53766,14.0,9.0,23.0,0.0,1.026701
1,ATL,10,EA,1.0,7.0,2258.0,725.0,598.0,1077.0,492.0,...,54.361367,169.50192,17.682412,55.12404,526.865207,8.0,13.0,21.0,1.0,0.133333
2,ATL,11,EA,,2.0,2428.0,755.0,374.0,1259.0,547.0,...,56.60355,203.643491,12.802515,41.376479,638.306213,0.0,0.0,0.0,0.0,1.0
3,CHA,1,EA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.676768,178.505051,16.609091,49.047475,574.079798,13.0,8.0,21.0,0.0,0.0
4,CHA,2,EA,1.0,8.0,1903.0,577.0,386.0,935.0,551.0,...,70.366267,198.638224,21.541417,62.973054,686.959581,7.0,3.0,10.0,0.0,0.391304


In [None]:
def train_test_split_by_year(data, test_year):
    """Split ``data`` chronologically on its ``year`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``year`` column.
    test_year : comparable to the values in ``data['year']``
        Year held out for testing.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_set, test_set)``: rows with ``year`` strictly below
        ``test_year``, and rows with ``year`` equal to ``test_year``.
        Rows from later years are in neither frame.
    """
    years = data['year']
    earlier_rows = data.loc[years < test_year]
    held_out_rows = data.loc[years == test_year]
    return earlier_rows, held_out_rows

def evaluate_model(model, train_set_original, test_set_original, teams_per_conf=4):
    """Fit ``model`` on SMOTE-resampled training rows and pick playoff teams.

    Every column except ``tmID``, ``confID``, ``year`` and the ``playoff``
    target is used as a feature. Test teams are ranked by column 1 of
    ``model.predict_proba`` (the probability of the ``playoff == 1`` class
    when the labels are 0/1) and the ``teams_per_conf`` highest-ranked teams
    of each ``confID`` are labelled ``playoff = 1``; all others get 0.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
        Fitted in place on the resampled training data.
    train_set_original, test_set_original : pandas.DataFrame
        Must contain ``tmID``, ``confID``, ``year`` and ``playoff`` columns.
        Neither frame is modified.
    teams_per_conf : int, default 4
        Number of teams per conference labelled as playoff teams.

    Returns
    -------
    pandas.DataFrame
        Columns ``tmID``, ``confID`` and ``playoff`` (0/1), one row per
        distinct ``tmID`` of the test set, indexed like the test set.

    Raises
    ------
    ValueError
        If ``test_set_original`` has no rows.
    """
    # Fail early (before any training) when there is nothing to predict.
    if len(test_set_original) == 0:
        raise ValueError("test_set_original is empty; nothing to predict")

    # Identifier columns are not features; drop() returns new frames, so the
    # caller's data stays untouched without needing defensive copies.
    non_feature_columns = ['tmID', 'confID', 'year', 'playoff']
    X_train = train_set_original.drop(columns=non_feature_columns)
    y_train = train_set_original['playoff']
    X_test = test_set_original.drop(columns=non_feature_columns)

    # Balance the classes on the training data only.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    model.fit(X_train_resampled, y_train_resampled)

    # Only the ranking matters: the hard predictions of model.predict() were
    # always overwritten by the per-conference rule below, and rescaling the
    # probabilities cannot change their order, so neither is computed.
    results = pd.DataFrame({
        'tmID': test_set_original['tmID'],
        'confID': test_set_original['confID'],
        'predict_proba': model.predict_proba(X_test)[:, 1],
    })

    # Keep a single row per team.
    results = results.drop_duplicates(subset='tmID', keep='first')

    # A stable sort keeps tied teams in their original row order, which makes
    # the selection deterministic.
    top_teams = (
        results
        .sort_values(by='predict_proba', ascending=False, kind='mergesort')
        .groupby('confID')
        .head(teams_per_conf)
    )

    is_selected = results['tmID'].isin(top_teams['tmID'])
    return (
        results
        .assign(playoff=is_selected.astype(int))
        .drop(columns=['predict_proba'])
    )

In [52]:
def train_model(data, test_year, model):
    """Split ``data`` by year, then fit and evaluate ``model`` on that split.

    ``train_test_split_by_year(data, test_year)`` supplies the train and test
    frames, which are handed to ``evaluate_model`` together with ``model``.
    Whatever ``evaluate_model`` returns is returned unchanged.
    """
    train_part, test_part = train_test_split_by_year(data, test_year)
    return evaluate_model(model, train_part, test_part)

In [53]:
# Year to predict. It drives both the train/test split and the output file
# name, so the two can no longer drift apart.
TEST_YEAR = 11

params = {
    'max_iter': 1000
}

results = train_model(df, TEST_YEAR, LogisticRegression(**params))

# confID is not written to the predictions file.
results = results.drop(columns=['confID'])
results.to_csv(f"../data/predictions/predictions_{TEST_YEAR}.csv", index=False)

KeyError: 'predict_proba'