In [78]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier


# Load the merged dataset and recode the "playoff" labels as numbers
# ("Y" -> 1, "N" -> 0); values that are neither "Y" nor "N" become NaN.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved row index; it stays in the frame and is later used as a feature - confirm.
df = pd.read_csv("../data/clean/merged.csv").assign(
    playoff=lambda frame: frame["playoff"].map({"Y": 1, "N": 0})
)
df.head()

Unnamed: 0,year,tmID,confID,rank,playoff,o_fga,o_fta,o_3pa,o_reb,o_asts,...,avg_player_PostPF_last_3_years_avg,avg_player_PostfgAttempted_last_3_years_avg,avg_player_PostfgMade_last_3_years_avg,avg_player_PostftAttempted_last_3_years_avg,avg_player_PostftMade_last_3_years_avg,avg_player_PostthreeAttempted_last_3_years_avg,avg_player_PostthreeMade_last_3_years_avg,avg_player_PostDQ_last_3_years_avg,award_points_coach,mean_wins_coach
0,9,ATL,EA,7.0,0.0,2258.0,725.0,598.0,1077.0,492.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133333
1,10,ATL,EA,2.0,1.0,2428.0,755.0,374.0,1259.0,547.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,11,ATL,EA,-1.0,,-1.0,-1.0,-1.0,-1.0,-1.0,...,4.692308,17.0,6.730769,5.615385,4.576923,4.230769,1.730769,0.038462,0.0,1.0
3,1,CHA,EA,8.0,0.0,1903.0,577.0,386.0,935.0,551.0,...,2.469697,6.681818,2.287879,2.05303,1.719697,2.333333,0.727273,0.030303,0.0,0.333333
4,2,CHA,EA,4.0,1.0,1780.0,528.0,428.0,948.0,467.0,...,2.965278,8.277778,3.048611,2.805556,1.972222,2.534722,0.798611,0.0,0.0,1.222222


### Label Encoding

In [79]:
from sklearn.preprocessing import LabelEncoder

def encode_df(df):
    """Label-encode every object/datetime column of ``df`` in place.

    Each column whose dtype compares equal to ``'object'`` or
    ``'datetime64[ns]'`` is overwritten with the integer codes produced by a
    ``LabelEncoder``. The passed frame itself is modified.

    Returns
    -------
    tuple
        ``(df, tmID_mapping)`` where ``df`` is the same (mutated) frame and
        ``tmID_mapping`` maps each original ``tmID`` label to its integer
        code. The mapping is empty when no ``tmID`` column was encoded.
    """
    encoder = LabelEncoder()
    team_codes = {}

    for name, kind in df.dtypes.items():
        # Guard clause: leave columns that are neither object nor datetime untouched
        if not (kind == 'object' or kind == 'datetime64[ns]'):
            continue

        df[name] = encoder.fit_transform(df[name])

        # The encoder is re-fitted for every column, so capture the team
        # mapping immediately after fitting on 'tmID'
        if name == 'tmID':
            labels = encoder.classes_
            team_codes = dict(zip(labels, encoder.transform(labels)))

    return df, team_codes

# Label-encode the categorical columns and capture the team-name -> code mapping.
# encode_df mutates `df` in place and only touches object/datetime columns, so
# re-running this cell on the already-encoded frame returns an empty mapping;
# re-run the data-loading cell first.
df, tmID_mapping = encode_df(df)


# Show the mapping so encoded tmID values can be traced back to team names
print(tmID_mapping)

{'ATL': 0, 'CHA': 1, 'CHI': 2, 'CLE': 3, 'CON': 4, 'DET': 5, 'HOU': 6, 'IND': 7, 'LAS': 8, 'MIA': 9, 'MIN': 10, 'NYL': 11, 'ORL': 12, 'PHO': 13, 'POR': 14, 'SAC': 15, 'SAS': 16, 'SEA': 17, 'TUL': 18, 'UTA': 19, 'WAS': 20}


In [None]:
def train_test_split_by_year(data, test_year):
    """Split ``data`` chronologically on its ``year`` column.

    Rows whose ``year`` is strictly below ``test_year`` form the training
    set; rows whose ``year`` equals ``test_year`` form the test set. Rows
    from later years end up in neither.

    Returns
    -------
    tuple
        ``(train_set, test_set)`` as two DataFrames.
    """
    season = data['year']
    earlier_seasons = data.loc[season < test_year]
    target_season = data.loc[season == test_year]
    return earlier_seasons, target_season

def evaluate_model(model, train_set_original, test_set_original, team_mapping=None):
    """Fit ``model`` on the training rows and rank the test-season teams.

    The playoff flag in the returned frame is derived purely from the
    ranking: the 4 highest-scoring teams of each conference get 1 and every
    other team gets 0. (Previously teams outside the top 4 kept whatever
    ``model.predict`` returned, so more than 4 teams per conference could be
    flagged.)

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; column 1 of
        the ``predict_proba`` output is used as the playoff score.
    train_set_original, test_set_original : pandas.DataFrame
        Frames holding the ``playoff`` target column; the test frame must
        also hold ``tmID`` and ``confID``. All columns except ``playoff``
        are used as features.
    team_mapping : dict, optional
        Maps team name -> encoded ``tmID`` value. When omitted, the
        notebook-level ``tmID_mapping`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per team with columns ``tmID`` (team name), ``confID``,
        ``playoff`` (0/1) and ``predict_proba`` (scores rescaled to sum to 8).
    """
    if team_mapping is None:
        # Fall back to the notebook-level mapping built when encoding the data
        team_mapping = tmID_mapping

    # drop() returns new frames, so the caller's data is not modified here
    X_train = train_set_original.drop(columns=['playoff'])
    y_train = train_set_original['playoff']
    X_test = test_set_original.drop(columns=['playoff'])

    model.fit(X_train, y_train)
    playoff_scores = model.predict_proba(X_test)[:, 1]

    # Invert name -> code so encoded ids can be turned back into team names
    code_to_team = {code: team for team, code in team_mapping.items()}

    results = pd.DataFrame({
        'tmID': test_set_original['tmID'].map(code_to_team),
        'confID': test_set_original['confID'],
        'playoff': 0,  # placeholder that fixes the column position; filled in below
        'predict_proba': playoff_scores,
    })

    # Remove duplicate teams BEFORE rescaling so the exported scores really sum to 8
    results = results.drop_duplicates(subset='tmID', keep='first').copy()

    # Rescale the scores so they sum to 8; skip when the total is zero to
    # avoid a 0/0 division
    total = results['predict_proba'].sum()
    if total > 0:
        results['predict_proba'] = 8 * results['predict_proba'] / total

    # The 4 highest-scoring teams of each conference qualify
    top4 = (
        results.sort_values(by='predict_proba', ascending=False)
        .groupby('confID')
        .head(4)
    )

    # Exactly the selected teams are flagged; everyone else is 0
    results['playoff'] = results['tmID'].isin(top4['tmID']).astype(int)

    return results

In [81]:
def train_model(data, test_year, model):
    """Split ``data`` at ``test_year`` and evaluate ``model`` on that season.

    Delegates the split to ``train_test_split_by_year`` and the fitting and
    prediction to ``evaluate_model``, whose result is returned as-is.
    """
    earlier_seasons, target_season = train_test_split_by_year(data, test_year)
    return evaluate_model(model, earlier_seasons, target_season)

In [82]:
# Season to predict; drives both the train/test split and the output file name
TEST_YEAR = 11

# Gradient boosting hyper-parameters
params = {
    'n_estimators': 5000,
    'max_depth': 3,
    'min_samples_split': 2,
    'learning_rate': 0.15,
    # Fixed seed so that Restart & Run All reproduces the same predictions
    'random_state': 42,
}

results = train_model(df, TEST_YEAR, GradientBoostingClassifier(**params))

# confID is only used to pick the top teams per conference; drop it before export
results = results.drop(columns=['confID'])
results.to_csv(f"../data/predictions/predictions_{TEST_YEAR}.csv", index=False)