In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import xgboost as xgb  # XGBoost model

# Model input columns; the same names are pulled from the four-factors file below.
features = ['eFG%', 'TOV%', 'ORB%', 'FT_Rate']

# Read the raw CSV inputs (elo_df is loaded here but not used in the cells shown).
team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')
standings_df = pd.read_csv('NBA_Standings_Ranked_Classes.csv')
elo_df = pd.read_csv('elo_ratings_per_year.csv')
four_factors_df = pd.read_csv('team_year_avg_four_factors.csv')

# Attach the four-factor columns to every (Year, Team) row of the team stats.
# A left join keeps all team-stat rows; unmatched rows get NaN in the new columns.
team_stats_df = team_stats_df.merge(
    four_factors_df[['Year', 'Team'] + features],
    how='left',
    on=['Year', 'Team'],
)

# Add the standings columns the same way, then keep seasons from 2004 onward.
merged_data = team_stats_df.merge(standings_df, how='left', on=['Year', 'Team'])
filtered_data = merged_data.loc[merged_data['Year'] >= 2004]


  team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')


In [2]:
def train_and_evaluate_yearly(conference_data, start_year, end_year, model,
                              feature_cols=None, target_col='ranking_class',
                              test_year=None):
    """Fit ``model`` on a window of seasons and score it on one held-out season.

    Training rows are those with ``start_year <= Year < end_year``; test rows
    are those with ``Year == test_year``.

    Parameters
    ----------
    conference_data : pandas.DataFrame
        Must contain a ``Year`` column, the feature columns and ``target_col``.
    start_year : int
        First season of the training window (inclusive).
    end_year : int
        Season at which the training window stops (exclusive).
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is refitted on
        every call, so the same instance can be passed repeatedly.
    feature_cols : list of str, optional
        Columns used as model inputs. Defaults to the notebook-level
        ``features`` list.
    target_col : str, optional
        Column holding the class label. Defaults to ``'ranking_class'``.
    test_year : int, optional
        Season to evaluate on. Defaults to ``end_year + 1``.
        NOTE(review): with that default, season ``end_year`` is in neither the
        training nor the test split (training stops before it, testing starts
        after it). Pass ``test_year=end_year`` if the intent is to predict the
        season immediately following the training window.

    Returns
    -------
    float
        Value returned by ``accuracy_score`` for the test season.

    Raises
    ------
    ValueError
        If the training window or the test season selects no rows.
    """
    if feature_cols is None:
        feature_cols = features  # fall back to the notebook-level feature list
    if test_year is None:
        test_year = end_year + 1

    years = conference_data['Year']
    train_data = conference_data[(years >= start_year) & (years < end_year)]
    test_data = conference_data[years == test_year]

    # Fail early with the offending years instead of an opaque estimator error.
    if train_data.empty:
        raise ValueError(
            f"No training rows with {start_year} <= Year < {end_year}"
        )
    if test_data.empty:
        raise ValueError(f"No test rows with Year == {test_year}")

    model.fit(train_data[feature_cols], train_data[target_col])
    y_pred = model.predict(test_data[feature_cols])

    return accuracy_score(test_data[target_col], y_pred)


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Split by conference once; these subsets do not depend on the loop variables.
eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

# Store results
results_summary = []

# For each window size, train on seasons in [year - window_size, year).
# train_and_evaluate_yearly, called with its defaults, scores on season year + 1.
for window_size in [1, 2]:
    for year in range(2008, 2023):
        # Train and evaluate for East
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)

        # Train and evaluate for West
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)

        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "RandomForest",
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Average the accuracies per window size. The accuracy columns are selected
# *before* mean() so the string "Model" column is never aggregated.
RF_FF = results_df.groupby('Window_Size')[['Eastern_Accuracy', 'Western_Accuracy']].mean()
print(RF_FF)


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.372927          0.385587
2                    0.386297          0.393362


  RF_FF = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [6]:
from sklearn.linear_model import LogisticRegression

# Initialize the model
model = LogisticRegression(max_iter=1000, random_state=42)

# Split by conference once; these subsets do not depend on the loop variables.
eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

# Store results
results_summary = []

# For each window size, train on seasons in [year - window_size, year).
# train_and_evaluate_yearly, called with its defaults, scores on season year + 1.
for window_size in [1, 2]:
    for year in range(2008, 2023):
        # Train and evaluate for East
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)

        # Train and evaluate for West
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)

        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "LogisticRegression",  # label now matches the estimator in this cell
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Average the accuracies per window size. The accuracy columns are selected
# *before* mean() so the string "Model" column is never aggregated.
LR_FF = results_df.groupby('Window_Size')[['Eastern_Accuracy', 'Western_Accuracy']].mean()
print(LR_FF)


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.379668          0.362559
2                    0.401851          0.398052


  LR_FF = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [7]:
from sklearn.svm import SVC

# Initialize the model
model = SVC(random_state=42)

# Split by conference once; these subsets do not depend on the loop variables.
eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

# Store results
results_summary = []

# For each window size, train on seasons in [year - window_size, year).
# train_and_evaluate_yearly, called with its defaults, scores on season year + 1.
for window_size in [1, 2]:
    for year in range(2008, 2023):
        # Train and evaluate for East
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)

        # Train and evaluate for West
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)

        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "SVC",  # label now matches the estimator in this cell
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Average the accuracies per window size. The accuracy columns are selected
# *before* mean() so the string "Model" column is never aggregated.
SVC_FF = results_df.groupby('Window_Size')[['Eastern_Accuracy', 'Western_Accuracy']].mean()
print(SVC_FF)

             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.407282          0.416493
2                    0.434205          0.434289


  SVC_FF = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [8]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the model
model = GradientBoostingClassifier(random_state=42)

# Split by conference once; these subsets do not depend on the loop variables.
eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

# Store results
results_summary = []

# For each window size, train on seasons in [year - window_size, year).
# train_and_evaluate_yearly, called with its defaults, scores on season year + 1.
for window_size in [1, 2]:
    for year in range(2008, 2023):
        # Train and evaluate for East
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)

        # Train and evaluate for West
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)

        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "GradientBoosting",  # label now matches the estimator in this cell
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Average the accuracies per window size. The accuracy columns are selected
# *before* mean() so the string "Model" column is never aggregated.
GB_FF = results_df.groupby('Window_Size')[['Eastern_Accuracy', 'Western_Accuracy']].mean()
print(GB_FF)


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.429442          0.425273
2                    0.417175          0.415546


  GB_FF = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [9]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the model
model = KNeighborsClassifier()

# Split by conference once; these subsets do not depend on the loop variables.
eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

# Store results
results_summary = []

# For each window size, train on seasons in [year - window_size, year).
# train_and_evaluate_yearly, called with its defaults, scores on season year + 1.
for window_size in [1, 2]:
    for year in range(2008, 2023):
        # Train and evaluate for East
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)

        # Train and evaluate for West
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)

        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "KNeighbors",  # label now matches the estimator in this cell
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Average the accuracies per window size. The accuracy columns are selected
# *before* mean() so the string "Model" column is never aggregated.
KNN_FF = results_df.groupby('Window_Size')[['Eastern_Accuracy', 'Western_Accuracy']].mean()

print(KNN_FF)


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.434523          0.371994
2                    0.416464          0.371676


  KNN_FF = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [10]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
model = DecisionTreeClassifier(random_state=42)

# Split by conference once; these subsets do not depend on the loop variables.
eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

# Store results
results_summary = []

# For each window size, train on seasons in [year - window_size, year).
# train_and_evaluate_yearly, called with its defaults, scores on season year + 1.
for window_size in [1, 2]:
    for year in range(2008, 2023):
        # Train and evaluate for East
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)

        # Train and evaluate for West
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)

        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "DecisionTree",  # label now matches the estimator in this cell
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Average the accuracies per window size. The accuracy columns are selected
# *before* mean() so the string "Model" column is never aggregated.
DT_FF = results_df.groupby('Window_Size')[['Eastern_Accuracy', 'Western_Accuracy']].mean()
print(DT_FF)

             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.332845          0.367710
2                    0.355053          0.339991


  DT_FF = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [11]:
# Show every per-model accuracy table, each prefixed with its short label.
# Each "<label> <table>" entry is rendered exactly as print(label, table) would
# render it, and the entries are newline-joined into a single print call.
print("\n".join(
    f"{tag} {table}"
    for tag, table in (
        ('LR', LR_FF),
        ('RF', RF_FF),
        ('SVC', SVC_FF),
        ('GB', GB_FF),
        ('KNN', KNN_FF),
        ('DT', DT_FF),
    )
))



LR              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.379668          0.362559
2                    0.401851          0.398052
RF              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.372927          0.385587
2                    0.386297          0.393362
SVC              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.407282          0.416493
2                    0.434205          0.434289
GB              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.429442          0.425273
2                    0.417175          0.415546
KNN              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.434523          0.371994
2                    0.416464          0.371676
DT              Eastern

In [12]:
# Average the two conference accuracies into one figure per window size.
# The source columns are named explicitly so the result does not depend on
# which other columns the frame holds; in particular, re-running this cell does
# not fold an existing 'Combined_Accuracy' column back into the mean.
conference_cols = ['Eastern_Accuracy', 'Western_Accuracy']

for summary in (LR_FF, RF_FF, SVC_FF, GB_FF, KNN_FF, DT_FF):
    summary['Combined_Accuracy'] = summary[conference_cols].mean(axis=1)

# Print the result (same order as before: LR, RF, SVC, GB, KNN, DT)
for summary in (LR_FF, RF_FF, SVC_FF, GB_FF, KNN_FF, DT_FF):
    print(summary[['Combined_Accuracy']])

             Combined_Accuracy
Window_Size                   
1                     0.371113
2                     0.399952
             Combined_Accuracy
Window_Size                   
1                     0.379257
2                     0.389829
             Combined_Accuracy
Window_Size                   
1                     0.411887
2                     0.434247
             Combined_Accuracy
Window_Size                   
1                     0.427357
2                     0.416361
             Combined_Accuracy
Window_Size                   
1                     0.403258
2                     0.394070
             Combined_Accuracy
Window_Size                   
1                     0.350277
2                     0.347522
