In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score

team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')
standings_df = pd.read_csv('NBA_Standings_Ranked_Classes.csv')
elo_df = pd.read_csv('elo_ratings_per_year.csv')
four_factors_df = pd.read_csv('team_year_avg_four_factors.csv')

# Merge the Elo ratings into team stats dataset
team_stats_df = pd.merge(team_stats_df, elo_df[['Year', 'Team', 'Elo Rating']], how='left', on=['Year', 'Team'])

# Merge the new features into team stats dataset
team_stats_df = pd.merge(team_stats_df, four_factors_df[['Year', 'Team', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate']], how='left', on=['Year', 'Team'])

# Merge the team stats with the updated standings data
merged_data = pd.merge(team_stats_df, standings_df, how='left', on=['Year', 'Team'])

# Ensure all necessary columns are available before creating new features
required_columns = ['ORtg', 'DRtg', 'W', 'L']
missing_columns = [col for col in required_columns if col not in merged_data.columns]

if missing_columns:
    print(f"Missing columns for feature creation: {missing_columns}")
else:
    #Creating new features
    merged_data['Net_Rating'] = merged_data['ORtg'] - merged_data['DRtg']
    merged_data['Win_Loss_Ratio'] = merged_data['W'] / merged_data['L']

# Filter the data by year range
filtered_data = merged_data[merged_data['Year'] >= 2004]

# Select the relevant features for training, including new ones if created
features = ['Elo Rating', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate', 'Net_Rating', 'Win_Loss_Ratio']

# Check the first few rows to confirm the merge worked correctly
print(filtered_data.head())


  team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')


   Year       game_id                   Team   MP  FG  FGA    FGp  3P  3PA  \
0  2004  3.040102e+10   New Orleans Pelicans  240  35   83  0.422   3   12   
1  2004  3.040102e+10        Toronto Raptors  240  26   69  0.377   3   17   
2  2004  3.040102e+10  Golden State Warriors  240  31   77  0.403   8   21   
3  2004  3.040102e+10     Washington Wizards  240  41   92  0.446   4   10   
4  2004  3.040102e+10         Indiana Pacers  240  38   87  0.437   8   24   

     3Pp  ...   FT_Rate  Rank   W   L   W/L%           Division  \
0  0.250  ...  0.225261     5  41  41  0.500   Central Division   
1  0.176  ...  0.195909    11  33  49  0.402   Central Division   
2  0.381  ...  0.229168    11  37  45  0.451   Pacific Division   
3  0.400  ...  0.243118    13  25  57  0.305  Atlantic Division   
4  0.333  ...  0.248384     1  61  21  0.744   Central Division   

           Conference  ranking_class  Net_Rating  Win_Loss_Ratio  
0  Eastern Conference      Upper Mid        13.7        1.000

In [2]:
# Split the data by conference
eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']


In [3]:
def train_and_evaluate_yearly(conference_data, start_year, end_year, model):
    # Train on data from start_year to end_year-1 and test on end_year plu 1
    train_data = conference_data[(conference_data['Year'] >= start_year) & (conference_data['Year'] < end_year)]
    test_data = conference_data[conference_data['Year'] == (end_year + 1)]
    
    X_train = train_data[features]
    y_train = train_data['ranking_class']

    X_test = test_data[features]
    y_test = test_data['ranking_class']

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Calculate and return the accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy


In [33]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
results_summary = []

# Iterate over the years using 1 and 2 years of data to predict the next year
for window_size in [1, 2]:
    for year in range(2008, 2023):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        # Train and evaluate for East and west
        try:
            eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
            western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
            
            results_summary.append({
                "Year": year,
                "Window_Size": window_size,
                "Model": "RandomForest",
                "Eastern_Accuracy": eastern_accuracy,
                "Western_Accuracy": western_accuracy
            })
        except ValueError as e:
            print(f"Skipping year {year} due to insufficient data: {e}")

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Group by window size and average the accuracies
RF_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]
print(RF_NetRating)


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.688031          0.682598
2                    0.738291          0.679202


  RF_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [32]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000, random_state=42)
results_summary = []

# Iterate over the years using 1 and 2 years of data to predict the next year
for window_size in [1, 2]:
    for year in range(2008, 2023):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        # Train and evaluate for East and west
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
        
        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "RandomForest",
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Group by window size and average the accuracies
LR_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]
print(LR_NetRating)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.818272          0.746923
2                    0.789255          0.776712


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  LR_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [31]:
from sklearn.svm import SVC

model = SVC(random_state=42)
results_summary = []

# Iterate over the years using 1 and 2 years of data to predict the next year
for window_size in [1, 2]:
    for year in range(2008, 2023):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        # Train and evaluate for East and west
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
        
        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "RandomForest",
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Group by window size and average the accuracies
SVC_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]
print(SVC_NetRating)

             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.684095          0.642004
2                    0.676738          0.667941


  SVC_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [30]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=42)
results_summary = []

# Iterate over the years using 1 and 2 years of data to predict the next year
for window_size in [1, 2]:
    for year in range(2008, 2023):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        # Train and evaluate for East and west
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
        
        # Store results
        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "RandomForest",
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Group by window size and average the accuracies
GB_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]
print(GB_NetRating)


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.661622          0.603376
2                    0.733372          0.692374


  GB_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [29]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
results_summary = []

# Iterate over the years using 1 and 2 years of data to predict the next year
for window_size in [1, 2]:
    for year in range(2008, 2023):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        # Train and evaluate for East and west
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
        
        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "RandomForest",
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Group by window size and average the accuracies
KNN_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]
print(KNN_NetRating)


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.611633          0.653119
2                    0.648213          0.643803


  KNN_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [27]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(random_state=42)
results_summary = []

# Iterate over the years using 1 and 2 years of data to predict the next year
for window_size in [1, 2]:
    for year in range(2008, 2023):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        # Train and evaluate for East and west
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
        
        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "RandomForest",
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results_summary)

# Group by window size and average the accuracies
DT_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]
print(DT_NetRating)

             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.732761          0.664459
2                    0.715454          0.732914


  DT_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [34]:
print('LR', LR_NetRating)
print('RF', RF_NetRating)
print('SVC', SVC_NetRating)
print('GB', GB_NetRating)
print('KNN', KNN_NetRating)
print('DT', DT_NetRating)

LR              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.818272          0.746923
2                    0.789255          0.776712
RF              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.688031          0.682598
2                    0.738291          0.679202
SVC              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.684095          0.642004
2                    0.676738          0.667941
GB              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.661622          0.603376
2                    0.733372          0.692374
KNN              Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.611633          0.653119
2                    0.648213          0.643803
DT              Eastern

In [35]:
LR_NetRating['Combined_Accuracy'] = LR_NetRating.mean(axis=1)
RF_NetRating['Combined_Accuracy'] = RF_NetRating.mean(axis=1)
SVC_NetRating['Combined_Accuracy'] = SVC_NetRating.mean(axis=1)
GB_NetRating['Combined_Accuracy'] = GB_NetRating.mean(axis=1)
KNN_NetRating['Combined_Accuracy'] = KNN_NetRating.mean(axis=1)
DT_NetRating['Combined_Accuracy'] = DT_NetRating.mean(axis=1)
# Print the result
print(LR_NetRating[['Combined_Accuracy']])
print(RF_NetRating[['Combined_Accuracy']])
print(SVC_NetRating[['Combined_Accuracy']])
print(GB_NetRating[['Combined_Accuracy']])
print(KNN_NetRating[['Combined_Accuracy']])
print(DT_NetRating[['Combined_Accuracy']])


             Combined_Accuracy
Window_Size                   
1                     0.782598
2                     0.782983
             Combined_Accuracy
Window_Size                   
1                     0.685314
2                     0.708746
             Combined_Accuracy
Window_Size                   
1                     0.663050
2                     0.672339
             Combined_Accuracy
Window_Size                   
1                     0.632499
2                     0.712873
             Combined_Accuracy
Window_Size                   
1                     0.632376
2                     0.646008
             Combined_Accuracy
Window_Size                   
1                     0.698610
2                     0.724184
