In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Load your datasets
team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')
standings_df = pd.read_csv('NBA_Standings_Ranked_Classes.csv')
elo_df = pd.read_csv('elo_ratings_per_year.csv')
four_factors_df = pd.read_csv('team_year_avg_four_factors.csv')

# Merge the Elo ratings into your team stats dataset
team_stats_df = pd.merge(team_stats_df, elo_df[['Year', 'Team', 'Elo Rating']], how='left', on=['Year', 'Team'])

# Merge the new features into your team stats dataset
team_stats_df = pd.merge(team_stats_df, four_factors_df[['Year', 'Team', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate']], how='left', on=['Year', 'Team'])

# Merge the team stats with the updated standings data
merged_data = pd.merge(team_stats_df, standings_df, how='left', on=['Year', 'Team'])

# Ensure all necessary columns are available before creating new features
required_columns = ['ORtg', 'DRtg', 'W', 'L']
missing_columns = [col for col in required_columns if col not in merged_data.columns]

if missing_columns:
    print(f"Missing columns for feature creation: {missing_columns}")
else:
    # Create new features
    merged_data['Net_Rating'] = merged_data['ORtg'] - merged_data['DRtg']
    merged_data['Win_Loss_Ratio'] = merged_data['W'] / merged_data['L']

# Filter the data by year range
filtered_data = merged_data[merged_data['Year'] >= 2004]

# Select the relevant features for training
features = ['Elo Rating', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate', 'Net_Rating', 'Win_Loss_Ratio']

# Function to train and evaluate for different windows
def train_and_evaluate_yearly(conference_data, start_year, end_year, model):
    train_data = conference_data[(conference_data['Year'] >= start_year) & (conference_data['Year'] < end_year)]
    test_data = conference_data[conference_data['Year'] == end_year]
    
    X_train = train_data[features]
    y_train = train_data['ranking_class']

    X_test = test_data[features]
    y_test = test_data['ranking_class']

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    return accuracy




  team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')


In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score

# Load your datasets
team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')
standings_df = pd.read_csv('NBA_Standings_Ranked_Classes.csv')
elo_df = pd.read_csv('elo_ratings_per_year.csv')
four_factors_df = pd.read_csv('team_year_avg_four_factors.csv')

# Merge the Elo ratings into your team stats dataset
team_stats_df = pd.merge(team_stats_df, elo_df[['Year', 'Team', 'Elo Rating']], how='left', on=['Year', 'Team'])

# Merge the new features into your team stats dataset
team_stats_df = pd.merge(team_stats_df, four_factors_df[['Year', 'Team', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate']], how='left', on=['Year', 'Team'])

# Merge the team stats with the updated standings data
merged_data = pd.merge(team_stats_df, standings_df, how='left', on=['Year', 'Team'])

# Ensure all necessary columns are available before creating new features
required_columns = ['ORtg', 'DRtg', 'W', 'L']
missing_columns = [col for col in required_columns if col not in merged_data.columns]

if missing_columns:
    print(f"Missing columns for feature creation: {missing_columns}")
else:
    # Create new features
    merged_data['Net_Rating'] = merged_data['ORtg'] - merged_data['DRtg']
    merged_data['Win_Loss_Ratio'] = merged_data['W'] / merged_data['L']

# Filter the data by year range
filtered_data = merged_data[merged_data['Year'] >= 2004]

# Select the relevant features for training
features = ['Elo Rating', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate', 'Net_Rating', 'Win_Loss_Ratio']

# Function to train and evaluate for different windows
def train_and_evaluate_yearly(conference_data, start_year, end_year, model):
    train_data = conference_data[(conference_data['Year'] >= start_year) & (conference_data['Year'] < end_year)]
    test_data = conference_data[conference_data['Year'] == end_year]
    
    X_train = train_data[features]
    y_train = train_data['ranking_class']

    X_test = test_data[features]
    y_test = test_data['ranking_class']

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Store results
results_summary = []

# Iterate over the years using different data windows
for window_size in [1, 2, 3, 4, 5, 7]:
    for year in range(2008, 2023):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        # Evaluate for Eastern Conference
        eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
        
        # Evaluate for Western Conference
        western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
        
        # Store results
        results_summary.append({
            "Year": year,
            "Window_Size": window_size,
            "Model": "RandomForest",
            "Eastern_Accuracy": eastern_accuracy,
            "Western_Accuracy": western_accuracy
        })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(results_summary)

# Group by window size and calculate the mean accuracy
RF_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]

# Print the summary of results
print(RF_NetRating)

# If desired, perform a final cross-validation on the best window size to validate the findings
best_window = RF_NetRating.mean(axis=1).idxmax()
print(f"The optimal window size is {best_window} years.")

# Optionally, apply RandomizedSearchCV or GridSearchCV for hyperparameter tuning on the best window
# Example:
# best_rf_model = RandomForestClassifier(random_state=42)
# random_search = RandomizedSearchCV(estimator=best_rf_model, param_distributions=param_grid, n_iter=50, cv=5, scoring='accuracy', verbose=1, random_state=42, n_jobs=-1)
# random_search.fit(X_train, y_train)


  team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')


             Eastern_Accuracy  Western_Accuracy
Window_Size                                    
1                    0.700483          0.715840
2                    0.766086          0.698286
3                    0.778638          0.724101
4                    0.766300          0.714044
5                    0.774048          0.698719
7                    0.802327          0.728245
The optimal window size is 7 years.


  RF_NetRating = results_df.groupby(['Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]


In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Load your datasets
team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')
standings_df = pd.read_csv('NBA_Standings_Ranked_Classes.csv')
elo_df = pd.read_csv('elo_ratings_per_year.csv')
four_factors_df = pd.read_csv('team_year_avg_four_factors.csv')

# Merge the Elo ratings into your team stats dataset
team_stats_df = pd.merge(team_stats_df, elo_df[['Year', 'Team', 'Elo Rating']], how='left', on=['Year', 'Team'])

# Merge the new features into your team stats dataset
team_stats_df = pd.merge(team_stats_df, four_factors_df[['Year', 'Team', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate']], how='left', on=['Year', 'Team'])

# Merge the team stats with the updated standings data
merged_data = pd.merge(team_stats_df, standings_df, how='left', on=['Year', 'Team'])

# Ensure all necessary columns are available before creating new features
required_columns = ['ORtg', 'DRtg', 'W', 'L']
missing_columns = [col for col in required_columns if col not in merged_data.columns]

if missing_columns:
    print(f"Missing columns for feature creation: {missing_columns}")
else:
    # Create new features
    merged_data['Net_Rating'] = merged_data['ORtg'] - merged_data['DRtg']
    merged_data['Win_Loss_Ratio'] = merged_data['W'] / merged_data['L']

# Filter the data by year range
filtered_data = merged_data[merged_data['Year'] >= 2004]

# Select the relevant features for training
features = ['Elo Rating', 'eFG%', 'TOV%', 'ORB%', 'FT_Rate', 'Net_Rating', 'Win_Loss_Ratio']

# Function to train and evaluate for different windows
def train_and_evaluate_yearly(conference_data, start_year, end_year, model):
    train_data = conference_data[(conference_data['Year'] >= start_year) & (conference_data['Year'] < end_year)]
    test_data = conference_data[conference_data['Year'] == end_year]
    
    X_train = train_data[features]
    y_train = train_data['ranking_class']

    X_test = test_data[features]
    y_test = test_data['ranking_class']

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# List of models to evaluate
models = {
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "kNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(random_state=42)
}

# Store results
results_summary = []

# Iterate over the years using different data windows
for window_size in [1, 2, 3, 4, 5, 7]:
    for year in range(2008, 2024):
        eastern_data = filtered_data[filtered_data['Conference'] == 'Eastern Conference']
        western_data = filtered_data[filtered_data['Conference'] == 'Western Conference']

        for model_name, model in models.items():
            # Evaluate for Eastern Conference
            eastern_accuracy = train_and_evaluate_yearly(eastern_data, year - window_size, year, model)
            
            # Evaluate for Western Conference
            western_accuracy = train_and_evaluate_yearly(western_data, year - window_size, year, model)
            
            # Store results
            results_summary.append({
                "Year": year,
                "Window_Size": window_size,
                "Model": model_name,
                "Eastern_Accuracy": eastern_accuracy,
                "Western_Accuracy": western_accuracy
            })

# Convert results to a DataFrame for easy viewing
results_df = pd.DataFrame(results_summary)

# Group by model and window size, then calculate the mean accuracy
grouped_results = results_df.groupby(['Model', 'Window_Size']).mean()[['Eastern_Accuracy', 'Western_Accuracy']]

# Print the summary of results
print(grouped_results)

# Determine and print the best window size for each model
for model_name in models.keys():
    best_window = grouped_results.loc[model_name].mean(axis=1).idxmax()
    print(f"The optimal window size for {model_name} is {best_window} years.")


  team_stats_df = pd.read_csv('team_stats_0423_sorted.csv')
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                                Eastern_Accuracy  Western_Accuracy
Model              Window_Size                                    
DecisionTree       1                    0.737318          0.737217
                   2                    0.745465          0.720207
                   3                    0.766629          0.724599
                   4                    0.766558          0.686604
                   5                    0.765980          0.670817
                   7                    0.766976          0.682539
GradientBoosting   1                    0.687406          0.657456
                   2                    0.745042          0.681047
                   3                    0.762418          0.707897
                   4                    0.787405          0.699326
                   5                    0.782864          0.711981
                   7                    0.783024          0.732426
LogisticRegression 1                    0.796166          0.79