In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import os
import math
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# inputs
########
MIN_TRAIN_SAMPLES = 300
########

# Load every player's summary table and keep only the players whose first
# summary row reports at least MIN_TRAIN_SAMPLES training rows.
dfs = []
for name in os.listdir('./player_game_logs'):
    # Each player is expected to live in its own sub-directory; skip stray
    # files (e.g. .DS_Store) that would make the nested read below fail.
    if not os.path.isdir(f"./player_game_logs/{name}"):
        continue
    df = pd.read_csv(f"./player_game_logs/{name}/{name}_SUMMARY.csv")
    # n_train is taken from the first row only.
    if df.loc[0, 'n_train'] >= MIN_TRAIN_SAMPLES:
        dfs.append(df)

# Number of players that passed the training-size filter.
count = len(dfs)
combined_df = pd.concat(dfs, ignore_index=True).set_index('model_type')
print(count)

In [None]:
# Average every column across the loaded summaries, one row per model type
# (the index of combined_df).
summary_df = combined_df.groupby(level=0).mean()

# Notebook-wide display settings: floats rendered with 4 decimals and
# thousands separators, and no truncation of columns or rows.
pd.set_option('display.float_format', '{:,.4f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Show the averaged metrics, best test R^2 first.
summary_df.sort_values(by='r2_test', ascending=False)

In [None]:
## Model performance - ablated Team and Opponent features

In [None]:
# Load every player's ablated summary table and keep only the players whose
# first summary row reports at least MIN_TRAIN_SAMPLES training rows.
dfs = []
for name in os.listdir('./player_game_logs'):
    # Each player is expected to live in its own sub-directory; skip stray
    # files (e.g. .DS_Store) that would make the nested read below fail.
    if not os.path.isdir(f"./player_game_logs/{name}"):
        continue
    df = pd.read_csv(f"./player_game_logs/{name}/{name}_SUMMARY_ABLATED.csv")
    # n_train is taken from the first row only.
    if df.loc[0, 'n_train'] >= MIN_TRAIN_SAMPLES:
        dfs.append(df)

# Number of players that passed the training-size filter.
count = len(dfs)
combined_df = pd.concat(dfs, ignore_index=True).set_index('model_type')
print(count)

In [None]:
# Average every column per model type (the index of combined_df), then show
# the result ordered by test R^2, best first.
summary_df = combined_df.groupby(level=0).mean()
summary_df.sort_values(by='r2_test', ascending=False)

In [None]:
##### r2_test vs. n_train analysis

In [None]:
# Per-player summary tables, indexed by model type, for the players whose
# first summary row reports at least MIN_TRAIN_SAMPLES training rows.
dfs = {}
for name in os.listdir('./player_game_logs'):
    # Each player is expected to live in its own sub-directory; skip stray
    # files (e.g. .DS_Store) that would make the nested read below fail.
    if not os.path.isdir(f"./player_game_logs/{name}"):
        continue
    df = pd.read_csv(f"./player_game_logs/{name}/{name}_SUMMARY.csv")
    if df.loc[0, 'n_train'] >= MIN_TRAIN_SAMPLES:
        dfs[name] = df.set_index('model_type')

# Long-format table: one record per (player, model) row.
data = []
for player, df in dfs.items():
    # Walk the rows positionally instead of looking each one up by label, so a
    # repeated model label still yields one scalar n_train / r2_test per record.
    for model, n_train, r2_test in zip(df.index, df['n_train'], df['r2_test']):
        data.append({'Player': player, 'Model': model, 'n_train': n_train, 'r2_test': r2_test})
combined_df = pd.DataFrame(data)

In [None]:
# Display the current combined_df in full.
combined_df

In [None]:
# Determine layout: one panel per model, five panels per row.
models = sorted(combined_df['Model'].unique())
n_cols = 5
# Keep the 4x5 grid (20x16 figure) for up to 20 models and add rows of the
# same height beyond that, so axs[i] never runs past the available panels.
n_rows = max(4, math.ceil(len(models) / n_cols))
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows))
axs = axs.flatten()  # 1-D array of axes so panels can be addressed by model position

# Predefined axes limits, shared by every panel so they are directly comparable
x_limits = (0, 1500)
y_limits = (-1.0, 0.75)

for i, model in enumerate(models):
    model_df = combined_df[combined_df['Model'] == model]

    # One point per player for this model
    axs[i].scatter(model_df['n_train'], model_df['r2_test'], color='blue')
    axs[i].set_xlim(x_limits)
    axs[i].set_ylim(y_limits)

    # Best-fit line and equation. A straight-line fit needs at least two
    # distinct x values and no missing values; otherwise only the points are
    # drawn and the panel is labelled 'No data'.
    fit_df = model_df[['n_train', 'r2_test']].dropna()
    if fit_df['n_train'].nunique() >= 2:
        x = fit_df['n_train']
        y = fit_df['r2_test']
        m, b = np.polyfit(x, y, 1)  # Linear fit: slope m, intercept b
        # Two end points are enough to draw a straight line.
        x_line = np.array([x.min(), x.max()])
        axs[i].plot(x_line, m * x_line + b, color='red')
        equation_text = f'y = {m:.4f}x + {b:.4f}'
    else:
        equation_text = 'No data'

    # Title with the model name; the fitted equation goes below the x axis
    axs[i].set_title(f'{model}')
    axs[i].set_xlabel('n_train')
    axs[i].set_ylabel('r2_test')
    axs[i].text(0.5, -0.2, equation_text, transform=axs[i].transAxes, ha="center", va="top", fontsize=9, color="green")

# Hide unused subplots
for ax in axs[len(models):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
### TOP MODELS Analysis

In [None]:
# Number of best rows (by r2_test) kept for each model type.
TOP_N = 50

# Raw summary table of every player (no training-size filter in this section).
dfs = {}
for name in os.listdir('./player_game_logs'):
    # Each player is expected to live in its own sub-directory; skip stray
    # files (e.g. .DS_Store) that would make the nested read below fail.
    if not os.path.isdir(f"./player_game_logs/{name}"):
        continue
    dfs[name] = pd.read_csv(f"./player_game_logs/{name}/{name}_SUMMARY.csv")

# Step 1: Combine the DataFrames, tagging each row with its player
data = []
for player, df in dfs.items():
    df_copy = df.copy()  # leave the frames stored in dfs untouched
    df_copy.insert(0, 'player_name', player)
    data.append(df_copy)
combined_df = pd.concat(data)

# Step 2 and 3: Order by model type, then by r2_test (best first), and keep the
# top TOP_N rows of each model type. Sorting and then taking groupby().head()
# keeps 'model_type' as an ordinary column, without relying on groupby.apply
# handing the grouping column to the applied function.
top_models_per_type = (
    combined_df
    .dropna(subset=['model_type'])  # rows without a model type belong to no group
    .sort_values(['model_type', 'r2_test'], ascending=[True, False])
    .groupby('model_type')
    .head(TOP_N)
    .reset_index(drop=True)  # clean 0..n-1 index
)

In [None]:
# List the distinct model types present in top_models_per_type.
top_models_per_type['model_type'].unique()

In [None]:
model = 'Random Forest (500 estimators, 5 depth)'

# Lasso and Ridge variants to keep; going by the variable name, plain linear
# regression is intentionally left out.
linear_models_no_linreg = [
    'Lasso (lambda=0.01)',
    'Lasso (lambda=0.05)',
    'Lasso (lambda=0.1)',
    'Lasso (lambda=1.0)',
    'Ridge (lambda=0.1)',
    'Ridge (lambda=1.0)',
    'Ridge (lambda=2.0)',
]

# Keep only the rows whose model type is one of the variants listed above.
is_linear_model = top_models_per_type['model_type'].isin(linear_models_no_linreg)
top_linear_models = top_models_per_type.loc[is_linear_model]

In [None]:
# Display the rows of top_models_per_type that belong to the selected Lasso/Ridge variants.
top_linear_models

In [None]:
# Identifier, sample-size, metric and bias columns; everything left after
# dropping them is averaged (presumably the fitted feature weights - confirm
# against the summary CSV schema).
non_weight_columns = [
    'player_name', 'model_type', 'n_train', 'n_test',
    'mse_train', 'rmse_train', 'r2_train',
    'mse_test', 'rmse_test', 'r2_test',
    'y_test_mean', 'bias',
]
means = top_linear_models.drop(columns=non_weight_columns).mean()

# Two-column table: the averaged column's name and its mean value.
avg_weights = means.rename_axis('weight_name').reset_index(name='weight_avg')

# Ten largest average values.
avg_weights.sort_values(by='weight_avg', ascending=False).head(10)

In [None]:
# Ten smallest average values (the tail of the descending sort).
avg_weights.sort_values('weight_avg', ascending=False).tail(10)

In [None]:
# Inspect a single player's summary table, best test R^2 first.
ds = pd.read_csv('./player_game_logs/derek_strong/derek_strong_SUMMARY.csv')
ds.sort_values(by='r2_test', ascending=False)