In [None]:
import pandas as pd
import nfl_data_py as nfl
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Define the recency_analysis function and its rolling-window helpers
def _trailing_mean(values, window):
    """Return, for each row, the mean of up to `window` values strictly before it."""
    # shift(1) drops the current row from its own window, so the average
    # never includes the observation it is paired with.
    return values.shift(1).rolling(window, min_periods=1).mean()


def _leading_mean(values, window):
    """Return, for each row, the mean of up to `window` values strictly after it."""
    # Rolling windows only look backwards, so run the trailing computation on
    # the reversed series and flip the result back into the original order.
    return _trailing_mean(values.iloc[::-1], window).iloc[::-1]


def recency_analysis(pbp_long, a=10, future_window=500):
    """Score how well a player's recent form predicts the change in future form.

    For each player, a trailing average of the previous `a` plays and a
    forward-looking average of the next `future_window` plays are computed for
    EPA and WPA. A Ridge regression is then fit, with 2-fold cross-validation,
    to predict the difference (future minus trailing) from the trailing
    averages and the player's cumulative snap count.

    The input frame is not modified; all work happens on a private copy.

    NOTE(review): rows are assumed to be in chronological order within each
    player already; nothing here sorts them. Confirm against the caller.

    Args:
        pbp_long: DataFrame with at least the columns `player_id`, `epa`
            and `wpa`, one row per player-play.
        a: Size of the trailing window, in plays. Rows before a player's
            (a / 2)-th snap are excluded before the windows are computed, and
            only rows at or after the a-th snap are used for modelling.
        future_window: Size of the forward-looking window, in plays.

    Returns:
        DataFrame with one row per (dependent variable, fold) and the columns
        `dep_var`, `mse`, `r2` and `window_size`.

    Raises:
        ValueError: If `a` or `future_window` is smaller than 1.
    """
    if a < 1 or future_window < 1:
        raise ValueError("a and future_window must be positive window sizes")

    metrics = ['epa', 'wpa']

    # Copy only the columns that are needed and give the copy a fresh
    # RangeIndex: the caller's frame stays untouched, and the index-aligned
    # column assignments below are safe even if its index has duplicates.
    data = pbp_long[['player_id'] + metrics].reset_index(drop=True)

    # Cumulative stats
    data['cumulative_player_snaps'] = data.groupby('player_id').cumcount() + 1

    # Filter for snaps (explicit copy so new columns are not written to a slice)
    data = data[data['cumulative_player_snaps'] >= (a / 2)].copy()

    for metric in metrics:
        per_player = data.groupby('player_id')[metric]

        # transform() returns a result aligned to `data`; extra positional
        # arguments are forwarded to the helper.
        running = per_player.transform(_trailing_mean, a)
        future = per_player.transform(_leading_mean, future_window)

        data[f'running_avg_player_{metric}'] = running
        data[f'future_avg_player_{metric}'] = future
        # Delta features: difference between future and trailing average
        data[f'delta_avg_player_{metric}'] = future - running

    dependent_vars = ['delta_avg_player_epa', 'delta_avg_player_wpa']
    independent_vars = ['running_avg_player_epa', 'running_avg_player_wpa', 'cumulative_player_snaps']

    # Debug statement
    print(f"Initial rows: {len(data)}")
    data = data.dropna(subset=dependent_vars)
    print(f"After dropna: {len(data)}")
    data = data[data['cumulative_player_snaps'] >= a]
    print(f"After filtering by cumulative_player_snaps: {len(data)}")

    # Initialize the model
    model = Ridge(alpha=1.0)
    scaler = StandardScaler()
    kf = KFold(n_splits=2, shuffle=True, random_state=42)
    results = []

    features = data[independent_vars]

    for dep_var in dependent_vars:
        target = data[dep_var]
        for train_index, test_index in kf.split(data):
            # Split the data into training and testing sets
            X_train, X_test = features.iloc[train_index], features.iloc[test_index]
            y_train, y_test = target.iloc[train_index], target.iloc[test_index]

            # Scale the features; the scaler is fit on the training fold only
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Fit the model and predict the held-out fold
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)

            # Calculate metrics
            results.append({
                'dep_var': dep_var,
                'mse': mean_squared_error(y_test, y_pred),
                'r2': r2_score(y_test, y_pred),
                'window_size': a,
            })

    # Convert results to DataFrame
    return pd.DataFrame(results)

# Run the recency analysis once per trailing-window size, in ascending order
_WINDOW_SIZES = (10, 50, 100, 500)
recency_results_a, recency_results_b, recency_results_c, recency_results_d = (
    recency_analysis(pbp_long, a=_size) for _size in _WINDOW_SIZES
)
_per_window_results = [recency_results_a, recency_results_b, recency_results_c, recency_results_d]

# Tag every per-window result frame with the window size that produced it
for _frame, _size in zip(_per_window_results, _WINDOW_SIZES):
    _frame['window_size'] = _size

# Stack the four result frames into a single long table
recency_results = pd.concat(_per_window_results)

# Average the per-fold rows into one row per window size; pivot_table (rather
# than pivot) is used because each (window_size, dep_var) pair appears once per fold
recency_results_pivot = recency_results.pivot_table(
    values=['mse', 'r2'],
    index='window_size',
    columns='dep_var',
    aggfunc='mean',
).reset_index()

# Plot one panel per metric, with one line per dependent variable
_PANELS = (('mse', 'MSE'), ('r2', 'r2'))
_TARGETS = (('EPA', 'delta_avg_player_epa'), ('WPA', 'delta_avg_player_wpa'))

plt.figure(figsize=(12, 10))
for _position, (_metric, _shown) in enumerate(_PANELS, start=1):
    plt.subplot(len(_PANELS), 1, _position)
    for _tag, _target in _TARGETS:
        # The pivoted columns are (metric, dep_var) tuples
        sns.lineplot(
            data=recency_results_pivot,
            x='window_size',
            y=(_metric, _target),
            label=f'{_tag} {_shown}',
            marker='o',
        )
    plt.title(f'Recency Analysis: {_shown} vs Window Size')
    plt.xlabel('Window Size')
    plt.ylabel(f'{_shown} ({_metric})')
    plt.legend()
    plt.grid(True)
plt.show()