# Barry

#### Dependencies

In [None]:
import boto3
import time
import pandas as pd
import numpy as np
import pymysql as mysql
import os
from datetime import datetime
import warnings
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
import joblib

# Suppress all warnings
warnings.filterwarnings("ignore")

# AWS Credentials & Region
AWS_REGION = "us-east-2"  # Change to your region
DATABASE = "chalk"
TABLE = "chalkjuice_data"
S3_OUTPUT = "s3://chalkjuice/golden_athena/"  # Replace with your actual S3 bucket

# Initialize Athena Client
athena_client = boto3.client("athena", region_name=AWS_REGION)

weights = [5,7,9,13,.3,.25,.25,.2]
games_back = weights[0] + weights[1] + weights[2] + weights[3] + 1

In [536]:
# AWS Credentials & Region
AWS_REGION = "us-east-2"  # Change to your region
DATABASE = "chalk"
TABLE = "chalkjuice_data"
S3_OUTPUT = "s3://chalkjuice/golden_athena/"  # Replace with your actual S3 bucket

# Initialize Athena Client
athena_client = boto3.client("athena", region_name=AWS_REGION)

In [110]:
weights = [5,7,9,13,.3,.25,.25,.2]
games_back = weights[0] + weights[1] + weights[2] + weights[3] + 1

#### Definitions

In [541]:
## outliers
def handle_outliers(df, feature):

    # calculate the mean and standard deviation of the feature
    mean = df[feature].mean()
    std = df[feature].std()

    # define the threshold for outliers (3 standard deviations)
    threshold = 3 * std

    # save the indices of outliers
    outlier_indices = df[(df[feature] < mean - threshold) | (df[feature] > mean + threshold)].index
   
   # replace outliers with NaN values
    #df.loc[outlier_indices, feature] = np.nan
    #print(f"Number of rows dropped for feature '{feature}': {len(outlier_indices)}")

    # you can also remove outliers from the DataFrame completely
    df = df.drop(outlier_indices)

    return df

In [542]:
def feature_importance(model, original_x):
  # get feature importances and corresponding feature names
  importances = model.feature_importances_
  feature_names = original_x.columns

  # create a dictionary of feature names and importances then sort and extract
  feature_importance_dict = dict(zip(feature_names, importances))
  sorted_feature_importance = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))
  sorted_feature_names = list(sorted_feature_importance.keys())
  sorted_importances = list(sorted_feature_importance.values())

  # create the barplot
  plt.figure(figsize=(10, 6))
  ax = sns.barplot(y=sorted_feature_names, x=sorted_importances, orient='h')

  # calculate the total importance
  total_importance = sum(sorted_importances)

  # add percentages to the end of each bar
  for i, importance in enumerate(sorted_importances):
      percentage = (importance / total_importance) * 100
      ax.text(importance + 0.01, i, f'{percentage:.2f}%', va='center')

  # calculate top 5 leaderboard
  top_features = sorted_feature_names[:10]
  top_leaderboard = "\n".join([f"{i+1}. {feature}" for i, feature in enumerate(top_features)])

  # add top leaderboard text to the bottom right
  plt.text(0.9, 0.1, top_leaderboard, transform=ax.transAxes, fontsize=10, ha='right')

  plt.xlabel('Importance')
  plt.ylabel('Feature')
  plt.title('Feature Importances')

  plt.show()

In [543]:
def query_athena(query):
    # Start Query Execution
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={"Database": DATABASE},
        ResultConfiguration={"OutputLocation": S3_OUTPUT},
    )

    # Get Query Execution ID
    query_execution_id = response["QueryExecutionId"]

    # Wait for Query to Complete
    while True:
        status = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        state = status["QueryExecution"]["Status"]["State"]
        
        if state in ["SUCCEEDED", "FAILED", "CANCELLED"]:
            break
        
        time.sleep(.1)  # Check every .1 seconds

    if state != "SUCCEEDED":
        failure_reason = status["QueryExecution"]["Status"].get("StateChangeReason", "Unknown Error")
        raise Exception(f"Athena query failed with state: {state}, Reason: {failure_reason}")


    # Get Query Results
    results = athena_client.get_query_results(QueryExecutionId=query_execution_id)

    return results

In [161]:
def query_athena_df(query):
    # Start Query Execution
    response = athena_client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={"Database": DATABASE},
        ResultConfiguration={"OutputLocation": S3_OUTPUT},
    )

    # Get Query Execution ID
    query_execution_id = response["QueryExecutionId"]

    # Wait for Query to Complete
    while True:
        status = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
        state = status["QueryExecution"]["Status"]["State"]
        
        if state in ["SUCCEEDED", "FAILED", "CANCELLED"]:
            break
        
        time.sleep(.1)  # Check every .1 seconds

    if state != "SUCCEEDED":
        failure_reason = status["QueryExecution"]["Status"].get("StateChangeReason", "Unknown Error")
        raise Exception(f"Athena query failed with state: {state}, Reason: {failure_reason}")


    # Get Query Results
    results = athena_client.get_query_results(QueryExecutionId=query_execution_id)

    columns = [col["Label"] for col in results["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]

    # Extract Rows
    rows = []
    for row in results["ResultSet"]["Rows"][1:]:  # Skip header row
        extracted_row = [col.get("VarCharValue", None) for col in row["Data"]]  # Extract actual values
        rows.append(extracted_row)

    # Convert to Pandas DataFrame
    df = pd.DataFrame(rows, columns=columns)
    df = df.fillna("NA")

    return df

In [106]:
def oldest_usable_game(team, games_back):
    query = f"""
        WITH ordered_games AS (
            SELECT date,
                ROW_NUMBER() OVER (PARTITION BY team ORDER BY date ASC) AS row_num
            FROM "{DATABASE}"."{TABLE}"
            WHERE team = '{team}'
        )
        SELECT date
        FROM ordered_games
    """
    results = query_athena(query)

    columns = [col["Label"] for col in results["ResultSet"]["ResultSetMetadata"]["ColumnInfo"]]

    # Extract Rows
    rows = []
    for row in results["ResultSet"]["Rows"][1:]:  # Skip header row
        extracted_row = [col.get("VarCharValue", None) for col in row["Data"]]  # Extract actual values
        rows.append(extracted_row)

    # Convert to Pandas DataFrame
    df = pd.DataFrame(rows, columns=columns)
    df = df.fillna("NA")
    df.columns = df.columns.str.replace('_', ' ').str.title()  # Format column names
    df['Date'] = pd.to_datetime(df['Date'])  # Ensure 'Date' is in datetime format
    df = df.sort_values(by='Date', ascending=True)  # Sort from oldest to newest

    
    return str(df['Date'].iloc[(games_back)])[:10]

In [409]:
def weighted_avg(df, col, gb1, gb2, gb3, gb4, weight1, weight2, weight3, weight4, inte = None):


    gb2 = gb1 + gb2
    gb3 = gb2 + gb3
    gb4 = gb3 + gb4

    average_gb1 = df[col].iloc[:gb1].mean()
    weighted_gb1 = average_gb1 * weight1

    average_gb2 = df[col].iloc[gb1:gb2].mean()
    weighted_gb2 = average_gb2 * weight2


    average_gb3 = df[col].iloc[gb2:gb3].mean()
    weighted_gb3 = average_gb3 * weight3

    average_gb4 = df[col].iloc[gb3:gb4].mean()
    weighted_gb4 = average_gb4 * weight4


    weighted_avg = round(((weighted_gb1 + weighted_gb2 + weighted_gb3 + weighted_gb4) / sum([weight1, weight2, weight3, weight4])), 3)

    
    if inte == 1:
        weighted_avg = int(weighted_avg)

    return weighted_avg

#### 1) To predict the points scored by the offense I need to be able to gather data from their previous games. Meaning I cant use all of the games in the database - I need to remove the first 34 games played by each team. 
#### This part gathers the all of the matchups that meet this criteria, a template with the date, team, and opponent.

In [162]:
# List of all 32 NFL teams (3-letter abbreviations)
nfl_teams = [
    "ARI", "ATL", "BAL", "BUF", "CAR", "CHI", "CIN", "CLE",
    "DAL", "DEN", "DET", "GNB", "HOU", "IND", "JAX", "KAN",
    "LVR", "LAC", "LAR", "MIA", "MIN", "NWE", "NOR", "NYG",
    "NYJ", "PHI", "PIT", "SFO", "SEA", "TAM", "TEN", "WAS"
]

# Initialize an empty master DataFrame
master_template = pd.DataFrame()

for team in nfl_teams:
    games_back = 34

    oldest_usable_game_for_this_team = oldest_usable_game(team, games_back)

    # Construct Query
    query = f'''
        SELECT date_parse(date, '%c/%e/%Y') AS parsed_date, team, opponent
        FROM "{DATABASE}"."{TABLE}"
        WHERE date_parse(date, '%c/%e/%Y') >= DATE '{oldest_usable_game_for_this_team}'
            AND team = '{team}';
    '''
    df = query_athena_df(query)

    df["date"] = df["parsed_date"].astype(str).str[:10]
    df = df.drop(columns=["parsed_date"])  # Optionally drop the old column if not needed
    print(team, oldest_usable_game_for_this_team, df.size)

    # Append the team DataFrame to the master DataFrame
    master_template = pd.concat([master_template, df], ignore_index=True)

master_template = master_template[master_template['date'] != '1/2/2023']
master_template = master_template.sort_values(by='date', ascending=True)  # Sort from oldest to newest
master_template['date'] = pd.to_datetime(master_template['date']).dt.strftime("%Y-%m-%d")
master_template.head(10)
#master_template.to_csv('master_template.csv')


ARI 1969-11-02 2553
ATL 1969-11-02 2553
BAL 1969-11-02 2553
BUF 1969-10-26 2553
CAR 1997-09-14 1299
CHI 1969-11-02 2553
CIN 1970-11-02 2511
CLE 2001-09-30 1107
DAL 1969-11-02 2553
DEN 1969-10-26 2553
DET 1969-11-02 2553
GNB 1969-11-02 2553
HOU 2004-09-26 963
IND 1969-11-02 2553
JAX 1997-09-22 1299
KAN 1969-10-26 2553
LVR 1969-10-26 2553
LAC 1969-10-26 2553
LAR 1969-11-02 2553
MIA 1969-10-26 2553
MIN 1969-11-02 2553
NWE 1969-10-26 2553
NOR 1969-11-02 2553
NYG 1969-11-02 2553
NYJ 1969-10-26 2553
PHI 1969-11-02 2553
PIT 1969-11-02 2553
SFO 1969-11-02 2553
SEA 1978-10-15 2175
TAM 1978-10-15 2175
TEN 1969-10-26 2553
WAS 1969-11-02 2553


Unnamed: 0,team,opponent,date
12047,LVR,LAC,1969-10-26
7471,DEN,TEN,1969-10-26
14385,MIA,BUF,1969-10-26
10903,KAN,CIN,1969-10-26
16009,NWE,NYJ,1969-10-26
12605,LAC,LVR,1969-10-26
19075,NYJ,NWE,1969-10-26
24087,TEN,DEN,1969-10-26
3396,BUF,MIA,1969-10-26
8965,GNB,PIT,1969-11-02


In [614]:
# drop the canceled game between the bengals and the ravens
master_template = master_template[master_template['date'] != '1/2/2023']

#### 2) For each matchup from step 1, aggregate the team's offense and opponent's defense stats from the last 34 games (weighted) and add it to the dataframe. The goal is to predict the points scored by the offense. To predict a games outcome simply run the model twice - first predict team1 points and then team 2 points. Basically, I am treating an NFL game as two differennt matchups 1) team 1 offense  vs team 2 defense 2) team 2 offense vs team defense 1.  

#### This part aggregates the stats from the previous 34 games (weighted) that came before each matchup. 


#### Currently only works with rows that have no N/A. I made the cutoff date 2003 so that there would be no N/A values. If i negated using the columns with N/A like 3d con/att thens I could go all the way back to 1969.


#### I built this to mimic the way requests will be coming in the background. It generates the required data summaries one matchup at at time as oppposed to querying all the data fro one team at once to save money. For each date/team date/team combo, I have to query athena 4 times to generate all the data. In the future it'll be better to query all data at once, then use pandas to process it further. But this way I already have the code for the lambda function ready for deployment. 

In [615]:
master_template['date'] = pd.to_datetime(master_template['date'])
master_template_2 = master_template[master_template['date'] >= '2003-08-08']

In [616]:
len(master_template_2)

10828

In [None]:
chalk_22_model_2 = pd.DataFrame(columns=[ 
    'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds', 'fmb', 'time_of_possession',
    'punts_yds', 'penalty_yds', 'def_passing_yds',
    'def_passing_int', 'def_passing_times_sacked',
    'def_rushing_yds', 'def_fmb', 'def_time_of_possession',
    'pass_play_percentage', 'def_pass_play_percentage', 'drives',
    'def_drives', 'tds_per_yard', 'def_tds_per_yard',
    'clutch_conversion_percentage', 'def_clutch_conversion_percentage',
    'fg_percentage', 'points', 'date', 'home_game'
    ])

In [639]:
def create_features(df):  

    merged_df_2 = df
    ##### pass_play_percentage
    merged_df_2['pass_play_percentage'] = 100*(merged_df_2['passing_att'] / (merged_df_2['passing_att'] + merged_df_2['rushing_att']))
    merged_df_2['def_pass_play_percentage'] = 100*(merged_df_2['def_passing_att'] / (merged_df_2['def_passing_att'] + merged_df_2['def_rushing_att']))

    merged_df_2 = merged_df_2.drop(columns=['passing_att', 'rushing_att', 'def_passing_att', 'def_rushing_att'])

    ##### drives
    # Offensive drives
    merged_df_2['drives'] = merged_df_2['passing_tds'] + merged_df_2['rush_tds'] + merged_df_2['fga'] + merged_df_2['punts_total']

    # Defensive drives
    merged_df_2['def_drives'] = merged_df_2['def_passing_tds'] + merged_df_2['def_rush_tds'] + merged_df_2['def_fga'] + merged_df_2['def_punts_total']

    # drop
    merged_df_2 = merged_df_2.drop(columns=['punts_total', 'def_punts_total', 'def_fga'])

    ##### tds per 10000 yards
    # Offensive touchdowns per yard
    merged_df_2['tds_per_yard'] = 10000 * ((merged_df_2['passing_tds'] + merged_df_2['rush_tds']) / \
                                (merged_df_2['passing_yds'] + merged_df_2['rushing_yds']))

    # Defensive touchdowns per yard
    merged_df_2['def_tds_per_yard'] = 10000 * ((merged_df_2['def_passing_tds'] + merged_df_2['def_rush_tds']) / \
                                    (merged_df_2['def_passing_yds'] + merged_df_2['def_rushing_yds']))

    merged_df_2 = merged_df_2.drop(columns=['passing_tds', 'rush_tds', 'def_passing_tds', 'def_rush_tds'])

    ##### cluth metric
    # Offensive clutch conversion percentage
    merged_df_2['clutch_conversion_percentage'] = 100*((merged_df_2['3d_conversions'] + merged_df_2['4d_conversions'] + merged_df_2['2pm']) / \
                                                (merged_df_2['3d_att'] + merged_df_2['4d_att'] + merged_df_2['2pa']))

    # Defensive clutch conversion percentage
    merged_df_2['def_clutch_conversion_percentage'] = 100*(1 - ((merged_df_2['def_3d_conversions'] + merged_df_2['def_4d_conversions'] + merged_df_2['def_2pm']) / \
                                                    (merged_df_2['def_3d_att'] + merged_df_2['def_4d_att'] + merged_df_2['def_2pa'])))

    # Drop the original columns
    merged_df_2 = merged_df_2.drop(columns=[
        '3d_att', '4d_att', '2pa', '3d_conversions', '4d_conversions', '2pm',
        'def_3d_att', 'def_4d_att', 'def_2pa', 'def_3d_conversions', 'def_4d_conversions', 'def_2pm'
    ])

    ##### fg_percentage
    merged_df_2['fg_percentage'] = 100*(np.where(merged_df_2['fga'] == 0, 0, merged_df_2['fgm'] / merged_df_2['fga']))

    # Now drop 'fgm' and 'fga'
    merged_df_2 = merged_df_2.drop(columns=['fgm', 'fga'])

    merged_df_2 = merged_df_2.astype(int)


    return merged_df_2

In [653]:
def collect_df_for_each_matchup(team, opponent, date1, date2):
    
    if date2 is None:
        date2 = date1

    # create data frames with the 35 most recent games =< the proivided date for both team offense and opponent defense
    query_offense = f'''
        SELECT date, team, opponent, points, home_game,
            passing_yds, passing_tds, passing_int, 
            passing_times_sacked, rushing_yds, 
            rush_tds, fmb, "3d_att", "3d_conversions", "4d_att", "4d_conversions", time_of_possession, 
            fga, punts_yds, punts_total, "2pm", "2pa",
            penalty_yds, fgm, passing_att, rushing_att
        FROM "{DATABASE}"."{TABLE}"
        WHERE team = '{team}'
            AND TRY_CAST(DATE_PARSE(date, '%m/%d/%Y') AS DATE) <= DATE '{date1}'
        ORDER BY TRY_CAST(DATE_PARSE(date, '%m/%d/%Y') AS DATE) DESC
        LIMIT {games_back+1};
    '''
    off_df = query_athena_df(query_offense)

    query_defense = f'''
        SELECT 
            date, team, opponent, passing_yds AS def_passing_yds, 
            passing_tds AS def_passing_tds, passing_int AS def_passing_int, 
            passing_times_sacked AS def_passing_times_sacked, rushing_yds AS def_rushing_yds, 
            rush_tds AS def_rush_tds, fmb AS def_fmb, 
            "3d_att" AS def_3d_att, "3d_conversions" AS def_3d_conversions, 
            "4d_att" AS def_4d_att, "4d_conversions" AS def_4d_conversions, 
            time_of_possession AS def_time_of_possession, fga AS def_fga, 
            punts_total AS def_punts_total, "2pm" AS def_2pm, 
            "2pa" AS def_2pa, passing_att AS def_passing_att, rushing_att AS def_rushing_att
        FROM "{DATABASE}"."{TABLE}"
        WHERE opponent = '{opponent}'
            AND TRY_CAST(DATE_PARSE(date, '%m/%d/%Y') AS DATE) <= DATE '{date2}' 
        ORDER BY TRY_CAST(DATE_PARSE(date, '%m/%d/%Y') AS DATE) DESC
        LIMIT {games_back+1};
    '''
    def_df = query_athena_df(query_defense)

    # join the two dfs on the index column because teams may play on different days on the same week. 
    merged_df = off_df.merge(def_df, left_index=True, right_index=True, how='inner')
    merged_df = merged_df[merged_df['date_x'] != '1/2/2023']
    merged_df = merged_df[merged_df['date_y'] != '1/2/2023']

    # first take out any information from the row date in question
    points = int(merged_df['points'][0])
    home_game = int(merged_df['home_game'][0])

    # remove the top row because you dont want to train the model on data from the same week we are uses for points
    merged_df_2 = merged_df.drop(merged_df.index[0])
    merged_df_2 = merged_df_2.drop(columns=['date_x', 'team_x', 'opponent_x', 'date_y', 'team_y', 'opponent_y', 'home_game', 'points'])

    # convert to integers. this is the part that despises N/A values 
    merged_df_2 = merged_df_2.astype(int)

    return(merged_df_2, points, home_game)

In [None]:
for index, row in master_template_2.iterrows():
    # Save row 1 column values into individual variables before dropping
    team = row['team']
    opponent = row['opponent']
    date = str(row['date'])[:10]
    print(team, opponent, date)

    # create a 34 most recent game df fro each matchup
    merged_df_2, points, home_game = collect_df_for_each_matchup(team, opponent, date)

    # create features
    merged_df_2 = create_features(merged_df_2)

    # Dictionary to store weighted averages
    weighted_averages = {}
    # List of columns to calculate weighted averages for
    columns = [
        'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds', 'fmb', 'time_of_possession',
        'punts_yds', 'penalty_yds', 'def_passing_yds',
        'def_passing_int', 'def_passing_times_sacked',
        'def_rushing_yds', 'def_fmb', 'def_time_of_possession', 'def_passing_times_sacked',
        'pass_play_percentage', 'def_pass_play_percentage', 'drives',
        'def_drives', 'tds_per_yard', 'def_tds_per_yard',
        'clutch_conversion_percentage', 'def_clutch_conversion_percentage',
        'fg_percentage'
    ]

    # Calculate weighted averages and store in the dictionary
    for col in columns:
        weighted_averages[col] = weighted_avg(merged_df_2, col, *weights)

    # Convert dictionary to a DataFrame (single-row)
    weighted_avg_df = pd.DataFrame([weighted_averages])
    
    # add home_game the only varible that doesnt need to be weighted
    weighted_avg_df['home_game'] = home_game

    # kept this for debugging
    weighted_avg_df['points'] = points
    weighted_avg_df['date'] = date

    # Concatenate the new row to chalk_22_model
    chalk_22_model = pd.concat([chalk_22_model, weighted_avg_df], ignore_index=True)

NYJ WAS 2003-09-04
WAS NYJ 2003-09-04
DAL ATL 2003-09-07
LAC KAN 2003-09-07
ATL DAL 2003-09-07
JAX CAR 2003-09-07
CHI SFO 2003-09-07
NWE BUF 2003-09-07
DEN CIN 2003-09-07
MIA HOU 2003-09-07
PIT BAL 2003-09-07
IND CLE 2003-09-07
GNB MIN 2003-09-07
LAR NYG 2003-09-07
BUF NWE 2003-09-07
DET ARI 2003-09-07
LVR TEN 2003-09-07
BAL PIT 2003-09-07
ARI DET 2003-09-07
NYG LAR 2003-09-07
CIN DEN 2003-09-07
NOR SEA 2003-09-07
SFO CHI 2003-09-07
KAN LAC 2003-09-07
MIN GNB 2003-09-07
SEA NOR 2003-09-07
CAR JAX 2003-09-07
CLE IND 2003-09-07
TEN LVR 2003-09-07
TAM PHI 2003-09-08
PHI TAM 2003-09-08
GNB DET 2003-09-14
CIN LVR 2003-09-14
DEN LAC 2003-09-14
BAL CLE 2003-09-14
ARI SEA 2003-09-14
BUF JAX 2003-09-14
TEN IND 2003-09-14
ATL WAS 2003-09-14
CAR TAM 2003-09-14
JAX BUF 2003-09-14
DET GNB 2003-09-14
NWE PHI 2003-09-14
NOR HOU 2003-09-14
WAS ATL 2003-09-14
MIA NYJ 2003-09-14
CLE BAL 2003-09-14
IND TEN 2003-09-14
KAN PIT 2003-09-14
MIN CHI 2003-09-14
NYJ MIA 2003-09-14
LAR SFO 2003-09-14
PIT KAN 2003

In [None]:
chalk_22_model.to_csv('aggregated_stats_modeling.csv') 




In [654]:
chalk_22_model_2 = pd.DataFrame(columns=[ 
    'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds', 'fmb', 'time_of_possession',
    'punts_yds', 'penalty_yds', 'def_passing_yds',
    'def_passing_int', 'def_passing_times_sacked',
    'def_rushing_yds', 'def_fmb', 'def_time_of_possession',
    'pass_play_percentage', 'def_pass_play_percentage', 'drives',
    'def_drives', 'tds_per_yard', 'def_tds_per_yard',
    'clutch_conversion_percentage', 'def_clutch_conversion_percentage',
    'fg_percentage', 'points', 'date', 'home_game'
    ])

In [655]:
# Save row 1 column values into individual variables before dropping
team = 'MIN'
opponent = 'DET'
date1 = '2022-10-08'
date2 = '2022-10-08'


# create a 34 most recent game df fro each matchup
merged_df_2, points, home_game = collect_df_for_each_matchup(team, opponent, date1, date2)

# create features
merged_df_2 = create_features(merged_df_2)

# Dictionary to store weighted averages
weighted_averages = {}
# List of columns to calculate weighted averages for
columns = [
    'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds', 'fmb', 'time_of_possession',
    'punts_yds', 'penalty_yds', 'def_passing_yds',
    'def_passing_int', 'def_passing_times_sacked',
    'def_rushing_yds', 'def_fmb', 'def_time_of_possession', 'def_passing_times_sacked',
    'pass_play_percentage', 'def_pass_play_percentage', 'drives',
    'def_drives', 'tds_per_yard', 'def_tds_per_yard',
    'clutch_conversion_percentage', 'def_clutch_conversion_percentage',
    'fg_percentage'
]

# Calculate weighted averages and store in the dictionary
for col in columns:
    weighted_averages[col] = weighted_avg(merged_df_2, col, *weights)

# Convert dictionary to a DataFrame (single-row)
weighted_avg_df = pd.DataFrame([weighted_averages])

# add home_game the only varible that doesnt need to be weighted
weighted_avg_df['home_game'] = home_game

# kept this for debugging
weighted_avg_df['points'] = points
weighted_avg_df['date'] = date1

# Concatenate the new row to chalk_22_model
chalk_22_model_2 = pd.concat([chalk_22_model_2, weighted_avg_df], ignore_index=True)

In [657]:
# Save row 1 column values into individual variables before dropping
team = 'DET'
opponent = 'MIN'
date1 = '2022-10-08'
date2 = '2022-10-08'


# create a 34 most recent game df fro each matchup
merged_df_2, points, home_game = collect_df_for_each_matchup(team, opponent, date1, date2)

# create features
merged_df_2 = create_features(merged_df_2)

# Dictionary to store weighted averages
weighted_averages = {}
# List of columns to calculate weighted averages for
columns = [
    'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds', 'fmb', 'time_of_possession',
    'punts_yds', 'penalty_yds', 'def_passing_yds',
    'def_passing_int', 'def_passing_times_sacked',
    'def_rushing_yds', 'def_fmb', 'def_time_of_possession', 'def_passing_times_sacked',
    'pass_play_percentage', 'def_pass_play_percentage', 'drives',
    'def_drives', 'tds_per_yard', 'def_tds_per_yard',
    'clutch_conversion_percentage', 'def_clutch_conversion_percentage',
    'fg_percentage'
]

# Calculate weighted averages and store in the dictionary
for col in columns:
    weighted_averages[col] = weighted_avg(merged_df_2, col, *weights)

# Convert dictionary to a DataFrame (single-row)
weighted_avg_df = pd.DataFrame([weighted_averages])

# add home_game the only varible that doesnt need to be weighted
weighted_avg_df['home_game'] = home_game

# kept this for debugging
weighted_avg_df['points'] = points
weighted_avg_df['date'] = date1

# Concatenate the new row to chalk_22_model
chalk_22_model_2 = pd.concat([chalk_22_model_2, weighted_avg_df], ignore_index=True)

In [658]:
chalk_22_model_2

Unnamed: 0,passing_yds,passing_int,passing_times_sacked,rushing_yds,fmb,time_of_possession,punts_yds,penalty_yds,def_passing_yds,def_passing_int,...,drives,def_drives,tds_per_yard,def_tds_per_yard,clutch_conversion_percentage,def_clutch_conversion_percentage,fg_percentage,points,date,home_game
0,249.1,0.553,1.914,116.221,0.404,28.757,202.043,47.619,260.781,0.577,...,9.022,8.33,73.843,86.906,37.291,52.865,65.466,28,2022-10-08,1
1,229.677,0.827,1.949,117.984,0.417,28.431,182.958,46.545,259.978,0.972,...,8.181,8.271,74.509,65.587,39.8,59.563,62.555,45,2022-10-08,0


#### 3) Train the model to predict points based on the matchup data. The specific teams and dates are not important here. 

In [643]:
def preprocessing(df, target):

    # outliers
    for feature in df.select_dtypes(include=[np.number]).columns:
        df = handle_outliers(df, feature)
    print(target)
    # partitioning
    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_remaining, y_train, y_remaining = train_test_split(X, y, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_remaining, y_remaining, test_size=0.5, random_state=42)

    # scaling
    standard_scaler = StandardScaler()
    X_train = pd.DataFrame(standard_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_val   = pd.DataFrame(standard_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)
    X_test  = pd.DataFrame(standard_scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
    scaler_filename = 'chalk_22_scaler.pkl'
    joblib.dump(standard_scaler, scaler_filename)


    # feature importance
    #model = RandomForestRegressor(random_state=42)
    #clf = model.fit(X_train, y_train)
    #feature_importance(clf, X_train)

    # feature independece
    numerical_cols = X_test.select_dtypes(include=[float, int])
    results_list = []
    # iterate over all combinations of numerical columns
    for i, col1 in enumerate(numerical_cols.columns):
        for col2 in numerical_cols.columns[i+1:]:
            x = numerical_cols[col1]
            y = numerical_cols[col2]
            # calculate Pearson's correlation coefficient and p-value
            corr_coefficient, p_value = pearsonr(x, y)
            # append the results to the list
            results_list.append({'Variable1': col1, 'Variable2': col2, 'Correlation Coefficient': corr_coefficient, 'P-Value': p_value})
 
    # convert the list to a DataFrame
    results_df = pd.DataFrame(results_list)
    results_df = results_df.sort_values(by='Correlation Coefficient', ascending=False)

    return X_train, X_test, y_train, y_test, X_val, y_val, results_df

In [644]:
chalk_22_model = pd.read_csv('aggregated_stats_modeling.csv')

In [645]:
df = chalk_22_model
target = 'points'
X_train, X_test, y_train, y_test, X_val, y_val, results_df = preprocessing(df, target)

points


In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import matplotlib.pyplot as plt
import joblib

# Fit the model
model = LinearRegression()
model.fit(X_train, y_train)
joblib.dump(model, 'lr_model.joblib')
predictions = model.predict(X_val)
predictions

In [647]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import matplotlib.pyplot as plt

desired_confidence = 0.51

# Fit the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the validation set
X_val_sm = sm.add_constant(X_val)  # Add a constant term for statsmodels
predictions = model.predict(X_val)

# Calculate prediction intervals using statsmodels
X_train_sm = sm.add_constant(X_train)
model_sm = sm.OLS(y_train, X_train_sm).fit()
predicted = model_sm.get_prediction(X_val_sm)
prediction_summary = predicted.summary_frame(alpha=(1 - desired_confidence))

In [648]:
# Calculate RMSE on the y_test set
y_test_predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_test_predictions))
# Calculate the average difference between high and low confidence intervals
confidence_interval_diff = prediction_summary['obs_ci_upper'] - prediction_summary['obs_ci_lower']
average_confidence_interval_diff = confidence_interval_diff.mean()

In [650]:
y_test_predictions

array([24.34774612, 27.86278851, 22.02875411, ..., 24.91093295,
       17.73398738, 16.65753104])

#### 4) To use the model with just two inputs (team and date), aggregate previous game data in the same format as used to train the model. 

In [None]:
team1 = 'MIN'
game_date1 = '10-1-2022'

team2 = 'DET'
game_date2 = '10-1-2022'

#### 5) Test the model. To predict a game run the model twice, first for team1 points and then second for team2 points. The date can be idfferent for each team.

#### The ouput of this step should be a dataframe with 100 rows. each row represents one simulated game. 

#### 6) Java script that sends inputs to lamda and processes the dataframe from step 5. 