# Modeling

## Dependencies

In [1]:
from packages.helpers.helpers import joel_boto
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

In [146]:
# Connect to custom AWS class (project helper; used below as `jb` for Athena and DynamoDB calls)
jb = joel_boto()

# Locations of the persisted artifacts, relative to the notebook's working directory.
scaler_path = "../data/chalk_22_scaler.pkl"
model_path = "../data/lr_model.joblib"
model_path2 = "../data/lr_model_2.joblib"
scaler_path2 = "../data/chalk_22_scaler_2.pkl"

# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load artifacts from a trusted source.
# Pairing used throughout the notebook: model_path/scaler_path -> "1995" model,
# model_path2/scaler_path2 -> "1994" model.
model_1995 = joblib.load(model_path)
model_1994 = joblib.load(model_path2)
scaler_for_model_1995 = joblib.load(scaler_path)
scaler_for_model_1994 = joblib.load(scaler_path2)

running local credentials
✅ Logged in to ECR successfully.
✅ Connected to all clients successfully.


## Functions

In [3]:
def grab_data(team, off_or_def, season, games_back = 34):
    """Fetch a team's most recent per-game stat rows from Athena.

    Relies on the module-level ``jb`` client and on ``athena_database``,
    ``dynamodb_table`` and ``athena_output_location`` being defined before the call.

    Parameters
    ----------
    team : str
        Team abbreviation. Matched against the ``team`` column when
        ``off_or_def`` is truthy, otherwise against the ``opponent`` column.
    off_or_def : bool
        Truthy -> the team's own (offensive) rows. Falsy -> rows of games played
        against the team, returned with ``def_``-prefixed column names.
    season : int
        Latest season to include (the filter is ``season <= season``).
    games_back : int, default 34
        Maximum number of rows, newest first (season DESC, week DESC).

    Returns
    -------
    pandas.DataFrame
        Whatever ``jb.create_df_from_athena_query`` returns for the query. For the
        defensive variant ``punts_yds``, ``penalty_yds`` and ``fgm`` are dropped and
        every remaining stat column is renamed to ``def_<column>``.

    Raises
    ------
    ValueError / TypeError
        If ``season`` or ``games_back`` cannot be converted to ``int``.
    """
    # The query is assembled by string interpolation, so neutralise the inputs first:
    # double any single quote in the team literal and force the numeric parts to int.
    safe_team = str(team).replace("'", "''")
    season = int(season)
    games_back = int(games_back)

    if off_or_def:
        where_clause = f"WHERE team = '{safe_team}'"
    else:
        where_clause = f"WHERE opponent = '{safe_team}'"

    query = f'''
        SELECT passing_yds, passing_tds, passing_int, 
            passing_times_sacked, rushing_yds, 
            rush_tds, fmb, "3d_att", "3d_conversions", "4d_att", "4d_conversions", time_of_possession, 
            fga, punts_yds, punts_total, "2pm", "2pa",
            penalty_yds, fgm, passing_att, rushing_att
        FROM "{athena_database}"."{dynamodb_table}"
        {where_clause}
            AND season <= {season}
        ORDER BY season DESC, week DESC
        LIMIT {games_back};
    '''
    query_execution_id = jb.query_athena(query, athena_database, athena_output_location)
    df = jb.create_df_from_athena_query(query_execution_id)

    if not off_or_def:
        # These three stats are only used from the offensive side.
        df = df.drop(columns=['punts_yds', 'penalty_yds', 'fgm'])

        # Identifier columns keep their names if they are ever present in the result;
        # every stat column gets the def_ prefix so it can sit next to the offensive frame.
        columns_to_exclude = ['week', 'season', 'team', 'opponent']
        df = df.rename(columns={
            col: f'def_{col}' if col not in columns_to_exclude else col
            for col in df.columns
        })

    return df

In [140]:
def create_features_for_model_1995(df):
    """Derive the engineered features used by the 1995+ model.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-game rows holding both the offensive columns and their ``def_``
        counterparts (attempts, touchdowns, yards, field goals, punts,
        3rd/4th-down and 2-point attempts and conversions).

    Returns
    -------
    pandas.DataFrame
        A new frame with the raw count columns replaced by
        ``pass_play_percentage``, ``drives``, ``tds_per_yard``, ``fg_percentage``,
        ``clutch_conversion_percentage`` and the matching ``def_`` features.
        The input frame is left untouched.

    Notes
    -----
    Apart from ``fg_percentage`` the ratios are not guarded against a zero
    denominator; such rows yield NaN/inf as produced by pandas division.
    """
    # Work on a copy: assigning through an alias of ``df`` would add the derived
    # columns to the caller's frame as a side effect.
    features = df.copy()

    ##### pass_play_percentage
    features['pass_play_percentage'] = 100*(features['passing_att'] / (features['passing_att'] + features['rushing_att']))
    features['def_pass_play_percentage'] = 100*(features['def_passing_att'] / (features['def_passing_att'] + features['def_rushing_att']))

    features = features.drop(columns=['passing_att', 'rushing_att', 'def_passing_att', 'def_rushing_att'])

    ##### drives (approximated as touchdowns + field-goal attempts + punts)
    # Offensive drives
    features['drives'] = features['passing_tds'] + features['rush_tds'] + features['fga'] + features['punts_total']

    # Defensive drives
    features['def_drives'] = features['def_passing_tds'] + features['def_rush_tds'] + features['def_fga'] + features['def_punts_total']

    # 'fga' is still needed for fg_percentage below, so it is not dropped yet
    features = features.drop(columns=['punts_total', 'def_punts_total', 'def_fga'])

    ##### tds per 10000 yards
    # Offensive touchdowns per yard
    features['tds_per_yard'] = 10000 * ((features['passing_tds'] + features['rush_tds']) / \
                                (features['passing_yds'] + features['rushing_yds']))

    # Defensive touchdowns per yard
    features['def_tds_per_yard'] = 10000 * ((features['def_passing_tds'] + features['def_rush_tds']) / \
                                    (features['def_passing_yds'] + features['def_rushing_yds']))

    features = features.drop(columns=['passing_tds', 'rush_tds', 'def_passing_tds', 'def_rush_tds'])

    ##### fg_percentage (0 when no field goal was attempted)
    features['fg_percentage'] = 100*(np.where(features['fga'] == 0, 0, features['fgm'] / features['fga']))

    # Now drop 'fgm' and 'fga'
    features = features.drop(columns=['fgm', 'fga'])

    ##### clutch metric
    # Offensive clutch conversion percentage: share of 3rd-down, 4th-down and 2-point tries converted
    features['clutch_conversion_percentage'] = 100*((features['3d_conversions'] + features['4d_conversions'] + features['2pm']) / \
                                                (features['3d_att'] + features['4d_att'] + features['2pa']))

    # Defensive clutch conversion percentage: share of the opponent's tries that were stopped (1 - conversion rate)
    features['def_clutch_conversion_percentage'] = 100*(1 - ((features['def_3d_conversions'] + features['def_4d_conversions'] + features['def_2pm']) / \
                                                    (features['def_3d_att'] + features['def_4d_att'] + features['def_2pa'])))

    # Drop the original columns
    features = features.drop(columns=[
        '3d_att', '4d_att', '2pa', '3d_conversions', '4d_conversions', '2pm',
        'def_3d_att', 'def_4d_att', 'def_2pa', 'def_3d_conversions', 'def_4d_conversions', 'def_2pm'
    ])

    return features

In [141]:
def create_features_for_model_1994(df):
    """Derive the engineered features used by the pre-1995 model.

    Same derivations as the 1995+ feature builder except that no clutch
    (3rd/4th-down, 2-point) metric is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-game rows holding the offensive columns and their ``def_``
        counterparts (attempts, touchdowns, yards, field goals, punts).

    Returns
    -------
    pandas.DataFrame
        A new frame with the raw count columns replaced by
        ``pass_play_percentage``, ``drives``, ``tds_per_yard``, ``fg_percentage``
        and the matching ``def_`` features. The input frame is left untouched.

    Notes
    -----
    Apart from ``fg_percentage`` the ratios are not guarded against a zero
    denominator; such rows yield NaN/inf as produced by pandas division.
    """
    # Work on a copy: assigning through an alias of ``df`` would add the derived
    # columns to the caller's frame as a side effect.
    features = df.copy()

    ##### pass_play_percentage
    features['pass_play_percentage'] = 100*(features['passing_att'] / (features['passing_att'] + features['rushing_att']))
    features['def_pass_play_percentage'] = 100*(features['def_passing_att'] / (features['def_passing_att'] + features['def_rushing_att']))

    features = features.drop(columns=['passing_att', 'rushing_att', 'def_passing_att', 'def_rushing_att'])

    ##### drives (approximated as touchdowns + field-goal attempts + punts)
    # Offensive drives
    features['drives'] = features['passing_tds'] + features['rush_tds'] + features['fga'] + features['punts_total']

    # Defensive drives
    features['def_drives'] = features['def_passing_tds'] + features['def_rush_tds'] + features['def_fga'] + features['def_punts_total']

    # 'fga' is still needed for fg_percentage below, so it is not dropped yet
    features = features.drop(columns=['punts_total', 'def_punts_total', 'def_fga'])

    ##### tds per 10000 yards
    # Offensive touchdowns per yard
    features['tds_per_yard'] = 10000 * ((features['passing_tds'] + features['rush_tds']) / \
                                (features['passing_yds'] + features['rushing_yds']))

    # Defensive touchdowns per yard
    features['def_tds_per_yard'] = 10000 * ((features['def_passing_tds'] + features['def_rush_tds']) / \
                                    (features['def_passing_yds'] + features['def_rushing_yds']))

    features = features.drop(columns=['passing_tds', 'rush_tds', 'def_passing_tds', 'def_rush_tds'])

    ##### fg_percentage (0 when no field goal was attempted)
    features['fg_percentage'] = 100*(np.where(features['fga'] == 0, 0, features['fgm'] / features['fga']))

    # Now drop 'fgm' and 'fga'
    features = features.drop(columns=['fgm', 'fga'])

    return features

In [142]:
def weighted_avg(df, col, gb1, gb2, gb3, gb4, weight1, weight2, weight3, weight4, inte = None):
    """Weighted average of ``df[col]`` over four consecutive row windows.

    The rows are split positionally into windows of ``gb1``, ``gb2``, ``gb3``
    and ``gb4`` rows ("gb" = games back), starting at the first row. Each
    window's mean is multiplied by its weight, the four products are added,
    and the total is divided by the sum of the weights.

    Returns the result rounded to 3 decimals, or truncated to ``int`` when
    ``inte == 1``.
    """
    window_sizes = (gb1, gb2, gb3, gb4)
    window_weights = (weight1, weight2, weight3, weight4)
    series = df[col]

    weighted_total = 0
    start = 0
    for size, weight in zip(window_sizes, window_weights):
        stop = start + size
        # mean of this window's rows, scaled by the window weight
        weighted_total = weighted_total + series.iloc[start:stop].mean() * weight
        start = stop

    result = round(weighted_total / sum(window_weights), 3)

    return int(result) if inte == 1 else result

In [143]:
def weighting_for_model_1995(merged_df_2, weights):
    """Collapse per-game feature rows into one row of weighted averages (1995+ feature set).

    ``weights`` is unpacked positionally into ``weighted_avg`` after the frame
    and column name, so it must supply the four window sizes followed by the
    four window weights.

    Returns a single-row DataFrame with one column per feature.
    """
    # 'def_passing_times_sacked' is listed twice; the second entry just recomputes
    # and overwrites the same key, so the result has one column for it.
    feature_columns = [
        'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds', 'fmb', 'time_of_possession',
        'punts_yds', 'penalty_yds', 'def_passing_yds',
        'def_passing_int', 'def_passing_times_sacked',
        'def_rushing_yds', 'def_fmb', 'def_time_of_possession', 'def_passing_times_sacked',
        'pass_play_percentage', 'def_pass_play_percentage', 'drives',
        'def_drives', 'tds_per_yard', 'def_tds_per_yard',
        'clutch_conversion_percentage', 'def_clutch_conversion_percentage',
        'fg_percentage'
    ]

    averages_by_feature = {
        name: weighted_avg(merged_df_2, name, *weights)
        for name in feature_columns
    }

    # A list holding one dict becomes a one-row frame
    return pd.DataFrame([averages_by_feature])

In [144]:
def weighting_for_model_1994(merged_df_2, weights):
    """Collapse per-game feature rows into one row of weighted averages (pre-1995 feature set).

    Same as the 1995+ variant but without the time-of-possession and clutch
    columns. ``weights`` is unpacked positionally into ``weighted_avg`` after
    the frame and column name (four window sizes, then four window weights).

    Returns a single-row DataFrame with one column per feature.
    """
    # 'def_passing_times_sacked' is listed twice; the second entry just recomputes
    # and overwrites the same key, so the result has one column for it.
    feature_columns = [
        'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds', 'fmb',
        'punts_yds', 'penalty_yds', 'def_passing_yds',
        'def_passing_int', 'def_passing_times_sacked',
        'def_rushing_yds', 'def_fmb', 'def_passing_times_sacked',
        'pass_play_percentage', 'def_pass_play_percentage', 'drives',
        'def_drives', 'tds_per_yard', 'def_tds_per_yard',
        'fg_percentage'
    ]

    averages_by_feature = {
        name: weighted_avg(merged_df_2, name, *weights)
        for name in feature_columns
    }

    # A list holding one dict becomes a one-row frame
    return pd.DataFrame([averages_by_feature])

In [67]:
def create_df_for_dynamo_table(after_1994, before_1995):
    """Stack the two cached feature frames into the layout stored in DynamoDB.

    Parameters
    ----------
    after_1994 : pandas.DataFrame
        Rows for the 1995+ feature set; must contain ``team`` and ``year``.
    before_1995 : pandas.DataFrame
        Rows for the pre-1995 feature set; must contain ``team`` and ``year``.
        Columns it lacks (time of possession, clutch metrics) become NaN.

    Returns
    -------
    pandas.DataFrame
        ``before_1995`` rows followed by ``after_1994`` rows, with ``team`` and
        ``year`` merged into a ``team_year`` key (e.g. ``"BUF1990"``) and the
        columns in the fixed order expected by the table.

    Raises
    ------
    KeyError
        If ``team``/``year`` or any column of the fixed order is absent from
        both inputs.
    """
    # Combine them, keeping all columns and rows
    combined = pd.concat([before_1995, after_1994], ignore_index=True, sort=False)

    # 'Unnamed: 0' only exists when the frames were round-tripped through a CSV that
    # stored the index; tolerate its absence instead of failing with KeyError.
    combined = combined.drop(columns=['Unnamed: 0'], errors='ignore')

    # Create the query_key for dynamoDB
    combined['team_year'] = combined['team'].astype(str) + combined['year'].astype(str)
    combined = combined.drop(columns=['team', 'year'])

    desired_order = ['team_year',
        'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds',
        'fmb', 'time_of_possession', 'punts_yds', 'penalty_yds',
        'def_passing_yds', 'def_passing_int', 'def_passing_times_sacked',
        'def_rushing_yds', 'def_fmb', 'def_time_of_possession',
        'pass_play_percentage', 'def_pass_play_percentage', 'drives',
        'def_drives', 'tds_per_yard', 'def_tds_per_yard',
        'clutch_conversion_percentage', 'def_clutch_conversion_percentage',
        'fg_percentage', 'home_game'
    ]

    return combined[desired_order]

In [113]:
def model_output(team, opponent, points_team1, points_team2):
    """Simulate 100 games around two predicted scores and report team 1's win share.

    Each simulated score is a draw from a normal distribution centred on the
    predicted points (standard deviation 8), rounded and floored at 0. A tie
    is not counted as a win for ``team``.

    ``points_team1`` / ``points_team2`` must expose ``.item()`` (e.g. a
    one-element NumPy array).

    Returns ``(team1_win_pct, df)`` where ``df`` has one row per simulated
    game and one column per team.

    Note: a later cell of this notebook defines ``model_output`` again; once
    that cell has run, it replaces this version.
    """
    sd = 8
    limit = 100

    mean_team1 = points_team1.item()
    mean_team2 = points_team2.item()

    # Empty frame; one row is appended per simulated game
    df = pd.DataFrame(columns=[f"{team}", f"{opponent}"])

    team1_wins = 0
    for _ in range(limit):
        # Draw team 1 first, then team 2, so the random stream is consumed in a fixed order
        team1_score = max(0, round(np.random.normal(loc=mean_team1, scale=sd), 0))
        team2_score = max(0, round(np.random.normal(loc=mean_team2, scale=sd), 0))

        df.loc[len(df)] = [team1_score, team2_score]

        if team1_score > team2_score:
            team1_wins += 1

    return team1_wins / limit, df

In [159]:
def get_predictions_for_model_1995(team, opp, team_year, opp_year):
    """Predict with ``model_1995`` from the cached DynamoDB stats of a matchup.

    The offensive features (and ``home_game``) are read from the item keyed
    ``team + str(team_year)``; the ``def_`` features are read from the item
    keyed ``opp + str(opp_year)``. Every attribute is expected to be a
    DynamoDB number (``{"N": ...}``); a missing item or attribute raises
    ``KeyError``.

    Uses the module-level ``jb``, ``table_name`` and ``model_1995``.

    Returns whatever ``model_1995.predict`` returns for the single assembled row.
    """
    # Column order the model input is assembled in
    feature_order = (
        "passing_yds", "passing_int", "passing_times_sacked", "rushing_yds",
        "fmb", "time_of_possession", "punts_yds", "penalty_yds",
        "def_passing_yds", "def_passing_int", "def_passing_times_sacked",
        "def_rushing_yds", "def_fmb", "def_time_of_possession",
        "pass_play_percentage", "def_pass_play_percentage", "drives",
        "def_drives", "tds_per_yard", "def_tds_per_yard",
        "clutch_conversion_percentage", "def_clutch_conversion_percentage",
        "fg_percentage", "home_game",
    )

    team_key = team + str(team_year)
    opp_key = opp + str(opp_year)
    values = {}

    # Team item: everything that is not a def_ feature
    team_item = jb.dynamodb.get_item(
        TableName=table_name,
        Key={"team_year": {"S": team_key}}
    ).get("Item", {})
    for name in feature_order:
        if not name.startswith("def_"):
            values[name] = float(team_item[name]["N"])

    # Opponent item: the def_ features
    opp_item = jb.dynamodb.get_item(
        TableName=table_name,
        Key={"team_year": {"S": opp_key}}
    ).get("Item", {})
    for name in feature_order:
        if name.startswith("def_"):
            values[name] = float(opp_item[name]["N"])

    # One row, features in model order
    model_input = np.array([values[name] for name in feature_order], dtype=float).reshape(1, -1)

    return model_1995.predict(model_input)

In [158]:
def get_predictions_for_model_1994(team, opp, team_year, opp_year):
    """Predict with ``model_1994`` from the cached DynamoDB stats of a matchup.

    The offensive features (and ``home_game``) are read from the item keyed
    ``team + str(team_year)``; the ``def_`` features are read from the item
    keyed ``opp + str(opp_year)``. Every attribute read is expected to be a
    DynamoDB number (``{"N": ...}``); a missing item or attribute raises
    ``KeyError``.

    Uses the module-level ``jb``, ``table_name`` and ``model_1994``.

    Returns whatever ``model_1994.predict`` returns for the single assembled row.
    """
    # Column order the model input is assembled in (no time-of-possession / clutch features)
    feature_order = (
        "passing_yds", "passing_int", "passing_times_sacked", "rushing_yds",
        "fmb", "punts_yds", "penalty_yds",
        "def_passing_yds", "def_passing_int", "def_passing_times_sacked",
        "def_rushing_yds", "def_fmb",
        "pass_play_percentage", "def_pass_play_percentage", "drives",
        "def_drives", "tds_per_yard", "def_tds_per_yard",
        "fg_percentage", "home_game",
    )

    team_key = team + str(team_year)
    opp_key = opp + str(opp_year)
    values = {}

    # Team item: everything that is not a def_ feature
    team_item = jb.dynamodb.get_item(
        TableName=table_name,
        Key={"team_year": {"S": team_key}}
    ).get("Item", {})
    for name in feature_order:
        if not name.startswith("def_"):
            values[name] = float(team_item[name]["N"])

    # Opponent item: the def_ features
    opp_item = jb.dynamodb.get_item(
        TableName=table_name,
        Key={"team_year": {"S": opp_key}}
    ).get("Item", {})
    for name in feature_order:
        if name.startswith("def_"):
            values[name] = float(opp_item[name]["N"])

    # One row, features in model order
    model_input = np.array([values[name] for name in feature_order], dtype=float).reshape(1, -1)

    return model_1994.predict(model_input)

## Custom Variables

In [82]:
# Athena source: game rows are queried from "<athena_database>"."<dynamodb_table>",
# with query results written to athena_output_location.
athena_database = "nfl"
dynamodb_table = "nfl_games_all"
athena_output_location = "s3://chalkjuice-backend/nfl_games_all_athena_parquet/"
# Unpacked positionally into weighted_avg(): the first four values are the window
# sizes in games (gb1..gb4), the last four the weight of each window (weight1..weight4).
weights = [5,7,9,13,.3,.25,.25,.2]

# DynamoDB table holding the cached model inputs, keyed by the string attribute 'team_year'.
DYNAMODB_TABLE = "nfl_matchups_model_stats"
table_name = DYNAMODB_TABLE
partition_key = 'team_year'
attribute_type = 'S'

# Modeling

## Create df for dynamoDB

### Create oldest usable year dict

In [8]:
# AWS Credentials & Region
# Athena database, table and query-output location
# (same values as assigned in the "Custom Variables" cell; no credentials are set here)
athena_database = "nfl"
dynamodb_table = "nfl_games_all"
athena_output_location = "s3://chalkjuice-backend/nfl_games_all_athena_parquet/"

In [9]:
def oldest_usable_season(team, games_back = 34):
    """Return the first season for which ``team`` has enough game history.

    Orders the team's games by parsed date (oldest first), takes the season of
    game number ``games_back`` and adds one. Uses the module-level ``jb``,
    ``athena_database``, ``dynamodb_table`` and ``athena_output_location``.

    Raises (from the row lookup) when the team has fewer than ``games_back`` games,
    since the query then returns no row.
    """
    query = f"""
        WITH ordered_games AS (
            SELECT season,
                ROW_NUMBER() OVER (ORDER BY date_parse(date, '%c/%e/%Y') ASC) AS rn
            FROM "{athena_database}"."{dynamodb_table}"
            WHERE team = '{team}' 
        )
        SELECT season + 1 AS oldest_usable_season
        FROM ordered_games
        WHERE rn = {games_back}
    """
    execution_id = jb.query_athena(query, athena_database, athena_output_location)
    result = jb.create_df_from_athena_query(execution_id)

    # Single-row result: label 0 of the only column
    return int(result['oldest_usable_season'][0])

In [None]:
# Team abbreviations as stored in the games table.
teams = [
    "ARI", "ATL", "BAL", "BUF", "CAR", "CHI", "CIN", "CLE",
    "DAL", "DEN", "DET", "GNB", "HOU", "IND", "JAX", "KAN",
    "LAC", "LAR", "LVR", "MIA", "MIN", "NOR", "NWE", "NYG",
    "NYJ", "PHI", "PIT", "SEA", "SFO", "TAM", "TEN", "WAS"
]

# team -> oldest usable season as a string, or None when the lookup failed
team_season_dict = {}

for team in teams:
    try:
        season = oldest_usable_season(team)
        team_season_dict[team] = str(season)
    except Exception as e:
        # Best effort: report the failure and keep going with the remaining teams
        print(f"Error for team {team}: {e}")
        team_season_dict[team] = None  # or skip if preferred

In [None]:
# NOTE(review): rebinding to an empty dict discards any mapping built by the loop above.
team_season_dict = {}

In [10]:
# Hard-coded team -> first season to cache (as a string); the caching loops below
# convert the value with int() and start from that season.
# NOTE(review): presumably copied from the oldest_usable_season() results -- confirm.
oldest_year_mapping = {
  'ARI': '1970',
  'ATL': '1970',
  'BAL': '1970',
  'BUF': '1970',
  'CAR': '1998',
  'CHI': '1970',
  'CIN': '1971',
  'CLE': '2002',
  'DAL': '1970',
  'DEN': '1970',
  'DET': '1970',
  'GNB': '1970',
  'HOU': '2005',
  'IND': '1970',
  'JAX': '1998',
  'KAN': '1970',
  'LAC': '1970',
  'LAR': '1970',
  'LVR': '1970',
  'MIA': '1970',
  'MIN': '1970',
  'NOR': '1970',
  'NWE': '1970',
  'NYG': '1970',
  'NYJ': '1970',
  'PHI': '1970',
  'PIT': '1970',
  'SEA': '1979',
  'SFO': '1970',
  'TAM': '1979',
  'TEN': '1970',
  'WAS': '1970'}

### Create model input df for caching: seasons 1995 and later

In [None]:
def get_scaler_averages(team, year1):
    """Build the scaled 1995+ model-input row for one team-season.

    Pulls the team's recent offensive rows and the rows of games played
    against it (both up to and including season ``year1``), derives the 1995+
    features, reduces them to weighted averages, adds ``home_game = 1`` and
    scales the row with ``scaler_for_model_1995``.

    Uses the module-level ``weights`` and ``scaler_for_model_1995``.

    Returns
    -------
    numpy.ndarray
        1-D object array: ``[team, year1, <scaled features...>]``.
    """
    # Offense and defense are both taken from the same team and season
    off_df = grab_data(team, True, year1)
    def_df = grab_data(team, False, year1)

    # Align game i of the offensive frame with game i of the defensive frame
    merged_df = off_df.merge(def_df, left_index=True, right_index=True, how='inner')

    merged_df = merged_df.astype(int)

    merged_df_features = create_features_for_model_1995(merged_df)

    # Must be the 1995 weighting: the 1994 variant omits time_of_possession and the
    # clutch metrics, which the 1995 feature set (and its scaler) include.
    weighted_avg_df = weighting_for_model_1995(merged_df_features, weights)

    weighted_avg_df['home_game'] = 1

    scaled_inputs = scaler_for_model_1995.transform(weighted_avg_df)

    # np.insert without an axis flattens the (1, n) array, giving one flat row
    full_row = np.insert(scaled_inputs.astype(object), 0, [team, year1])

    return full_row

In [12]:
# Output schema for the 1995+ cache: two identifier columns followed by the 24 model features
columns = [ 'team', 'year',
    'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds',
    'fmb', 'time_of_possession', 'punts_yds', 'penalty_yds',
    'def_passing_yds', 'def_passing_int', 'def_passing_times_sacked',
    'def_rushing_yds', 'def_fmb', 'def_time_of_possession',
    'pass_play_percentage', 'def_pass_play_percentage', 'drives',
    'def_drives', 'tds_per_yard', 'def_tds_per_yard',
    'clutch_conversion_percentage', 'def_clutch_conversion_percentage',
    'fg_percentage', 'home_game'
]

# Create an empty DataFrame (no rows) with these columns; rows are appended later
scaled_inputs_df = pd.DataFrame(columns=columns)

In [13]:
# Team abbreviations to cache (same list as used for the oldest-season lookup)
teams = [
    "ARI", "ATL", "BAL", "BUF", "CAR", "CHI", "CIN", "CLE",
    "DAL", "DEN", "DET", "GNB", "HOU", "IND", "JAX", "KAN",
    "LAC", "LAR", "LVR", "MIA", "MIN", "NOR", "NWE", "NYG",
    "NYJ", "PHI", "PIT", "SEA", "SFO", "TAM", "TEN", "WAS"
]

In [14]:
# Build one scaled feature row per team-season for the 1995+ model.
for team in teams:
    first_year = int(oldest_year_mapping[team])
    # Clamp the start: this cache only covers seasons from 1995 onward
    if first_year < 1995:
        first_year = 1995

    # The upper bound is exclusive, so the last cached season is 2023
    for i in range(first_year, 2024):
        year1 = i

        scaled_inputs = get_scaler_averages(team, year1)
        # Append as the next row of the accumulator frame
        scaled_inputs_df.loc[len(scaled_inputs_df)] = scaled_inputs

        # Progress log: one line per team-season
        print(team, year1)


ARI 1995
ARI 1996
ARI 1997
ARI 1998
ARI 1999
ARI 2000
ARI 2001
ARI 2002
ARI 2003
ARI 2004
ARI 2005
ARI 2006
ARI 2007
ARI 2008
ARI 2009
ARI 2010
ARI 2011
ARI 2012
ARI 2013
ARI 2014
ARI 2015
ARI 2016
ARI 2017
ARI 2018
ARI 2019
ARI 2020
ARI 2021
ARI 2022
ARI 2023
ATL 1995
ATL 1996
ATL 1997
ATL 1998
ATL 1999
ATL 2000
ATL 2001
ATL 2002
ATL 2003
ATL 2004
ATL 2005
ATL 2006
ATL 2007
ATL 2008
ATL 2009
ATL 2010
ATL 2011
ATL 2012
ATL 2013
ATL 2014
ATL 2015
ATL 2016
ATL 2017
ATL 2018
ATL 2019
ATL 2020
ATL 2021
ATL 2022
ATL 2023
BAL 1995
BAL 1996
BAL 1997
BAL 1998
BAL 1999
BAL 2000
BAL 2001
BAL 2002
BAL 2003
BAL 2004
BAL 2005
BAL 2006
BAL 2007
BAL 2008
BAL 2009
BAL 2010
BAL 2011
BAL 2012
BAL 2013
BAL 2014
BAL 2015
BAL 2016
BAL 2017
BAL 2018
BAL 2019
BAL 2020
BAL 2021
BAL 2022
BAL 2023
BUF 1995
BUF 1996
BUF 1997
BUF 1998
BUF 1999
BUF 2000
BUF 2001
BUF 2002
BUF 2003
BUF 2004
BUF 2005
BUF 2006
BUF 2007
BUF 2008
BUF 2009
BUF 2010
BUF 2011
BUF 2012
BUF 2013
BUF 2014
BUF 2015
BUF 2016
BUF 2017
BUF 2018
B

In [15]:
# Persist the 1995+ rows. The index is written too (to_csv default), so reading the
# file back yields an extra 'Unnamed: 0' column.
scaled_inputs_df.to_csv('../data/bam.csv')

### Create model input df for caching < 1995

In [None]:
def get_scaler_averages_2(team, year1):
    """Build the scaled pre-1995 model-input row for one team-season.

    Pulls the team's recent offensive rows and the rows of games played
    against it (both up to and including season ``year1``), drops the
    time-of-possession, 2-point and 3rd/4th-down columns that the pre-1995
    feature set does not use, derives the features, reduces them to weighted
    averages, adds ``home_game = 1`` and scales the row.

    Uses the module-level ``weights`` and ``scaler_for_model_1994``.

    Returns
    -------
    tuple
        ``(full_row, weighted_avg_df)`` where ``full_row`` is a 1-D object array
        ``[team, year1, <scaled features...>]`` and ``weighted_avg_df`` is the
        unscaled single-row frame.
    """
    # Offense and defense are both taken from the same team and season
    off_df = grab_data(team, True, year1)
    def_df = grab_data(team, False, year1)

    # Align game i of the offensive frame with game i of the defensive frame
    merged_df = off_df.merge(def_df, left_index=True, right_index=True, how='inner')

    merged_df = merged_df.drop(columns=['time_of_possession', 'def_time_of_possession', '2pm', '2pa', 'def_2pm', 'def_2pa',
        '3d_att', 'def_3d_att', '3d_conversions', 'def_3d_conversions', '4d_att', 'def_4d_att', '4d_conversions', 
        'def_4d_conversions'])

    merged_df = merged_df.astype(int)

    merged_df_features = create_features_for_model_1994(merged_df)

    weighted_avg_df = weighting_for_model_1994(merged_df_features, weights)

    weighted_avg_df['home_game'] = 1

    # Scale with the scaler that belongs to the pre-1995 model; the 1995 scaler is
    # paired with the larger 1995 feature set.
    scaled_inputs = scaler_for_model_1994.transform(weighted_avg_df)

    # np.insert without an axis flattens the (1, n) array, giving one flat row
    full_row = np.insert(scaled_inputs.astype(object), 0, [team, year1])

    return full_row, weighted_avg_df

In [45]:
# Output schema for the pre-1995 cache: two identifier columns followed by the 20 model features
# (no time-of-possession or clutch columns). Note this rebinds the notebook-level `columns`.
columns = [ 'team', 'year',
    'passing_yds', 'passing_int', 'passing_times_sacked', 'rushing_yds', 'fmb',
    'punts_yds', 'penalty_yds', 'def_passing_yds',
    'def_passing_int', 'def_passing_times_sacked',
    'def_rushing_yds', 'def_fmb',
    'pass_play_percentage', 'def_pass_play_percentage', 'drives',
    'def_drives', 'tds_per_yard', 'def_tds_per_yard',
    'fg_percentage', 'home_game'
]


# Create an empty DataFrame (no rows) with these columns; rows are appended later
scaled_inputs_df_2 = pd.DataFrame(columns=columns)

In [46]:
# Build one scaled feature row per team-season for the pre-1995 model.
for team in teams:
    first_year = int(oldest_year_mapping[team])
    # The upper bound is exclusive: seasons first_year..1994. Teams whose first
    # usable season is 1995 or later contribute no rows here.
    for i in range(first_year, 1995):
        year1 = i

        # The unscaled averages are returned as well but only the scaled row is stored
        scaled_inputs, weighted_avg_df = get_scaler_averages_2(team, year1)
        scaled_inputs_df_2.loc[len(scaled_inputs_df_2)] = scaled_inputs

        # Progress log: one line per team-season
        print(team, year1)

ARI 1970
ARI 1971
ARI 1972
ARI 1973
ARI 1974
ARI 1975
ARI 1976
ARI 1977
ARI 1978
ARI 1979
ARI 1980
ARI 1981
ARI 1982
ARI 1983
ARI 1984
ARI 1985
ARI 1986
ARI 1987
ARI 1988
ARI 1989
ARI 1990
ARI 1991
ARI 1992
ARI 1993
ARI 1994
ATL 1970
ATL 1971
ATL 1972
ATL 1973
ATL 1974
ATL 1975
ATL 1976
ATL 1977
ATL 1978
ATL 1979
ATL 1980
ATL 1981
ATL 1982
ATL 1983
ATL 1984
ATL 1985
ATL 1986
ATL 1987
ATL 1988
ATL 1989
ATL 1990
ATL 1991
ATL 1992
ATL 1993
ATL 1994
BAL 1970
BAL 1971
BAL 1972
BAL 1973
BAL 1974
BAL 1975
BAL 1976
BAL 1977
BAL 1978
BAL 1979
BAL 1980
BAL 1981
BAL 1982
BAL 1983
BAL 1984
BAL 1985
BAL 1986
BAL 1987
BAL 1988
BAL 1989
BAL 1990
BAL 1991
BAL 1992
BAL 1993
BAL 1994
BUF 1970
BUF 1971
BUF 1972
BUF 1973
BUF 1974
BUF 1975
BUF 1976
BUF 1977
BUF 1978
BUF 1979
BUF 1980
BUF 1981
BUF 1982
BUF 1983
BUF 1984
BUF 1985
BUF 1986
BUF 1987
BUF 1988
BUF 1989
BUF 1990
BUF 1991
BUF 1992
BUF 1993
BUF 1994
CHI 1970
CHI 1971
CHI 1972
CHI 1973
CHI 1974
CHI 1975
CHI 1976
CHI 1977
CHI 1978
CHI 1979
CHI 1980
C

In [None]:
# Persist the pre-1995 rows. The index is written too (to_csv default), so reading the
# file back yields an extra 'Unnamed: 0' column.
scaled_inputs_df_2.to_csv('../data/bam_2.csv')

### Create the df to add to DynamoDB from the two smaller aggregate dfs

In [None]:
# Reload the two cached frames from disk (bam.csv = 1995+, bam_2.csv = pre-1995)
after_1994 = pd.read_csv('../data/bam.csv')
before_1995 = pd.read_csv('../data/bam_2.csv')

# Stack them and derive the 'team_year' key used as the DynamoDB partition key
scaled_inputs_df_all = create_df_for_dynamo_table(after_1994, before_1995)

# index=False: keep the combined file free of an extra index column
scaled_inputs_df_all.to_csv('../data/scaled_inputs_df_all.csv', index=False)

## Add csv to dynamoDB

### create the dynamoDB table

In [76]:
# Create the DynamoDB table only when it does not exist yet
x = jb.check_dynamo_table_exists(table_name)
if x:
    pass
else:
    # partition_key / attribute_type come from the "Custom Variables" cell ('team_year', 'S')
    jb.create_dynamodb_table(table_name, partition_key, attribute_type)
    print('table created')

❌ Table 'nfl_matchups_model_stats' does not exist.
table created


#### Add data to table

In [78]:
# Upload every cached row as one DynamoDB item in attribute-value format.
for _, row in scaled_inputs_df_all.iterrows():
    # The partition key is stored as a string ("S") attribute
    item = {"team_year": {"S": str(row["team_year"])}}
    for col in scaled_inputs_df_all.columns:
        if col != "team_year":
            value = row[col]
            if pd.isna(value):
                # Missing value (pre-1995 rows lack some columns) -> explicit NULL attribute
                item[col] = {"NULL": True}
            else:
                # Number ("N") attributes carry their value as a string
                item[col] = {"N": str(value)}

                
    jb.add_data_to_dynamo_table(table_name, item)


## For a game, grab the pregenerated summary stats, then return the scaled array for modeling

#### `get_model_predictions` combines the cached stats into an array that mimics the output of the earlier sklearn scaler

In [227]:
def model_output(team, opponent, points_team1, points_team2, sd=8, limit=100):
    """Simulate games around two predicted scores and report team 1's win share.

    Each simulated score is a draw from a normal distribution centred on the
    predicted points, rounded to a whole number and floored at 0. Draws use
    NumPy's global random state (team 1 first, then team 2, per game), so
    ``np.random.seed`` makes the result reproducible.

    Parameters
    ----------
    team, opponent : str
        Names used for the column headers and the ``Winner`` column.
    points_team1, points_team2 : scalar or one-element array
        Predicted points for each side (e.g. the output of ``model.predict``).
    sd : float, default 8
        Standard deviation of the simulated scores.
    limit : int, default 100
        Number of games to simulate; must be positive.

    Returns
    -------
    (float, pandas.DataFrame)
        ``team``'s share of simulated wins, and a frame with the columns
        ``Simulated Game #`` (1-based), ``"<team> Points"``,
        ``"<opponent> Points"`` (integers) and ``Winner``.

    Notes
    -----
    A tie is credited to ``opponent``.

    Raises
    ------
    ValueError
        If ``limit`` is not positive, or a points argument holds more than one value.
    """
    if limit <= 0:
        raise ValueError("limit must be a positive number of simulated games")

    # Accept plain scalars as well as the one-element arrays returned by predict()
    mean_team1 = np.asarray(points_team1).item()
    mean_team2 = np.asarray(points_team2).item()

    rows = []
    team1_wins = 0
    for game_number in range(1, limit + 1):
        # Add some variance to the scores; clamp at 0 because a score cannot be negative
        team1_score = int(max(0, round(np.random.normal(loc=mean_team1, scale=sd))))
        team2_score = int(max(0, round(np.random.normal(loc=mean_team2, scale=sd))))

        if team1_score > team2_score:
            winner = team
            team1_wins += 1
        else:
            winner = opponent

        rows.append((game_number, team1_score, team2_score, winner))

    # Build the frame once instead of growing it row by row; this also avoids
    # assigning into a column-sliced frame afterwards.
    df = pd.DataFrame(
        rows,
        columns=['Simulated Game #', f"{team} Points", f"{opponent} Points", "Winner"],
    )

    team1_win_pct = team1_wins / limit

    return team1_win_pct, df

def get_model_predictions(team, opp, team_year, opp_year, table_name, model, model_1995_true):
    """Predict from the cached DynamoDB stats of a matchup with the given model.

    The offensive features (and ``home_game``) are read from the item keyed
    ``team + str(team_year)``; the ``def_`` features are read from the item
    keyed ``opp + str(opp_year)``. Only the attributes of the selected feature
    set are read, and each must be a DynamoDB number (``{"N": ...}``).

    Parameters
    ----------
    team, opp : str
        Team abbreviations forming the first part of the ``team_year`` keys.
    team_year, opp_year : int or str
        Seasons forming the second part of the keys.
    table_name : str
        DynamoDB table holding the cached rows.
    model : object with ``predict``
        Model matching the selected feature set.
    model_1995_true : bool
        Truthy -> 24-feature layout (with time of possession and clutch
        metrics); falsy -> 20-feature layout without them.

    Returns
    -------
    Whatever ``model.predict`` returns for the single assembled row.

    Raises
    ------
    KeyError
        If an item does not exist, or a required attribute is missing or is
        not stored as a number.

    Uses the module-level ``jb`` client.
    """
    # Column order each model's input row is assembled in
    features_1995 = (
        "passing_yds", "passing_int", "passing_times_sacked", "rushing_yds",
        "fmb", "time_of_possession", "punts_yds", "penalty_yds",
        "def_passing_yds", "def_passing_int", "def_passing_times_sacked",
        "def_rushing_yds", "def_fmb", "def_time_of_possession",
        "pass_play_percentage", "def_pass_play_percentage", "drives",
        "def_drives", "tds_per_yard", "def_tds_per_yard",
        "clutch_conversion_percentage", "def_clutch_conversion_percentage",
        "fg_percentage", "home_game",
    )
    features_1994 = (
        "passing_yds", "passing_int", "passing_times_sacked", "rushing_yds",
        "fmb", "punts_yds", "penalty_yds",
        "def_passing_yds", "def_passing_int", "def_passing_times_sacked",
        "def_rushing_yds", "def_fmb",
        "pass_play_percentage", "def_pass_play_percentage", "drives",
        "def_drives", "tds_per_yard", "def_tds_per_yard",
        "fg_percentage", "home_game",
    )
    feature_order = features_1995 if model_1995_true else features_1994

    def _fetch_item(key_value):
        """Return the stored item for one team_year key, or raise a descriptive KeyError."""
        response = jb.dynamodb.get_item(
            TableName=table_name,
            Key={"team_year": {"S": key_value}}
        )
        item = response.get("Item", {})
        if not item:
            raise KeyError(f"no item with team_year={key_value!r} in table {table_name!r}")
        return item

    team_year1 = team + str(team_year)
    team_year2 = opp + str(opp_year)
    values = {}

    # Grab Team stats: everything that is not a def_ feature
    team_item = _fetch_item(team_year1)
    for name in feature_order:
        if not name.startswith("def_"):
            values[name] = float(team_item[name]["N"])

    # Grab Opp stats: the def_ features
    opp_item = _fetch_item(team_year2)
    for name in feature_order:
        if name.startswith("def_"):
            values[name] = float(opp_item[name]["N"])

    # Recreate the original scaler array: one row, features in model order
    model_input = np.array([values[name] for name in feature_order], dtype=float).reshape(1, -1)
    predictions = model.predict(model_input)

    return predictions

In [230]:
# Example matchup: each side is looked up in the cache by team + season
team = 'BUF'
opp = 'CAR'
team_season = 1970
opp_season = 2023

# For seasons 1994 and earlier I am missing the time-of-possession and clutch features for both off/def,
# so if either side is from 1994 or earlier the reduced (1994) model is used for the whole matchup
if team_season <= 1994 or opp_season <= 1994:
    model_1995_true = False
    model_choice = model_1994
else:
    model_1995_true = True
    model_choice = model_1995

# Predict each side's points: offense of the first team against the defense of the second
points_team1 = get_model_predictions(team, opp, team_season, opp_season, table_name, model_choice, model_1995_true)
points_team2 = get_model_predictions(opp, team, opp_season, team_season, table_name, model_choice, model_1995_true)

# Simulate games around the two predictions (random draws, so the share varies between runs unless seeded)
team1_win_pct, df = model_output(team, opp, points_team1, points_team2) 
team1_win_pct



0.27

In [231]:
# Display the simulated games returned by model_output
df

Unnamed: 0,Simulated Game #,BUF Points,CAR Points,Winner
0,1,23,17,BUF
1,2,24,23,BUF
2,3,25,28,CAR
3,4,10,22,CAR
4,5,16,23,CAR
...,...,...,...,...
95,96,9,25,CAR
96,97,16,16,CAR
97,98,16,16,CAR
98,99,28,20,BUF


In [218]:
# Display the loaded pre-1995 model object
model_1994