# Prep Barry

### import

In [6]:
import pandas as pd
import numpy as np
import pymysql as mysql
import os
from datetime import datetime
import warnings
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

# Suppress all warnings
warnings.filterwarnings("ignore")


## connect to MySQL

In [7]:
# One shared connection to the local `nfl` MySQL database, used by every
# query helper in this notebook. The password is read from the MYSQL
# environment variable rather than being written into the notebook.
cnx = mysql.connect(
    host='localhost',
    port=3306,
    user='root',
    passwd=os.getenv('MYSQL'),
    database='nfl',
)

# Module-level cursor on the shared connection.
cur = cnx.cursor()

def sql(query, params=None):
    """Execute ``query`` on the shared connection and print every result row.

    Args:
        query: SQL text to run.
        params: Optional values for the placeholders in ``query``. They are
            handed to the driver, which does the quoting. ``None`` (the
            default) runs the query text exactly as given.
    """
    # Use a fresh cursor per call; the context manager closes it even when
    # the query raises, so cursors do not pile up on the connection.
    with cnx.cursor() as cursor:
        # Forward `params` -- previously the argument was accepted but ignored.
        cursor.execute(query, params)
        for row in cursor:
            print(row)

def todf(query):
    """Run ``query`` on the shared connection and return the rows as a DataFrame."""
    return pd.read_sql(query, cnx)

## Create a data frame of the matchups to include (columns: game_date, game_team, game_opp)

In [9]:
def oldest_game_date(team, cutoff_date, games_back):
    """Return the earliest game date of ``team`` that has a full lookback.

    The team's games on or after ``cutoff_date`` are fetched newest-first.
    The result is the date of the oldest game that still has ``games_back``
    earlier games inside that window, formatted as 'YYYY-MM-DD'.

    Args:
        team: Value matched against the ``game_team`` column.
        cutoff_date: Lower bound for ``game_date``, as a 'YYYY-MM-DD' string.
        games_back: Number of earlier games that must exist before the
            returned date. ``0`` returns the team's oldest game in the window.

    Returns:
        The selected game date as a 'YYYY-MM-DD' string.

    Raises:
        IndexError: If the window holds ``games_back`` games or fewer (or
            ``games_back`` is negative), so no game qualifies.
    """
    query = f"""
        SELECT game_date
        FROM games
        WHERE game_team = '{team}'
            AND STR_TO_DATE(game_date, '%Y-%m-%d') >= '{cutoff_date}'
        ORDER BY game_date DESC
        """

    game_dates = todf(query)['game_date']

    # Rows are newest-first, so the last `games_back` rows are the games that
    # only serve as history. The row just before them is the answer. Indexing
    # by position (instead of slicing with `[:-games_back]`) keeps
    # `games_back == 0` working, because `[:-0]` is an empty slice.
    position = len(game_dates) - games_back - 1
    if not 0 <= position < len(game_dates):
        raise IndexError(
            f"team {team!r} has {len(game_dates)} games since {cutoff_date}; "
            f"cannot look back {games_back} games"
        )

    # Keep only the leading 'YYYY-MM-DD' part, then normalise the format.
    date_text = str(game_dates.iloc[position])[:10]
    return pd.to_datetime(date_text).strftime('%Y-%m-%d')

In [10]:
def relevant_historical_matchups_def(max_rows, max):
    """Return the finished matchups that have enough history to be modelled.

    Reads the module-level ``data_cutoff_date`` and ``games_back``.

    Args:
        max_rows: Row limit for the random sample; only used when ``max == 0``.
        max: ``0`` samples at most ``max_rows`` random games; any other value
            fetches every qualifying game.

    Returns:
        DataFrame with ``game_date`` (as datetime), ``game_team`` and
        ``game_opp``, sorted newest game first.
    """
    # League-wide floor: the earliest date with a full lookback, taking 'MIN'
    # as the reference team.
    league_floor = oldest_game_date('MIN', data_cutoff_date, games_back)

    if max != 0:
        matchup_query = f"""
        SELECT game_date, game_team, game_opp
        FROM games
        WHERE STR_TO_DATE(game_date, '%Y-%m-%d') >= '{league_floor}'
            AND game_result IN ('W', 'L', 'T')
        ORDER BY RAND()
        """
    else:
        matchup_query = f"""
        SELECT game_date, game_team, game_opp
        FROM games
        WHERE STR_TO_DATE(game_date, '%Y-%m-%d') >= '{league_floor}'
            AND game_result IN ('W', 'L', 'T')
        ORDER BY RAND()
        LIMIT {max_rows}
        """

    matchups = todf(matchup_query)
    matchups['game_date'] = pd.to_datetime(matchups['game_date'])

    # These four teams get their own floor: any game involving one of them
    # that predates that team's own earliest usable date is removed.
    for late_team in ['HOU', 'JAX', 'CAR', 'CLE']:
        team_floor = oldest_game_date(late_team, data_cutoff_date, games_back)
        before_floor = matchups['game_date'] < team_floor
        involves_team = (matchups['game_team'] == late_team) | (matchups['game_opp'] == late_team)
        matchups = matchups[~(before_floor & involves_team)]

    return matchups.sort_values(by='game_date', ascending=False)

## For each historical_relevant_matchup, pull the weighted average of each feature across the lookback period and add to chalk_22_model_pts_df created earlier

In [11]:
def pull_raw_game_data(team1, team2, games_back, team1_date=None, team2_date=None):
    """Fetch the recent game rows behind one offence-vs-defence matchup.

    Two queries are run, each returning at most ``games_back`` finished games
    (result W/L/T) newest-first: games where ``game_team`` is ``team1`` up to
    ``team1_date``, and games where ``game_opp`` is ``team2`` up to
    ``team2_date``. A date left as ``None`` defaults to today.

    The newest ``team1`` row is treated as the game being predicted: its
    points, date and result are returned separately, and the newest row is
    removed from both frames so only earlier games remain as history. (The
    original author notes this drop is for model training and is not needed
    in deployment.)

    Returns:
        Tuple ``(team1_history, team2_history, points, date, result, team1,
        team2)``.
    """
    def _latest_games(side_column, team, as_of):
        # STR_TO_DATE compares on the parsed date. Per the original author's
        # note, sub-second precision otherwise made the current date be
        # treated like yesterday's.
        return todf(f"""
                SELECT *
                FROM games
                WHERE {side_column} = '{team}'
                        AND STR_TO_DATE(game_date, '%Y-%m-%d') <= '{as_of}'
                        AND game_result IN ('W', 'L', 'T')
                ORDER BY game_date DESC
                LIMIT {games_back}
                """)

    team1_date = datetime.now().strftime('%Y-%m-%d') if team1_date is None else team1_date
    team2_date = datetime.now().strftime('%Y-%m-%d') if team2_date is None else team2_date

    offense_games = _latest_games('game_team', team1, team1_date)
    defense_games = _latest_games('game_opp', team2, team2_date)

    # Values of the newest team1 game, i.e. the game being predicted.
    points = offense_games['game_team_pts'][0]
    date_bb = offense_games['game_date'].iloc[0]
    result = offense_games['game_result'].iloc[0]

    # Remove the newest row from each frame; what is left is history only.
    offense_history = offense_games.drop(offense_games.index[0])
    defense_history = defense_games.drop(defense_games.index[0])

    return offense_history, defense_history, points, date_bb, result, team1, team2

In [12]:
def weighted_avg(df, col, gb1, gb2, gb3, gb4, weight1, weight2, weight3, weight4, inte = None):
    """Weighted mean of ``df[col]`` over four consecutive row windows.

    The rows are read by position: the first ``gb1`` rows form window 1, the
    next ``gb2`` rows window 2, and so on. Each window's plain mean is
    multiplied by its weight, and the sum is divided by the total weight.

    Args:
        df: Frame holding the column.
        col: Name of the column to average.
        gb1, gb2, gb3, gb4: Number of rows in each consecutive window.
        weight1, weight2, weight3, weight4: Weight of each window.
        inte: Pass ``1`` to truncate the result to ``int``.

    Returns:
        The weighted average rounded to 3 decimals, or its ``int`` truncation
        when ``inte == 1``.
    """
    values = df[col]
    weights = [weight1, weight2, weight3, weight4]

    # Turn the four window lengths into cumulative row boundaries.
    edges = [0]
    for length in (gb1, gb2, gb3, gb4):
        edges.append(edges[-1] + length)

    first, second, third, fourth = (
        values.iloc[start:stop].mean() * weight
        for start, stop, weight in zip(edges, edges[1:], weights)
    )

    result = round((first + second + third + fourth) / sum(weights), 3)

    return int(result) if inte == 1 else result

In [13]:
def _add_rate_columns(games):
    """Add the drives, TD-per-yard and pass-share columns to ``games`` in place."""
    total_yards = games['passing_yds'] + games['rushing_yds']
    total_plays = games['passing_att'] + games['rushing_att']

    games['drives'] = (games['passing_tds'] + games['rushing_tds'] + games['speams_FGA'] + games['speams_punts_total']).astype(int)
    # NOTE(review): the passing rate is rounded to 1 decimal and then truncated
    # by astype(int), while the rushing rate is rounded to the nearest integer.
    # Kept exactly as before so existing features do not shift -- confirm
    # whether the asymmetry is intended.
    games['passing_tds_total_yards'] = round((games['passing_tds'] / total_yards)*10000, 1).astype(int)
    games['rushing_tds_total_yards'] = round((games['rushing_tds'] / total_yards)*10000, 0).astype(int)
    games['pass_play_percentage'] = round((games['passing_att'] / total_plays)*100, 0).astype(int)


def _clutch_conversion_pct(games):
    """Return the 3rd-down + 4th-down + 2PM/2PA conversion rate of the first 20 rows, as an int percent."""
    recent = games.iloc[:20]
    made = recent['downs_3D_con'].sum() + recent['downs_4D_con'].sum() + recent['speams_2PM'].sum()
    tried = recent['downs_3D_att'].sum() + recent['downs_4D_att'].sum() + recent['speams_2PA'].sum()
    return int(round(made / tried, 3)*100)


def pts_model_data_list_function(team1, team2, games_back, team1_date, gb1, gb2, gb3, gb4, weight1, weight2, weight3, weight4):
    """Build one model row for ``team1``'s offence against ``team2``'s defence.

    The raw game rows come from ``pull_raw_game_data`` (same date for both
    teams). Features are weighted averages over the four lookback windows
    (see ``weighted_avg``), plus two plain rates over the most recent rows.

    Args:
        team1: Team whose offensive rows are used.
        team2: Team whose defensive rows are used.
        games_back: Number of game rows to fetch per side.
        team1_date: Date of the game being modelled; anything
            ``pd.to_datetime`` accepts. It is normalised to 'YYYY-MM-DD'
            before being used in the queries.
        gb1, gb2, gb3, gb4: Row counts of the four lookback windows.
        weight1, weight2, weight3, weight4: Weights of the four windows.

    Returns:
        A 28-item list: date, points, result, offense, defense, then the 13
        offensive features (drives, game_time_off, penalties, passing_yds,
        rushing_yds, passing_tds_total_yards, rushing_tds_total_yards,
        pass_play_percentage, field_goal_percentage,
        clutch_conversion_percentage, passing_sacks_total, passing_int,
        rushing_fmb) and the 10 defensive features (def_passing_yds,
        def_rushing_yds, def_passing_tds_total_yards,
        def_rushing_tds_total_yards, def_game_time_off, def_drives,
        def_clutch_conversion_percentage, def_passing_sacks_total,
        def_passing_int, def_rushing_fmb).
    """
    # Normalise the date once and actually use it. Previously the formatted
    # value was computed and then discarded in favour of the raw argument.
    game_day = pd.to_datetime(team1_date).strftime('%Y-%m-%d')

    # Both sides are cut off at the same date for training.
    team1_off_df, team2_def_df, points, date, result, offense, defense = pull_raw_game_data(
        team1, team2, games_back, game_day, game_day)

    windows = (gb1, gb2, gb3, gb4, weight1, weight2, weight3, weight4)

    def whole(frame, column):
        # Weighted average truncated to an int by weighted_avg itself.
        return weighted_avg(frame, column, *windows, 1)

    def scaled(frame, column, factor):
        # Small-valued stats are multiplied before truncation so the integer
        # feature keeps one or two decimals of precision.
        return int(weighted_avg(frame, column, *windows) * factor)

    # ---- Offense ----
    # Field-goal percentage over the 10 most recent rows.
    field_goal_percentage = int(round((team1_off_df['speams_FGM'].iloc[:10].sum() / team1_off_df['speams_FGA'].iloc[:10].sum()), 3)*100)
    clutch_conversion_percentage = _clutch_conversion_pct(team1_off_df)

    _add_rate_columns(team1_off_df)

    rushing_tds_total_yards = whole(team1_off_df, 'rushing_tds_total_yards')
    passing_tds_total_yards = whole(team1_off_df, 'passing_tds_total_yards')
    pass_play_percentage = whole(team1_off_df, 'pass_play_percentage')
    drives = scaled(team1_off_df, 'drives', 10)
    game_time_off = scaled(team1_off_df, 'game_time_off', 10)
    penalties = whole(team1_off_df, 'game_pen_yds')
    passing_yds = whole(team1_off_df, 'passing_yds')
    rushing_yds = whole(team1_off_df, 'rushing_yds')
    passing_sacks_total = scaled(team1_off_df, 'passing_sacks_total', 10)
    passing_int = scaled(team1_off_df, 'passing_int', 100)
    rushing_fmb = scaled(team1_off_df, 'rushing_fmb', 100)

    # ---- Defense ----
    _add_rate_columns(team2_def_df)

    def_rushing_tds_total_yards = whole(team2_def_df, 'rushing_tds_total_yards')
    def_passing_tds_total_yards = whole(team2_def_df, 'passing_tds_total_yards')
    def_drives = scaled(team2_def_df, 'drives', 10)
    def_game_time_off = scaled(team2_def_df, 'game_time_off', 10)
    def_passing_yds = whole(team2_def_df, 'passing_yds')
    def_rushing_yds = whole(team2_def_df, 'rushing_yds')
    def_passing_sacks_total = scaled(team2_def_df, 'passing_sacks_total', 10)
    def_passing_int = scaled(team2_def_df, 'passing_int', 100)
    def_rushing_fmb = scaled(team2_def_df, 'rushing_fmb', 100)
    def_clutch_conversion_percentage = _clutch_conversion_pct(team2_def_df)

    # The order must match the column order of the frame the caller fills.
    pts_model_data_list = [
        date, points, result, offense, defense,
        drives, game_time_off, penalties, passing_yds, rushing_yds,
        passing_tds_total_yards, rushing_tds_total_yards, pass_play_percentage,
        field_goal_percentage, clutch_conversion_percentage,
        passing_sacks_total, passing_int, rushing_fmb,
        def_passing_yds, def_rushing_yds,
        def_passing_tds_total_yards, def_rushing_tds_total_yards,
        def_game_time_off, def_drives,
        def_clutch_conversion_percentage,
        def_passing_sacks_total, def_passing_int, def_rushing_fmb,
    ]

    return pts_model_data_list

### Preprocessing 

In [14]:
## outliers
def handle_outliers(df, feature):
    """Return ``df`` without the rows that are outliers in ``feature``.

    A row is an outlier when its ``feature`` value lies more than three
    standard deviations below or above the column mean. The input frame is
    not modified; a new frame is returned.
    """
    column = df[feature]
    center = column.mean()
    spread = 3 * column.std()

    # Flag values strictly outside [center - spread, center + spread].
    is_outlier = (column < center - spread) | (column > center + spread)
    outlier_labels = df[is_outlier].index

    # Drop by index label. (Replacing the values with NaN instead of dropping
    # the rows would be the alternative treatment.)
    return df.drop(outlier_labels)

In [15]:
def feature_importance(model, original_x):
    """Plot ``model.feature_importances_`` as a sorted horizontal bar chart.

    Each bar is labelled with its share of the total importance, and the ten
    most important feature names are listed inside the axes.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        original_x: Frame whose columns name the features, in the same order
            as the importances.
    """
    # Pair each feature with its importance and rank from most to least.
    ranked = sorted(
        dict(zip(original_x.columns, model.feature_importances_)).items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    names = [name for name, _ in ranked]
    scores = [score for _, score in ranked]

    plt.figure(figsize=(10, 6))
    ax = sns.barplot(y=names, x=scores, orient='h')

    # Annotate every bar with its percentage of the summed importance.
    total = sum(scores)
    for bar_position, score in enumerate(scores):
        ax.text(score + 0.01, bar_position, f'{(score / total) * 100:.2f}%', va='center')

    # Leaderboard of the top 10 features, placed in the bottom-right corner.
    leaderboard = "\n".join(
        f"{rank}. {name}" for rank, name in enumerate(names[:10], start=1)
    )
    plt.text(0.9, 0.1, leaderboard, transform=ax.transAxes, fontsize=10, ha='right')

    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title('Feature Importances')

    plt.show()

In [16]:
def preprocessing(df, target):
    """Clean, split and scale ``df`` for modelling.

    Steps, in order:
      1. Drop outlier rows, one numeric column at a time (``handle_outliers``).
      2. Split 80/10/10 into train, validation and test (``random_state=42``).
      3. Standardise the features with a scaler fitted on the training set.
      4. Compute Pearson's correlation for every pair of numeric columns of
         the scaled test set.

    Args:
        df: Frame containing the features and the target column.
        target: Name of the target column.

    Returns:
        ``(X_train, X_test, y_train, y_test, X_val, y_val, results_df)``,
        where ``results_df`` lists each column pair with its correlation
        coefficient and p-value, sorted by coefficient, highest first.
    """
    # The column list is fixed before any rows are dropped.
    numeric_features = df.select_dtypes(include=[np.number]).columns
    for feature in numeric_features:
        df = handle_outliers(df, feature)
    print(target)

    # 80% train, then the remaining 20% is halved into validation and test.
    features = df.drop(columns=[target])
    labels = df[target]
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, labels, test_size=0.2, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=42)

    # Fit the scaler on the training split only, then reuse it for the others.
    # Results are wrapped back into frames so labels and index are kept.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(train_scaled, columns=X_train.columns, index=X_train.index)
    val_scaled = scaler.transform(X_val)
    X_val = pd.DataFrame(val_scaled, columns=X_val.columns, index=X_val.index)
    test_scaled = scaler.transform(X_test)
    X_test = pd.DataFrame(test_scaled, columns=X_test.columns, index=X_test.index)

    # Feature independence: Pearson correlation for each unordered column pair.
    numeric_test = X_test.select_dtypes(include=[float, int])
    column_names = numeric_test.columns
    pair_rows = []
    for position, first in enumerate(column_names):
        for second in column_names[position + 1:]:
            coefficient, p_value = pearsonr(numeric_test[first], numeric_test[second])
            pair_rows.append({
                'Variable1': first,
                'Variable2': second,
                'Correlation Coefficient': coefficient,
                'P-Value': p_value,
            })

    results_df = pd.DataFrame(pair_rows).sort_values(by='Correlation Coefficient', ascending=False)

    return X_train, X_test, y_train, y_test, X_val, y_val, results_df

## chalk_22_model General Model

In [17]:
# Prepare the train / validation / test splits for the points model.
# chalk_22_model_pts_df is only assigned by the data-building cells further
# down, so this cell raises NameError when it is run before them (as in the
# recorded output below).
df = chalk_22_model_pts_df
target = 'game_team_pts'
X_train, X_test, y_train, y_test, X_val, y_val, results_df = preprocessing(df, target)

NameError: name 'chalk_22_model_pts_df' is not defined

In [213]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Confidence level used for the prediction intervals computed below.
desired_confidence = 0.51

# Fit the scikit-learn linear model on the training split
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the validation set
X_val_sm = sm.add_constant(X_val)  # Add a constant (intercept) column for statsmodels
predictions = model.predict(X_val)  # NOTE(review): not read again in the visible cells

# Fit the same regression with statsmodels OLS to get prediction intervals
# for the validation rows
X_train_sm = sm.add_constant(X_train)
model_sm = sm.OLS(y_train, X_train_sm).fit()
predicted = model_sm.get_prediction(X_val_sm)
# alpha is passed as 1 - desired_confidence
prediction_summary = predicted.summary_frame(alpha=(1 - desired_confidence))



In [214]:
# Calculate RMSE of the scikit-learn model on the held-out test set
y_test_predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_test_predictions))
# Average width of the validation-set intervals: upper minus lower bound of
# the 'obs_ci_*' columns, averaged over all validation rows
confidence_interval_diff = prediction_summary['obs_ci_upper'] - prediction_summary['obs_ci_lower']
average_confidence_interval_diff = confidence_interval_diff.mean()

In [42]:
# Build one result row (a tuple): the eight window/weight settings followed by
# the rounded RMSE and the rounded average interval width
bigger_list = *weights, round(rmse, 3), round(average_confidence_interval_diff, 1)

# Only build the row frame when the tuple length matches the number of
# bigger_df columns. bigger_df must already exist; the recorded output below
# shows NameError when this cell is run before it is created.
if len(bigger_list) == len(bigger_df.columns):
    # One-row DataFrame with the same columns as bigger_df
    # (it is built here but not appended to bigger_df in this cell)
    new_row_df = pd.DataFrame([bigger_list], columns=bigger_df.columns)


NameError: name 'bigger_df' is not defined

## Run the model 216 times to find the most predictive lookback period length and weights used to measure previous performance. 

### all comb

In [216]:
import itertools

# Candidate values for the four lookback-window lengths (gb1..gb4)
var1 = [5]
var2 = [3, 9]
var3 = [3]
var4 = [6]

# Candidate weight sets (w1..w4); exactly one set is used per run
#list1 = [0.4, 0.3, 0.2, 0.1]
list2 = [0.5, 0.2, 0.15, 0.15]
list3 = [0.3, 0.25, 0.25, 0.2]
last_vars = [list2, list3]

# Every combination of the four window lengths
first_combinations = list(itertools.product(var1, var2, var3, var4))

# Pair each window-length combination with each weight set, giving flat
# 8-tuples (gb1, gb2, gb3, gb4, w1, w2, w3, w4), grouped by weight set
all_combinations = [
    window_lengths + tuple(weight_set)
    for weight_set in last_vars
    for window_lengths in first_combinations
]
len(all_combinations)

4

### run the def

In [217]:
# Empty results table: one row per tested configuration will be appended,
# holding the four window lengths, the four weights, the test RMSE and the
# average prediction-interval width ('diff')
bigger_df = pd.DataFrame(columns=['gb1', 'gb2', 'gb3', 'gb4', 'w1', 'w2', 'w3', 'w4','rmse', 'diff'])

In [65]:
# Grid search: for every (window lengths, weights) combination, rebuild the
# training table from the database, fit the linear model, and append the test
# RMSE and average prediction-interval width to bigger_df.
for comb in all_combinations:
    games_back_1 = comb[0]
    games_back_2 = comb[1]
    games_back_3 = comb[2]
    games_back_4 = comb[3]
    weight_1 = comb[4]
    weight_2 = comb[5]
    weight_3 = comb[6]
    weight_4 = comb[7]

    # independent variables
    weights = [games_back_1, games_back_2, games_back_3, games_back_4, weight_1, weight_2, weight_3, weight_4]
    num_matchups = 10000

    # dependent variables
    # data_cutoff_date and games_back are read as globals by
    # relevant_historical_matchups_def
    data_cutoff_date = '1991-06-01'
    baby_teams = ['HOU', 'JAX', 'CAR', 'CLE']
    # total lookback rows, plus 1 for the target game row that
    # pull_raw_game_data drops
    games_back = weights[0] + weights[1] + weights[2] + weights[3] + 1


    # second argument: 0 samples at most num_matchups rows; enter 1 if you
    # want the maximum rows
    historical_relevant_matchups_df = relevant_historical_matchups_def(num_matchups, 0)

    new_rows = []
    z = len(historical_relevant_matchups_df)  # total matchups, for progress output
    zz = 0                                    # matchups processed so far
    start_time = time.time()

    # create empty dataframe which will be used to train both models
    chalk_22_model = pd.DataFrame(columns=[ 

        # I want to be able to generate a score and a variance to generate odds & confidence scores
        'date', 'game_team_pts', 'game_result','offense', 'defense', 

        # Only give the offense the home-field-advantage variable.
        # Penalties need to be split into offense/defense eventually; for now they only hinder the offense score prediction,
        # i.e. I am basically assuming each team's offense commits all of its penalties.
        # 'drives' and 'game_time_off' are included on both sides to try to capture the interconnectedness of the two
        # sides of the ball in terms of their ability to affect how much time each side gets to play.

        # Offense
        'drives', 'game_time_off', 'penalties',
        'passing_yds', 'rushing_yds', 'passing_tds_total_yards', 'rushing_tds_total_yards',
        'pass_play_percentage', 'field_goal_percentage', 'clutch_conversion_percentage',
        'passing_sacks_total', 'passing_int', 'rushing_fmb',
        
        # Defense
        'def_passing_yds', 'def_rushing_yds', 'def_passing_tds_total_yards', 'def_rushing_tds_total_yards',
        'def_game_time_off', 'def_drives', 
        'def_clutch_conversion_percentage',
        'def_passing_sacks_total', 'def_passing_int', 'def_rushing_fmb'

        ])

    # build one feature row per matchup and collect the rows in new_rows
    for index, row in historical_relevant_matchups_df.iterrows():
        # Save each column value into individual variables
        team1 = row['game_team']
        team2 = row['game_opp']
        date = row['game_date']

        date = date.strftime('%Y-%m-%d')

        pts_model_data_list = pts_model_data_list_function(team1, team2, games_back, date, *weights)
    
        # Check if the list length matches the number of columns in the DataFrame
        if len(pts_model_data_list) == len(chalk_22_model.columns):
            # Keep the row; all rows are turned into a DataFrame after the loop
            new_rows.append(pts_model_data_list)
        else:
            print("The length of the list does not match the number of DataFrame columns.")

        zz += 1 

        # Calculate the elapsed time
        elapsed_time = time.time() - start_time

        # Print progress every 100 matchups
        if zz % 100 == 0:
            print("Time: ", elapsed_time, "Date: ", date, "   ", zz, " of ", z)
        else:
            pass

    # add new_rows to chalk_22_model
    if new_rows:
        new_df = pd.DataFrame(new_rows, columns=chalk_22_model.columns)
        chalk_22_model = pd.concat([chalk_22_model, new_df], ignore_index=True)


    # Keep only the columns used by the points model and cast them to int;
    # 'date' is converted to datetime and then dropped as well
    chalk_22_model_pts_df = chalk_22_model.drop(columns=['game_result', 'offense', 'defense'])
    chalk_22_model_pts_df['date'] = pd.to_datetime(chalk_22_model_pts_df['date'])
    for column in chalk_22_model_pts_df.columns:
        # 'offense' and 'defense' were already dropped above, so only 'date' is skipped here
        if column != 'date' and column != 'offense' and column != 'defense':
            chalk_22_model_pts_df[column] = chalk_22_model_pts_df[column].astype('int')
    chalk_22_model_pts_df.dtypes  # bare expression: has no effect inside the loop
    chalk_22_model_pts_df = chalk_22_model_pts_df.drop(columns=['date'])


    df = chalk_22_model_pts_df
    target = 'game_team_pts'
    X_train, X_test, y_train, y_test, X_val, y_val, results_df = preprocessing(df, target)



    # Confidence level used for the prediction intervals
    desired_confidence = 0.51

    # Fit the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on the validation set
    X_val_sm = sm.add_constant(X_val)  # Add a constant term for statsmodels
    predictions = model.predict(X_val)

    # Calculate prediction intervals using statsmodels
    X_train_sm = sm.add_constant(X_train)
    model_sm = sm.OLS(y_train, X_train_sm).fit()
    predicted = model_sm.get_prediction(X_val_sm)
    prediction_summary = predicted.summary_frame(alpha=(1 - desired_confidence))

    # Calculate RMSE on the y_test set
    y_test_predictions = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_test_predictions))
    # Calculate the average difference between the upper and lower interval bounds
    confidence_interval_diff = prediction_summary['obs_ci_upper'] - prediction_summary['obs_ci_lower']
    average_confidence_interval_diff = confidence_interval_diff.mean()

    # Result row (a tuple): the eight settings, then the rounded RMSE and interval width
    bigger_list = *weights, round(rmse, 3), round(average_confidence_interval_diff, 1)
    print(bigger_list)
    # Ensure the row length matches the number of DataFrame columns
    if len(bigger_list) == len(bigger_df.columns):
        # Create a DataFrame from the new row
        new_row_df = pd.DataFrame([bigger_list], columns=bigger_df.columns)
        
        # Append the new row to the results DataFrame
        bigger_df = pd.concat([bigger_df, new_row_df], ignore_index=True)

: 

In [186]:
# Save the grid-search results table to bigger_df.csv, without the row index
bigger_df.to_csv('bigger_df.csv', index=False)

In [None]:
    # NOTE(review): this cell repeats the data-building part of the grid-search
    # loop body for one hand-picked configuration (its indentation is kept
    # from the loop). It stops after building chalk_22_model_pts_df; no model
    # is fitted here.

    # independent variables
    weights = [5,7,9,13,.3,.25,.25,.2]
    num_matchups = 200

    # dependent variables
    # data_cutoff_date and games_back are read as globals by
    # relevant_historical_matchups_def
    data_cutoff_date = '1991-06-01'
    baby_teams = ['HOU', 'JAX', 'CAR', 'CLE']
    # total lookback rows, plus 1 for the target game row that
    # pull_raw_game_data drops
    games_back = weights[0] + weights[1] + weights[2] + weights[3] + 1


    # second argument: 0 samples at most num_matchups rows; enter 1 if you
    # want the maximum rows
    historical_relevant_matchups_df = relevant_historical_matchups_def(num_matchups, 0)

    new_rows = []
    z = len(historical_relevant_matchups_df)  # total matchups, for progress output
    zz = 0                                    # matchups processed so far
    start_time = time.time()

    # create empty dataframe which will be used to train both models
    chalk_22_model = pd.DataFrame(columns=[ 

        # I want to be able to generate a score and a variance to generate odds & confidence scores
        'date', 'game_team_pts', 'game_result','offense', 'defense', 

        # Only give the offense the home-field-advantage variable.
        # Penalties need to be split into offense/defense eventually; for now they only hinder the offense score prediction,
        # i.e. I am basically assuming each team's offense commits all of its penalties.
        # 'drives' and 'game_time_off' are included on both sides to try to capture the interconnectedness of the two
        # sides of the ball in terms of their ability to affect how much time each side gets to play.

        # Offense
        'drives', 'game_time_off', 'penalties',
        'passing_yds', 'rushing_yds', 'passing_tds_total_yards', 'rushing_tds_total_yards',
        'pass_play_percentage', 'field_goal_percentage', 'clutch_conversion_percentage',
        'passing_sacks_total', 'passing_int', 'rushing_fmb',
        
        # Defense
        'def_passing_yds', 'def_rushing_yds', 'def_passing_tds_total_yards', 'def_rushing_tds_total_yards',
        'def_game_time_off', 'def_drives', 
        'def_clutch_conversion_percentage',
        'def_passing_sacks_total', 'def_passing_int', 'def_rushing_fmb'

        ])

    # build one feature row per matchup and collect the rows in new_rows
    for index, row in historical_relevant_matchups_df.iterrows():
        # Save each column value into individual variables
        team1 = row['game_team']
        team2 = row['game_opp']
        date = row['game_date']

        date = date.strftime('%Y-%m-%d')

        pts_model_data_list = pts_model_data_list_function(team1, team2, games_back, date, *weights)
    
        # Check if the list length matches the number of columns in the DataFrame
        if len(pts_model_data_list) == len(chalk_22_model.columns):
            # Keep the row; all rows are turned into a DataFrame after the loop
            new_rows.append(pts_model_data_list)
        else:
            print("The length of the list does not match the number of DataFrame columns.")

        zz += 1 

        # Calculate the elapsed time
        elapsed_time = time.time() - start_time

        # Print progress every 100 matchups
        if zz % 100 == 0:
            print("Time: ", elapsed_time, "Date: ", date, "   ", zz, " of ", z)
        else:
            pass

    # add new_rows to chalk_22_model
    if new_rows:
        new_df = pd.DataFrame(new_rows, columns=chalk_22_model.columns)
        chalk_22_model = pd.concat([chalk_22_model, new_df], ignore_index=True)


    # Keep only the columns used by the points model and cast them to int;
    # 'date' is converted to datetime and then dropped as well
    chalk_22_model_pts_df = chalk_22_model.drop(columns=['game_result', 'offense', 'defense'])
    chalk_22_model_pts_df['date'] = pd.to_datetime(chalk_22_model_pts_df['date'])
    for column in chalk_22_model_pts_df.columns:
        # 'offense' and 'defense' were already dropped above, so only 'date' is skipped here
        if column != 'date' and column != 'offense' and column != 'defense':
            chalk_22_model_pts_df[column] = chalk_22_model_pts_df[column].astype('int')
    chalk_22_model_pts_df.dtypes  # bare expression: only displayed when it is a cell's last statement
    chalk_22_model_pts_df = chalk_22_model_pts_df.drop(columns=['date'])