# Train model

In [41]:
import pandas as pd
import numpy as np
import pymysql as mysql
import os
from datetime import datetime
import warnings
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
import joblib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Suppress all warnings for the rest of the kernel session.
# NOTE(review): this is a blanket filter with no category or module restriction,
# so deprecation and runtime warnings raised by any library are hidden as well —
# consider narrowing it to the specific warning that motivated it.
warnings.filterwarnings("ignore")


In [42]:
# Load the modelling dataset; the relative path is resolved against the
# current working directory of the kernel.
df = pd.read_csv('chalk_22_model.csv')

In [43]:
## outliers
def handle_outliers(df, feature, n_std=3):
    """Return a copy of ``df`` without the rows where ``feature`` is an outlier.

    A row is treated as an outlier when its value in ``feature`` lies more
    than ``n_std`` sample standard deviations away from the column mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    feature : str
        Name of the numeric column to screen.
    n_std : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the non-outlier rows, with the original
        index labels preserved.
    """
    values = df[feature]
    center = values.mean()
    threshold = n_std * values.std()

    # NaN values (and a NaN std, e.g. for a single-row frame) compare False on
    # both sides, so such rows are kept rather than flagged.
    is_outlier = (values < center - threshold) | (values > center + threshold)

    # Filter positionally with the boolean mask instead of dropping by index
    # label: with a non-unique index, a label-based drop would also remove
    # non-outlier rows that happen to share a label with an outlier.
    return df.loc[~is_outlier].copy()


In [44]:
def preprocessing(df, target, scaler_filename='chalk_22_scaler.pkl', random_state=42):
    """Clean, split and standardise a modelling frame and report feature correlations.

    The function performs, in order:

    1. Outlier removal: ``handle_outliers`` is applied to every numeric column
       of ``df`` in turn (this includes ``target`` when it is numeric), so each
       column is screened on the rows that survived the previous columns.
    2. Partitioning: an 80/10/10 train/validation/test split.
    3. Scaling: a ``StandardScaler`` is fitted on the training features only
       and applied to all three splits; the fitted scaler is written to
       ``scaler_filename`` with ``joblib.dump``.
    4. Feature independence: Pearson correlation and p-value for every pair
       of columns in the scaled test features.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling frame containing the feature columns and ``target``.
        It is not modified.
    target : str
        Name of the column to predict.
    scaler_filename : str, default 'chalk_22_scaler.pkl'
        Path the fitted scaler is persisted to.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_val, y_val, feature_independence_df)``
        where the last element has the columns ``Variable1``, ``Variable2``,
        ``Correlation Coefficient`` and ``P-Value``, sorted by descending
        correlation coefficient.
    """
    # outliers
    for feature in df.select_dtypes(include=[np.number]).columns:
        df = handle_outliers(df, feature)

    # partitioning: 80% train, then the remaining 20% halved into val / test
    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_remaining, y_train, y_remaining = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_remaining, y_remaining, test_size=0.5, random_state=random_state
    )

    # scaling: statistics come from the training split only, so no information
    # from the validation or test rows enters the scaler
    standard_scaler = StandardScaler().fit(X_train)

    def _scaled(frame):
        # Re-wrap the ndarray returned by the scaler so labels are preserved.
        return pd.DataFrame(
            standard_scaler.transform(frame), columns=frame.columns, index=frame.index
        )

    X_train = _scaled(X_train)
    X_val = _scaled(X_val)
    X_test = _scaled(X_test)
    joblib.dump(standard_scaler, scaler_filename)

    # feature independence
    # NOTE(review): the correlations are computed on the scaled *test* split;
    # if they drive feature selection, X_train would be the leakage-free
    # choice — confirm the intent before changing it.
    numerical_cols = X_test.select_dtypes(include=[float, int])
    result_columns = ['Variable1', 'Variable2', 'Correlation Coefficient', 'P-Value']
    results_list = []
    # iterate over all unordered pairs of numerical columns
    for i, col1 in enumerate(numerical_cols.columns):
        for col2 in numerical_cols.columns[i + 1:]:
            # distinct local names so the target series `y` is not shadowed
            left = numerical_cols[col1]
            right = numerical_cols[col2]
            corr_coefficient, p_value = pearsonr(left, right)
            results_list.append({
                'Variable1': col1,
                'Variable2': col2,
                'Correlation Coefficient': corr_coefficient,
                'P-Value': p_value,
            })

    # Passing the column list keeps the frame well-formed (and sortable) even
    # when there are fewer than two feature columns and no pairs were produced.
    results_df = pd.DataFrame(results_list, columns=result_columns)
    feature_independence_df = results_df.sort_values(by='Correlation Coefficient', ascending=False)

    return X_train, X_test, y_train, y_test, X_val, y_val, feature_independence_df

In [45]:
# Build the frame for the points model: remove the columns that are not passed
# to preprocessing() and cast everything that remains to int in one step.
# `date` is dropped directly because nothing in this cell reads it.
# NOTE(review): astype('int') truncates any fractional part; the frame contains
# *_percentage columns, so confirm they are stored on a scale where that is
# acceptable (e.g. 0-100 rather than 0-1).
chalk_22_model_pts_df = (
    df
    .drop(columns=['game_result', 'offense', 'defense', 'date'])
    .astype('int')
)
X_train, X_test, y_train, y_test, X_val, y_val, feature_independece_df = preprocessing(chalk_22_model_pts_df, 'game_team_pts')

In [46]:
# Inspect the column dtypes of the validation features returned by preprocessing().
X_val.dtypes

drives                              float64
game_time_off                       float64
penalties                           float64
passing_yds                         float64
rushing_yds                         float64
passing_tds_total_yards             float64
rushing_tds_total_yards             float64
pass_play_percentage                float64
field_goal_percentage               float64
clutch_conversion_percentage        float64
passing_sacks_total                 float64
passing_int                         float64
rushing_fmb                         float64
def_passing_yds                     float64
def_rushing_yds                     float64
def_passing_tds_total_yards         float64
def_rushing_tds_total_yards         float64
def_game_time_off                   float64
def_drives                          float64
def_clutch_conversion_percentage    float64
def_passing_sacks_total             float64
def_passing_int                     float64
def_rushing_fmb                 

In [47]:

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Train a linear regression on the training split (fit() returns the estimator
# itself), persist it to disk, then predict on the validation split.
model = LinearRegression().fit(X_train, y_train)
joblib.dump(model, 'lr_model.pkl')
predictions = model.predict(X_val)

In [48]:
# Display the validation-split predictions produced by `model`.
predictions

array([19.42078644, 22.08985203, 14.6435549 , ..., 23.87580702,
       22.59669962, 18.56763621])

In [49]:
# Create and fit a fresh linear regression on the training data in one step,
# then persist it.
# NOTE(review): this writes to the same 'lr_model.pkl' path that the earlier
# cell uses for `model`, so that file is overwritten here — confirm both
# training cells are needed.
lr_model = LinearRegression().fit(X_train, y_train)
joblib.dump(lr_model, 'lr_model.pkl')


['lr_model.pkl']