# Linear Regression

In [6]:
import mysql.connector
from mysql.connector import errorcode
import config
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import scipy.stats as stats
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
pd.options.display.max_columns = 1000

%matplotlib inline

In [14]:
dbname = 'nba'
cnx = mysql.connector.connect(
    host = config.host,
    user = config.user,
    passwd = config.password,
    db = dbname
)
cursor = cnx.cursor(buffered = True)

## Preprocessing

### Helper functions

#### Function to convert minutes to seconds

In [15]:
def calc_secs(x):
    return int(x.split(':')[0])*60 + int(x.split(':')[1])

In [16]:
def sql_to_df(table, year=''):
    
    if year:
        year_clean = '00' + year[0] + year[2:]
        select_statement = "select * from nba.{} where (GAME_ID REGEXP '(^{})') order by GAME_ID".format(table, year_clean)
    else:
        select_statement = "SELECT * FROM nba.{} order by GAME_ID;".format(table)
    
    cursor.execute(select_statement)
    data = cursor.fetchall()
    data_df = pd.DataFrame(data)
    
    field_names = [i[0] for i in cursor.description]
    data_df.columns = field_names
    
    return data_df

### Aggregation functions

#### Function to calculate rolling statistics for players from box score data

In [19]:
def formated_player_stats(table, cols, year = '', lag = None):
    
    nba_df = sql_to_df(table, year)
    
    ### Drop games where players do not play
    nba_df_played = nba_df[nba_df['MIN'].notnull()]
    
    ### Calculate seconds played in a game
    nba_df_played['SEC'] = nba_df_played['MIN'].apply(lambda x : calc_secs(x))
    
    
    players = []
    cols = cols
    for player in nba_df_played['PLAYER_ID'].unique():
        df = nba_df_played[nba_df_played['PLAYER_ID']==player]
        for col in cols:
            df['expanding_{}'.format(col)] = df.expanding().mean()[col].shift(1)
            if lag:
                df['rolling_{}'.format(col+str(lag))] = df.rolling(lag).mean()[col].shift(1)
        df.dropna(inplace=True)
        players.append(df)
    return pd.concat(players)

In [20]:
test = formated_player_stats('box_score', ['PTS'], year='2018', lag = 5)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-cop

In [21]:
test[test['PLAYER_NAME']=='Terry Rozier']

Unnamed: 0,GAME_ID,TEAM_ID,TEAM_ABBREVIATION,TEAM_CITY,PLAYER_ID,PLAYER_NAME,START_POSITION,COMMENT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,PF,PTS,PLUS_MINUS,pk,TOS,SEASON_ID,DK_PTS,DD,TD,doubles,SEC,expanding_PTS,rolling_PTS5
1895,0021800074,1610612738,BOS,Boston,1626179,Terry Rozier,,,27:21,4.0,8.0,0.500,2.0,3.0,0.667,4.0,4.0,1.0,0.0,8.0,8.0,2.0,0.0,0.0,2.0,14.0,15.0,00218000741626179,0.0,22018,28.00,0,0,1.0,1641,7.600000,7.6
2525,0021800099,1610612738,BOS,Boston,1626179,Terry Rozier,,,19:28,1.0,4.0,0.250,0.0,1.0,0.000,1.0,2.0,0.5,0.0,1.0,1.0,2.0,2.0,1.0,0.0,3.0,5.0,00218000991626179,1.0,22018,12.75,0,0,0.0,1168,8.666667,8.2
2926,0021800115,1610612738,BOS,Boston,1626179,Terry Rozier,,,26:09,3.0,11.0,0.273,1.0,4.0,0.250,0.0,0.0,0.0,0.0,7.0,7.0,3.0,0.0,2.0,4.0,7.0,-12.0,00218001151626179,1.0,22018,24.25,0,0,0.0,1569,7.857143,7.2
3230,0021800127,1610612738,BOS,Boston,1626179,Terry Rozier,,,18:56,2.0,5.0,0.400,2.0,4.0,0.500,1.0,2.0,0.5,0.0,6.0,6.0,3.0,0.0,0.0,0.0,7.0,-9.0,00218001271626179,1.0,22018,19.50,0,0,0.0,1136,7.750000,6.6
3685,0021800145,1610612738,BOS,Boston,1626179,Terry Rozier,,,22:21,2.0,9.0,0.222,2.0,5.0,0.400,0.0,0.0,0.0,0.0,3.0,3.0,0.0,1.0,1.0,2.0,6.0,-13.0,00218001451626179,1.0,22018,14.25,0,0,0.0,1341,7.666667,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28856,0021801153,1610612738,BOS,Boston,1626179,Terry Rozier,,,20:08,1.0,5.0,0.200,1.0,4.0,0.250,2.0,2.0,1.0,0.0,3.0,3.0,2.0,0.0,0.0,1.0,5.0,-12.0,00218011531626179,1.0,22018,11.75,0,0,0.0,1208,9.013514,6.0
29296,0021801170,1610612738,BOS,Boston,1626179,Terry Rozier,,,23:48,1.0,5.0,0.200,1.0,3.0,0.333,2.0,2.0,1.0,1.0,2.0,3.0,3.0,0.0,1.0,2.0,5.0,-4.0,00218011701626179,0.0,22018,15.75,0,0,0.0,1428,8.960000,4.6
29612,0021801183,1610612738,BOS,Boston,1626179,Terry Rozier,,,6:53,1.0,2.0,0.500,1.0,2.0,0.500,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,7.0,00218011831626179,0.0,22018,5.00,0,0,0.0,413,8.907895,4.8
30127,0021801203,1610612738,BOS,Boston,1626179,Terry Rozier,,,21:02,3.0,6.0,0.500,1.0,3.0,0.333,0.0,0.0,0.0,0.0,5.0,5.0,2.0,0.0,0.0,2.0,7.0,-1.0,00218012031626179,4.0,22018,14.75,0,0,0.0,1262,8.831169,3.2
