In [1]:
# Import libraries
import datetime
import itertools
import os
import time

import nba_api.stats.endpoints
import numpy as np
import pandas as pd
import requests, sqlalchemy
from bs4 import BeautifulSoup
from imblearn.over_sampling import RandomOverSampler
from nba_api.stats.endpoints import leaguegamefinder, playbyplayv2
from nba_api.stats.static import teams
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize, MinMaxScaler, StandardScaler
from tqdm import tqdm

# connect to postgres database
# The connection URL can be supplied through the NBA_DATABASE_URL environment
# variable so that real credentials never need to be written into the notebook.
# When the variable is unset, the original local development URL is used.
engine = sqlalchemy.create_engine(
    os.environ.get('NBA_DATABASE_URL', 'postgresql://postgres:password@localhost:5432/NBA')
)

# Widen pandas' display limits so large frames are shown without truncation
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Score difference

In [3]:
# read training data from database
work = pd.read_sql_query("select * from train", con=engine)

# playoffs dummy variable: taken from the first digit of the 5-digit season_id
# (a leading 2 becomes 0, a leading 4 becomes 1)
work['playoff'] = work['season_id'].str.extract(r'(\d)\d{4}').astype(int)
work['playoff'] = work['playoff'].replace({2: 0, 4: 1})

# Rewrite the leading digit to '2' so every row of a season shares one id.
# regex=True is required: since pandas 2.0 the pattern is otherwise treated as
# a literal string and nothing would be replaced.
work['season_id'] = work['season_id'].str.replace(r'^\d', '2', regex=True)

# Get season after 1995 (ids are fixed-width strings, so string comparison orders them)
work = work[work['season_id'] >= '21996']

In [7]:
# drop columns we dont need
train = work.drop(
    columns=['team_id', 'team_abbreviation', 'game_date', 'matchup', 'min', 'days']
)

# compute difference in stats: own value minus the opponent's value,
# stored as '<stat>_diff' for each box-score stat below
for stat in ('blk', 'oreb', 'reb', 'ast', 'stl', 'tov', 'pf'):
    train[stat + '_diff'] = train[stat] - train[stat + '_oppos']

In [8]:
# Load the full play-by-play table from the database
plays = pd.read_sql_query(sql="select * from playbyplay", con=engine)

In [9]:
# Cluster the play-by-play data into 6 groups.
# game_id is an identifier, not a feature. 'clusters' is dropped as well (when
# present) so that re-running this cell does not feed the previous cluster
# labels back into the fit.
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(plays.drop(columns=['game_id', 'clusters'], errors='ignore'))

# new feature as cluster
plays['clusters'] = kmeans.labels_

In [10]:
# Attach the play-by-play features (and cluster label) to each training row
# by matching on game_id
train = train.merge(plays, how='inner', on='game_id')

In [None]:
# Get dummy variables
# One-hot encode season, win/loss (first level dropped) and cluster, then
# append every remaining column except the identifiers and encoded sources.
season_dummies = pd.get_dummies(train['season_id'])
win_dummy = pd.get_dummies(train['wl'], drop_first=True)
cluster_dummies = pd.get_dummies(train['clusters'], prefix='cluster')
remaining = train.drop(columns=['awayteam', 'season_id', 'wl', 'hometeam', 'game_id'])
final = pd.concat([season_dummies, win_dummy, cluster_dummies, remaining], axis=1)

# Split into train and test data (10% held out, fixed seed)
tr, te = train_test_split(final, test_size=0.1, random_state=0)

## Model training

In [13]:
# Random forest regressor with a fixed seed, using all available cores
rfr = RandomForestRegressor(
    max_depth=12,
    min_samples_split=64,
    n_jobs=-1,
    random_state=0,
)

In [14]:
# Fit model: '2diff' is the target, every other column is a predictor
rfr.fit(X=tr.drop(columns=['2diff']), y=tr['2diff'])

RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1,
                      random_state=0)

In [15]:
# Mean absolute error of the '2diff' predictions on the held-out split
mean_absolute_error(
    y_true=te['2diff'],
    y_pred=rfr.predict(te.drop(columns=['2diff'])),
)

9.373045180192985

In [16]:
# Mean absolute error of the '2diff' predictions on the training split
mean_absolute_error(
    y_true=tr['2diff'],
    y_pred=rfr.predict(tr.drop(columns=['2diff'])),
)

8.3139611855241

In [17]:
# Team abbreviation conversion between our data and MSN
# Keys are the codes as they appear on MSN; values are the abbreviations used
# in our data. Most codes are the same on both sides, so only the exceptions
# are spelled out and every other code maps to itself.
team_dict = {
    msn_code: {'GS': 'GSW', 'NO': 'NOP', 'NY': 'NYK', 'PHO': 'PHX', 'SA': 'SAS'}.get(msn_code, msn_code)
    for msn_code in (
        'ATL BKN BOS CHA CHI CLE DAL DEN DET GS '
        'HOU IND LAC LAL MEM MIA MIL MIN NO NY '
        'OKC ORL PHI PHO POR SA SAC TOR UTA WAS'
    ).split()
}

# Get game schedule tomorrow

In [18]:
# webscrape from MSN
# Browser-like headers sent with the schedule request
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}
# Today's date as a YYYYMMDD string
today = datetime.date.today().strftime('%Y%m%d')
# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status stops here on an HTTP error instead of parsing an
# error page as if it were the schedule.
games_today = requests.get('https://www.msn.com/en-us/sports/nba/schedule', headers=headers, timeout=30)
games_today.raise_for_status()
html_soup = BeautifulSoup(games_today.content, 'html.parser')

In [19]:
# convert date to the number of days after the first game
def convert_days(date):
    """Return the number of whole days from 1983-10-28 to ``date``.

    Args:
        date: A timestamp from which a ``pd.Timestamp`` can be subtracted.

    Returns:
        The ``.days`` component of ``date - pd.Timestamp(1983, 10, 28)``;
        negative for dates before the reference day.
    """
    return (date - pd.Timestamp(year=1983, month=10, day=28)).days

In [20]:
# get the predictors from past data
# NOTE(review): this cell reads `team`, which is not defined anywhere in the
# visible notebook. It must be loaded before this cell can run (presumably a
# per-season team statistics table; confirm the source).

# Team codes for today's games: inside the schedule block whose id is today's
# date, every fifth table cell starting at offset 2 holds one team code.
teamstoday = []
page = html_soup.find_all('div',{'id':today})[0].find_all('td')
for cell in page[2::5]:
    teamstoday.append(cell.text.split('\n')[1].strip())

all_games = pd.read_sql_query("select * from raw", con=engine)
# These derived columns do not depend on the matchup, so they are computed
# once here instead of on every loop iteration.
all_games['days'] = pd.to_datetime(all_games['game_date']).apply(convert_days)
# Only matchups written as "AWAY @ HOME" yield a hometeam; other rows get NaN.
all_games['hometeam'] = all_games['matchup'].str.extract(r'\w* @ (\w*)')

# Predictor names: every model column except the '2diff' target.
feature_columns = final.drop(columns=['2diff']).columns
# Columns removed from a `team` row before its values are copied into the predictors.
team_meta_columns = ['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']
# Today's date expressed on the same day scale as all_games['days'].
today_days = convert_days(pd.Timestamp.today())

s = "" # email string
# teamstoday is consumed in consecutive (away, home) pairs
for i in range (0, len(teamstoday), 2):
    away = team_dict[teamstoday[i]]
    home = team_dict[teamstoday[i+1]]
    s += away + ',' + home + '\n'

    # Most recent earlier game between the two teams, with either one hosting
    thiscomp1 = all_games[(all_games['team_abbreviation'] == away) & (all_games['hometeam'] == home)].reset_index(drop=True)
    thiscomp2 = all_games[(all_games['team_abbreviation'] == home) & (all_games['hometeam'] == away)].reset_index(drop=True)
    target = pd.concat([thiscomp1, thiscomp2]).sort_values('days').iloc[-1].drop(labels=['team_id','team_name','game_date','matchup','min'])
    daysdiff = today_days - target['days']

    # One zero per predictor. The original built final.shape[1] zeros against an
    # index that has one entry fewer (the target is dropped), which raises a
    # length-mismatch ValueError.
    data = pd.Series(0, index=feature_columns)
    data['playoff'] = 0
    # NOTE(review): the season dummy is hard-coded; confirm '22020' is the
    # intended season column whenever this is run.
    data['22020'] = 1
    # 'opposite' is 1 when the previous game was hosted by today's away team
    data['opposite'] = 0 if target['hometeam'] == home else 1
    data['W'] = 1 if target['wl'] == 'W' else 0

    # compute the predictors
    data['1diff'] = target['pts'] - target['pts_oppos']
    data['daysdiff'] = daysdiff
    # NOTE(review): the positional slices below depend on the exact column
    # order of `final`; verify them against final.columns after any change
    # to the feature set.
    data[28:64] = target[4:-2]
    data[66:115] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['team_abbreviation'])].drop(columns=team_meta_columns).iloc[0]
    data[115:-7] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['hometeam'])].drop(columns=team_meta_columns).iloc[0]
    # own-minus-opponent differences, named as in the training frame
    for stat in ('blk', 'oreb', 'reb', 'ast', 'stl', 'tov', 'pf'):
        data[stat + '_diff'] = data[stat] - data[stat + '_oppos']

    # prediction
    s += str(rfr.predict(data.values.reshape(1,-1))) + '\n' # email string

## Over/Under

In [21]:
# Read in training data from database
work = pd.read_sql_query("select * from train_total", con=engine)

# playoffs indicator: taken from the first digit of the 5-digit season_id
# (a leading 2 becomes 0, a leading 4 becomes 1)
work['playoff'] = work['season_id'].str.extract(r'(\d)\d{4}').astype(int)
work['playoff'] = work['playoff'].replace({2: 0, 4: 1})

# Get season after 1995
# Rewrite the leading digit to '2' so every row of a season shares one id.
# regex=True is required: since pandas 2.0 the pattern is otherwise treated as
# a literal string and nothing would be replaced.
work['season_id'] = work['season_id'].str.replace(r'^\d', '2', regex=True)
work = work[work['season_id'] >= '21996']

# drop columns we dont need
train = work.drop(columns=['team_id','team_abbreviation','game_date','matchup','min','days'])

# compute stat difference: own value minus the opponent's value,
# stored as '<stat>_diff' for each box-score stat below
for stat in ('blk', 'oreb', 'reb', 'ast', 'stl', 'tov', 'pf'):
    train[stat + '_diff'] = train[stat] - train[stat + '_oppos']

In [22]:
# dummy variables
# One-hot encode season and win/loss (first level dropped), then append every
# remaining column except the identifiers and the encoded sources.
season_dummies = pd.get_dummies(train['season_id'])
win_dummy = pd.get_dummies(train['wl'], drop_first=True)
remaining = train.drop(columns=['awayteam', 'season_id', 'wl', 'hometeam', 'game_id'])
final = pd.concat([season_dummies, win_dummy, remaining], axis=1)

# split into train and test dataset (10% held out, fixed seed)
tr, te = train_test_split(final, test_size=0.1, random_state=0)

# Model training

In [23]:
# random forest regressor with a fixed seed, using all available cores
rfr = RandomForestRegressor(
    max_depth=12,
    min_samples_split=64,
    n_jobs=-1,
    random_state=0,
)

In [24]:
# fit the model: '2diff' is the target, every other column is a predictor
rfr.fit(X=tr.drop(columns=['2diff']), y=tr['2diff'])

RandomForestRegressor(max_depth=12, min_samples_split=64, n_jobs=-1,
                      random_state=0)

In [25]:
# Mean absolute error of the '2diff' predictions on the held-out split
mean_absolute_error(
    y_true=te['2diff'],
    y_pred=rfr.predict(te.drop(columns=['2diff'])),
)

14.881988864855604

In [26]:
# Mean absolute error of the '2diff' predictions on the training split
mean_absolute_error(
    y_true=tr['2diff'],
    y_pred=rfr.predict(tr.drop(columns=['2diff'])),
)

13.022569548413044

## Get game schedule tomorrow

In [27]:
# webscrape from MSN
# NOTE(review): this cell reads `team`, which is not defined anywhere in the
# visible notebook. It must be loaded before this cell can run (presumably a
# per-season team statistics table; confirm the source).

# Team codes for today's games: inside the schedule block whose id is today's
# date, every fifth table cell starting at offset 2 holds one team code.
teamstoday = []
page = html_soup.find_all('div',{'id':today})[0].find_all('td')
for cell in page[2::5]:
    teamstoday.append(cell.text.split('\n')[1].strip())

# get raw data from database
all_games = pd.read_sql_query("select * from raw", con=engine)
# These derived columns do not depend on the matchup, so they are computed
# once here instead of on every loop iteration.
all_games['days'] = pd.to_datetime(all_games['game_date']).apply(convert_days)
# Only matchups written as "AWAY @ HOME" yield a hometeam; other rows get NaN.
all_games['hometeam'] = all_games['matchup'].str.extract(r'\w* @ (\w*)')

# Predictor names: every model column except the '2diff' target.
feature_columns = final.drop(columns=['2diff']).columns
# Columns removed from a `team` row before its values are copied into the predictors.
team_meta_columns = ['season_id','team_abbreviation','team_name','TEAM_ID','TEAM_NAME','GP','W','L','CFID','CFPARAMS','season']
# Today's date expressed on the same day scale as all_games['days'].
today_days = convert_days(pd.Timestamp.today())

# get predictors from past data
# teamstoday is consumed in consecutive (away, home) pairs; results are
# appended to the email string `s` started by the earlier prediction cell.
for i in range (0, len(teamstoday), 2):
    away = team_dict[teamstoday[i]]
    home = team_dict[teamstoday[i+1]]
    s += away + ',' + home + '\n' # email string

    # Most recent earlier game between the two teams, with either one hosting
    thiscomp1 = all_games[(all_games['team_abbreviation'] == away) & (all_games['hometeam'] == home)].reset_index(drop=True)
    thiscomp2 = all_games[(all_games['team_abbreviation'] == home) & (all_games['hometeam'] == away)].reset_index(drop=True)
    target = pd.concat([thiscomp1, thiscomp2]).sort_values('days').iloc[-1].drop(labels=['team_id','team_name','game_date','matchup','min'])
    daysdiff = today_days - target['days']

    # One zero per predictor. The original built final.shape[1] zeros against an
    # index that has one entry fewer (the target is dropped), which raises a
    # length-mismatch ValueError.
    data = pd.Series(0, index=feature_columns)
    data['playoff'] = 0
    # NOTE(review): the season dummy is hard-coded; confirm '22020' is the
    # intended season column whenever this is run.
    data['22020'] = 1
    # 'opposite' is 1 when the previous game was hosted by today's away team
    data['opposite'] = 0 if target['hometeam'] == home else 1
    data['W'] = 1 if target['wl'] == 'W' else 0

    # compute the predictors
    data['1diff'] = target['pts'] - target['pts_oppos']
    data['daysdiff'] = daysdiff
    # NOTE(review): the positional slices below depend on the exact column
    # order of `final`; verify them against final.columns after any change
    # to the feature set.
    data[28:64] = target[4:-2]
    data[66:115] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['team_abbreviation'])].drop(columns=team_meta_columns).iloc[0]
    data[115:-7] = team[(team['season'] == target['season_id']) & (team['team_abbreviation'] == target['hometeam'])].drop(columns=team_meta_columns).iloc[0]
    # own-minus-opponent differences, named as in the training frame
    for stat in ('blk', 'oreb', 'reb', 'ast', 'stl', 'tov', 'pf'):
        data[stat + '_diff'] = data[stat] - data[stat + '_oppos']

    # prediction
    s += str(rfr.predict(data.values.reshape(1,-1))) + '\n' # email string

# Email the predictions

In [29]:
import os
import smtplib, ssl

port = 465  # For SSL
smtp_server = "smtp.gmail.com"
sender_email = "leowei08@gmail.com"
receiver_email1 = "leowei08@gmail.com"
# Read the account password from the SMTP_PASSWORD environment variable so it
# is never stored in the notebook; the original placeholder is the fallback.
password = os.environ.get('SMTP_PASSWORD', 'password')
# Raw message: a Subject header, a blank separator, then the body
message = """\
Subject: Predictions Today {today}


{content}."""

# Send the accumulated prediction text `s` over an SSL-wrapped SMTP connection
context = ssl.create_default_context()
with smtplib.SMTP_SSL(smtp_server, port, context=context) as server:
    server.login(sender_email, password)
    server.sendmail(
        sender_email,
        receiver_email1,
        message.format(today=datetime.date.today().strftime('%Y%m%d'), content=s),
    )