In [8]:
import numpy as np
import pandas as pd
from statistics import mean

In [9]:
#load the nba data of past games 

data = pd.read_csv('nba.csv')
data

Unnamed: 0,TEAM,MATCHUP,GAMEDATE,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,LAL,LAL @ NOP,5/16/2021,W,240,110,45,96,46.9,11,...,60.0,8,34,42,30,14,5,10,16,12
1,NOP,NOP vs. LAL,5/16/2021,L,240,98,37,78,47.4,9,...,71.4,8,39,47,26,4,5,21,18,-12
2,LAC,LAC @ OKC,5/16/2021,L,240,112,45,106,42.5,10,...,80.0,16,28,44,17,8,3,3,14,-5
3,OKC,OKC vs. LAC,5/16/2021,W,240,117,50,94,53.2,8,...,56.3,14,40,54,20,1,12,15,11,5
4,IND,IND @ TOR,5/16/2021,W,240,125,48,95,50.5,15,...,77.8,10,37,47,34,8,3,13,19,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,BOS,BOS vs. CHA,4/4/2021,W,240,116,43,94,45.7,21,...,81.8,14,30,44,29,11,5,6,20,30
696,BKN,BKN @ CHI,4/4/2021,L,240,107,45,93,48.4,11,...,85.7,4,31,35,28,3,3,5,17,-8
697,CHI,CHI vs. BKN,4/4/2021,W,240,115,43,78,55.1,8,...,70.0,6,34,40,29,4,3,9,11,8
698,PHI,PHI vs. MEM,4/4/2021,L,240,100,37,89,41.6,8,...,64.3,16,34,50,23,4,6,11,19,-16


In [10]:
#list of features 

features = ['PTS','FGM','FGA','FG%','3PM','3PA','3P%','FTM','FTA','FT%','OREB','DREB','REB','AST','STL','BLK','TOV','PF','+/-']

In [11]:
#Define a function to get the average of the features for the previous 5 games for each team in each matchup.
#This is essentially a 5 game moving average for all the features. 

#The function takes each team's abbreviation as an argument. 

def trailingStats(team):
    #Get a subset of the dataframe of matchups for the current team
    teamData = data[data['TEAM'] == team].copy()
    
    #create a copy of the dataframe subset and reset the index
    
    teamData = teamData.reset_index(drop=True)
    
    #create empty dataframe
    
    df = pd.DataFrame()
    
    #Create empty list to store dates
    
    dates = [] 
    
    # iterate over each game in the team's dataframe except the first five games
   
    for i in range(len(teamData)-5):
        #empty list
        trailing = []
        #iterate over each feature and append the values for that feature for the last five games to the trailing list
        for f in features:
            trailing.append(float(teamData.loc[i+1,f]))
            trailing.append(float(teamData.loc[i+2,f]))
            trailing.append(float(teamData.loc[i+3,f]))
            trailing.append(float(teamData.loc[i+4,f]))
            trailing.append(float(teamData.loc[i+5,f]))
            
            #Take the mean of the last five values for that feature
            trailingMean = mean(trailing)
            #Add the mean of the last five values for that feature to the current matchup.
            df.loc[i,f] = trailingMean
            # Reset the trailing list to zero for the next iteration
            trailing = []
        #Add the date of the game to the dates list
        dates.append(teamData.loc[i,'GAMEDATE'])
    #Add the dates to the GAMEDATE column in the final dataframe once iteration is finished      
    df['GAMEDATE'] = dates
    #Add a lable for the team
    df['TEAM'] = team
    #Return a dataframe that contains all matchups for a team and the averages of the features for the previous 5 games
    return(df)

In [12]:
#Test the function on Boston
#Each feature is the average of that stat for the previous 5 games, which caputres trends leading up to the game
trailingStats('BOS')

Unnamed: 0,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,...,DREB,REB,AST,STL,BLK,TOV,PF,+/-,GAMEDATE,TEAM
0,112.4,41.6,90.6,46.24,14.6,39.4,37.16,14.6,19.4,74.44,...,32.2,40.6,24.0,7.4,6.0,12.2,21.0,-5.6,5/16/2021,BOS
1,114.0,42.6,90.8,47.18,15.2,40.0,37.92,13.6,18.2,73.88,...,32.4,40.8,24.2,7.4,6.6,11.4,20.8,-1.6,5/15/2021,BOS
2,119.0,44.0,88.4,49.72,16.8,39.6,42.32,14.2,18.4,76.94,...,32.8,40.6,27.2,6.2,6.2,13.6,20.4,-2.0,5/12/2021,BOS
3,123.4,44.6,91.8,48.6,16.8,39.8,42.08,17.4,22.2,79.0,...,34.0,44.6,27.2,6.0,6.6,14.0,21.4,0.2,5/11/2021,BOS
4,122.6,44.4,93.6,47.32,17.0,40.2,42.22,16.8,21.6,78.46,...,35.4,48.6,26.6,6.4,7.6,14.0,21.8,3.2,5/9/2021,BOS
5,125.8,45.6,95.8,47.64,17.0,42.2,41.06,17.6,22.0,80.4,...,34.4,47.0,27.0,8.6,7.4,14.0,24.0,6.8,5/7/2021,BOS
6,120.2,42.2,94.2,44.74,15.6,41.4,38.54,20.2,24.8,82.32,...,33.2,46.2,25.0,8.2,6.0,14.4,22.0,-4.6,5/5/2021,BOS
7,117.2,41.2,95.2,43.16,15.0,42.0,36.38,19.8,25.0,78.74,...,34.0,48.0,23.4,8.8,6.0,14.6,21.8,-3.6,5/2/2021,BOS
8,108.4,38.0,89.6,42.4,13.8,42.4,32.9,18.6,22.6,81.24,...,34.6,47.2,21.2,8.8,5.0,16.6,19.0,-1.6,4/30/2021,BOS
9,103.6,36.4,86.6,42.2,12.8,39.4,33.4,18.0,21.2,84.44,...,34.8,44.2,20.2,8.6,4.0,17.0,17.2,-4.6,4/28/2021,BOS


In [14]:
#Call the function all 30 NBA teams

ATL = trailingStats('ATL')
BKN = trailingStats('BKN')
BOS = trailingStats('BOS')
CHA = trailingStats('CHA')
CHI = trailingStats('CHI')
CLE = trailingStats('CLE')
DAL = trailingStats('DAL')
DEN = trailingStats('DEN')
DET = trailingStats('DET')
GSW = trailingStats('GSW')
HOU = trailingStats('HOU')
IND = trailingStats('IND')
LAC = trailingStats('LAC')
LAL = trailingStats('LAL')
MIA = trailingStats('MIA')
MEM = trailingStats('MEM')
MIN = trailingStats('MIN')
MIL = trailingStats('MIL')
NOP = trailingStats('NOP')
NYK = trailingStats('NYK')
OKC = trailingStats('OKC')
ORL = trailingStats('ORL')
PHI = trailingStats('PHI')
PHX = trailingStats('PHX')
POR = trailingStats('POR')
SAC = trailingStats('SAC')
SAS = trailingStats('SAS')
TOR = trailingStats('TOR')
UTA = trailingStats('UTA')
WAS = trailingStats('WAS')



In [13]:
#Save the dataframes of all the teams to a list

abbreviations = [ATL,
BKN,
BOS,
CHA,
CHI,
CLE,
DAL,
DEN,
DET,
GSW,
HOU,
IND,
LAC,
LAL,
MEM,
MIA,
MIL,
MIN,
NOP,
NYK,
OKC,
ORL,
PHI,
PHX,
POR,
SAC,
SAS,
TOR,
UTA,
WAS]

In [15]:
#Create a dataframe of matchups. Each team has its own row.
#There are two rows per game

data2 = data[['TEAM','MATCHUP', 'GAMEDATE','W/L']]
data2

Unnamed: 0,TEAM,MATCHUP,GAMEDATE,W/L
0,LAL,LAL @ NOP,5/16/2021,W
1,NOP,NOP vs. LAL,5/16/2021,L
2,LAC,LAC @ OKC,5/16/2021,L
3,OKC,OKC vs. LAC,5/16/2021,W
4,IND,IND @ TOR,5/16/2021,W
...,...,...,...,...
695,BOS,BOS vs. CHA,4/4/2021,W
696,BKN,BKN @ CHI,4/4/2021,L
697,CHI,CHI vs. BKN,4/4/2021,W
698,PHI,PHI vs. MEM,4/4/2021,L


In [16]:
#test merging data2, containing matchups, and LAL, containing lakers trailing averages

test = pd.merge(data2, LAL, how = 'left', left_on = ['TEAM','GAMEDATE'], right_on = ['TEAM','GAMEDATE'])

In [17]:
#Perform a union operation on the dataframes containing the trailing stats for all the teams

union = pd.concat([ATL,
BKN,
BOS,
CHA,
CHI,
CLE,
DAL,
DEN,
DET,
GSW,
HOU,
IND,
LAC,
LAL,
MEM,
MIA,
MIL,
MIN,
NOP,
NYK,
OKC,
ORL,
PHI,
PHX,
POR,
SAC,
SAS,
TOR,
UTA,
WAS])

In [269]:
#Merge the dataframe containing each team's trailing statistics with the matchup data
#This returns matchup information and each team's 5 game trailing average stats
data3 = pd.merge(data2, union, how = 'inner', left_on = ['TEAM','GAMEDATE'], right_on = ['TEAM','GAMEDATE'])
data3

Unnamed: 0,TEAM,MATCHUP,GAMEDATE,W/L,PTS,FGM,FGA,FG%,3PM,3PA,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,+/-
0,LAL,LAL @ NOP,5/16/2021,W,114.2,41.2,87.0,47.36,10.8,27.4,...,73.86,11.4,34.4,45.8,25.4,6.2,3.6,14.4,18.6,3.8
1,NOP,NOP vs. LAL,5/16/2021,L,111.6,42.2,96.0,43.94,9.8,30.2,...,65.30,13.8,35.8,49.6,24.0,8.6,4.8,12.2,20.0,-5.2
2,LAC,LAC @ OKC,5/16/2021,L,112.2,40.4,81.8,49.58,13.6,35.6,...,88.86,7.6,36.2,43.8,25.8,6.8,4.2,16.8,17.2,10.6
3,OKC,OKC vs. LAC,5/16/2021,W,98.2,36.6,87.4,41.94,8.8,31.0,...,73.44,9.0,32.6,41.6,20.0,8.6,2.4,14.4,18.2,-24.0
4,IND,IND @ TOR,5/16/2021,W,118.8,45.4,94.8,47.98,12.4,34.6,...,81.16,9.0,34.6,43.6,31.0,8.2,6.8,11.8,21.0,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,NOP,NOP vs. SAC,4/12/2021,W,111.4,40.0,89.4,44.82,9.0,29.6,...,71.28,14.6,35.4,50.0,25.0,9.6,5.6,14.8,17.8,-4.6
546,MEM,MEM vs. CHI,4/12/2021,W,125.0,47.6,94.2,50.84,14.0,33.8,...,74.18,12.8,34.8,47.6,30.0,8.0,5.8,12.0,18.4,7.0
547,CHI,CHI @ MEM,4/12/2021,L,115.0,44.4,87.4,50.94,12.4,31.2,...,78.38,9.6,34.8,44.4,29.4,6.8,4.0,14.6,15.0,3.4
548,HOU,HOU @ PHX,4/12/2021,L,113.0,40.4,85.4,47.46,15.0,38.2,...,69.86,9.8,34.4,44.2,25.4,5.4,4.0,15.6,19.8,-6.8


In [19]:
#For each even numbered row in data3, append a 0 if the away team wins, and a 1 if the home team wins to the empty winner list
winner = []
for i in np.arange(0,len(data3),2):
    if data3.loc[i,'W/L'] == 'W':
        winner.append(0)
    else:
        winner.append(1)
   

In [20]:
#Create a dataframe of teams and their features 

home = pd.DataFrame(columns = ['home' + ' ' + f for f in features])

#Create a dataframe of away teams and their features

away = pd.DataFrame(columns = ['away' + ' ' + f for f in features])

# for each even numbered row in data3, insert the away team information into away dataframe

for i in np.arange(0,len(data3),2):
    away.loc[i] = data3.loc[i,features].tolist()
    
# for each odd numbered row in data3, insert the home team information into a home dataframe

for i in np.arange(1,len(data3),2):
    home.loc[i] = data3.loc[i,features].tolist()

#Create a numbered index for the home dataframe

home['index'] = np.arange(0,len(home))
home = home.set_index('index')

#Create a numbered index for the away dataframe 
away['index'] = np.arange(0,len(away))
away = away.set_index('index')

In [21]:
#Merge the home and away dataframes on the numerical index
fulldata = home.merge(away, left_on='index', right_on='index')

In [22]:
#full data contains 1 row per game. The columns correspond to features, which are the average stats ove the past 5 games
#for the home and away teams

fulldata

Unnamed: 0_level_0,home PTS,home FGM,home FGA,home FG%,home 3PM,home 3PA,home 3P%,home FTM,home FTA,home FT%,...,away FT%,away OREB,away DREB,away REB,away AST,away STL,away BLK,away TOV,away PF,away +/-
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,111.6,42.2,96.0,43.94,9.8,30.2,33.32,17.4,26.2,65.30,...,73.86,11.4,34.4,45.8,25.4,6.2,3.6,14.4,18.6,3.8
1,98.2,36.6,87.4,41.94,8.8,31.0,29.56,16.2,22.6,73.44,...,88.86,7.6,36.2,43.8,25.8,6.8,4.2,16.8,17.2,10.6
2,107.2,40.0,92.0,43.52,11.8,36.2,32.52,15.4,21.0,72.98,...,81.16,9.0,34.6,43.6,31.0,8.2,6.8,11.8,21.0,0.2
3,108.6,40.0,92.2,43.46,11.0,38.2,28.88,17.6,22.8,77.38,...,84.04,11.0,36.6,47.6,27.0,7.6,6.0,12.0,21.2,2.6
4,114.0,41.2,91.0,45.24,12.2,41.6,28.86,19.4,28.2,69.20,...,85.76,10.0,32.4,42.4,25.4,6.4,4.2,10.6,21.2,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,121.4,45.4,85.4,53.28,16.4,34.0,48.92,14.2,17.0,84.94,...,73.88,8.8,32.2,41.0,24.6,8.6,5.6,20.6,20.8,-1.0
271,112.6,39.6,84.8,46.70,15.2,40.6,36.80,18.2,21.2,84.36,...,79.20,11.0,38.2,49.2,19.6,4.6,3.8,10.6,19.2,3.8
272,111.4,40.0,89.4,44.82,9.0,29.6,28.88,22.4,31.6,71.28,...,82.30,9.6,34.6,44.2,27.6,9.2,4.0,11.4,19.4,6.2
273,115.0,44.4,87.4,50.94,12.4,31.2,39.54,13.8,17.8,78.38,...,74.18,12.8,34.8,47.6,30.0,8.0,5.8,12.0,18.4,7.0


In [250]:
#Split fulldata and the target variable, winner, into train and test sets using an 80-20 split

from sklearn.model_selection import train_test_split
X_train, X_test, targets_train, targets_test = train_test_split(fulldata, winner, test_size=0.2, random_state=0)

In [24]:
#build a random forest classifier and fit it on the training set

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=200, random_state=0, min_samples_split = 2, max_depth = 9 )
classifier.fit(X_train, targets_train)

RandomForestClassifier(max_depth=9, n_estimators=200, random_state=0)

In [25]:
#make predictions on the test set and record accuracy.
#accuracy is 65.45%. 
targets_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(targets_test, targets_pred))

0.6545454545454545


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#try a gradient boosting classifier 

gbc=GradientBoostingClassifier(n_estimators=10000,learning_rate=0.005,random_state=100 )

gbc.fit(X_train, targets_train)
y_pred = gbc.predict(X_test)
print(accuracy_score(targets_test, y_pred))

In [26]:
#scale the data for a neural network

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
#convert the training targets form list to dataframe

targets_train2 = pd.DataFrame(targets_train)

In [129]:
#Traing a deep learning neural network with four hidden layers on the training set

from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(38,60,60,38), max_iter= 5000)
mlp.fit(X_train, targets_train2.values.ravel())

MLPClassifier(hidden_layer_sizes=(38, 60, 60, 38), max_iter=5000)

In [130]:
#Make predictinos on the training set using the deep learning neural network

predictions = mlp.predict(X_test)

In [131]:
#Record the accuracy of the neural network
#Accuracy is 67.27%. This is consistent with most accuracy levels in other published models/papers
print(accuracy_score(targets_test, predictions))

0.6727272727272727
