In [1]:
# Load necessary libraries

import platform
import numpy as np
import pandas as pd
import sklearn as sk
import os

In [7]:
# Load the dataset: NBA statistics from the regular season
df = pd.read_csv('COMBINEDgamesWithInfo2016-19.csv')

In [8]:
from sklearn.metrics import f1_score, make_scorer, classification_report

scorer = make_scorer(f1_score, pos_label = None, average = 'weighted')

In [9]:
# Examine the dataset with its initial formatting
df.head()

Unnamed: 0.1,Unnamed: 0,Home,Away,W_PCT,REB,TOV,PLUS_MINUS,OFF_RATING,DEF_RATING,TS_PCT,Result,Date
0,0,Philadelphia 76ers,Chicago Bulls,-1.764924,-3.739935,2.574073,-2.619013,-2.549887,0.99368,-0.201678,0,11/25/2016
1,1,Los Angeles Lakers,Golden State Warriors,-1.945127,-0.311661,0.980599,-2.601317,-1.762117,1.925254,-1.73443,0,11/25/2016
2,2,New York Knicks,Charlotte Hornets,-0.376305,0.0,1.838624,-1.044066,-0.497539,1.179995,-0.322685,1,11/25/2016
3,3,Phoenix Suns,Minnesota Timberwolves,0.143102,-0.089046,-0.183862,-0.884802,-1.036539,0.186315,-0.72604,0,11/25/2016
4,4,Boston Celtics,San Antonio Spurs,-1.060015,0.0,0.429012,-0.814018,-0.704847,0.37263,-0.645369,0,11/25/2016


In [10]:
df.dtypes

# Note the Date is read as an object field

Unnamed: 0      int64
Home           object
Away           object
W_PCT         float64
REB           float64
TOV           float64
PLUS_MINUS    float64
OFF_RATING    float64
DEF_RATING    float64
TS_PCT        float64
Result          int64
Date           object
dtype: object

In [26]:
# Don't read the first row because it is blank, and read the date column as a date

df = pd.read_csv('COMBINEDgamesWithInfo2016-19.csv', parse_dates = ['Date'])

# Correct column names
df.columns = ['Rank', 'Home', 'Away', 'W_PCT',  'REB', 'TOV', 'PLUS_MINUS', 'OFF_RATING', 'DEF_RATING', 'TS_PCT', 'Result', 'Date']


In [27]:
df.shape

(3024, 12)

In [28]:
# View the updated dataset
df.head()

Unnamed: 0,Rank,Home,Away,W_PCT,REB,TOV,PLUS_MINUS,OFF_RATING,DEF_RATING,TS_PCT,Result,Date
0,0,Philadelphia 76ers,Chicago Bulls,-1.764924,-3.739935,2.574073,-2.619013,-2.549887,0.99368,-0.201678,0,2016-11-25
1,1,Los Angeles Lakers,Golden State Warriors,-1.945127,-0.311661,0.980599,-2.601317,-1.762117,1.925254,-1.73443,0,2016-11-25
2,2,New York Knicks,Charlotte Hornets,-0.376305,0.0,1.838624,-1.044066,-0.497539,1.179995,-0.322685,1,2016-11-25
3,3,Phoenix Suns,Minnesota Timberwolves,0.143102,-0.089046,-0.183862,-0.884802,-1.036539,0.186315,-0.72604,0,2016-11-25
4,4,Boston Celtics,San Antonio Spurs,-1.060015,0.0,0.429012,-0.814018,-0.704847,0.37263,-0.645369,0,2016-11-25


In [29]:
# The Date field has been updated to read as a datetime field
# May not do anything with the field yet
df.dtypes

Rank                   int64
Home                  object
Away                  object
W_PCT                float64
REB                  float64
TOV                  float64
PLUS_MINUS           float64
OFF_RATING           float64
DEF_RATING           float64
TS_PCT               float64
Result                 int64
Date          datetime64[ns]
dtype: object

In [30]:
# Create new feature that notes whether the home team won
df['Home Win'] = df['OFF_RATING'] < df['DEF_RATING']

# Assign "class values" to the Home Win column

### This will also be the value that we want to predict ###
y_true = df['Home Win'].values

In [31]:
# Review new column with correctly populated fields
df.head()

Unnamed: 0,Rank,Home,Away,W_PCT,REB,TOV,PLUS_MINUS,OFF_RATING,DEF_RATING,TS_PCT,Result,Date,Home Win
0,0,Philadelphia 76ers,Chicago Bulls,-1.764924,-3.739935,2.574073,-2.619013,-2.549887,0.99368,-0.201678,0,2016-11-25,True
1,1,Los Angeles Lakers,Golden State Warriors,-1.945127,-0.311661,0.980599,-2.601317,-1.762117,1.925254,-1.73443,0,2016-11-25,True
2,2,New York Knicks,Charlotte Hornets,-0.376305,0.0,1.838624,-1.044066,-0.497539,1.179995,-0.322685,1,2016-11-25,True
3,3,Phoenix Suns,Minnesota Timberwolves,0.143102,-0.089046,-0.183862,-0.884802,-1.036539,0.186315,-0.72604,0,2016-11-25,True
4,4,Boston Celtics,San Antonio Spurs,-1.060015,0.0,0.429012,-0.814018,-0.704847,0.37263,-0.645369,0,2016-11-25,True


In [32]:
# Establishing the baseline win percentage for the Home Teams overall
n_games = df['Home Win'].count()
n_homewins = df['Home Win'].sum()
win_percentage = n_homewins / n_games

print('Home Win percentage: {0:.2f}%'.format(100 * win_percentage))

Home Win percentage: 51.03%


In [33]:
# Predicting the baseline for Home Teams wins with simple classifier

from sklearn.metrics import f1_score

y_pred = [1] * len(y_true)
print('F1: {0:.4f}%'.format(f1_score(y_true, y_pred, pos_label = None, average = 'weighted') * 100))

F1: 34.4786%


In [34]:
# Create two new columns, with all of the values set to False
df['Home Last Win'] = False
df['Visitor Last Win'] = False

In [38]:
# Determining whether the home and visitor teams won their last games
# This will update the Home Last Win & Visitor Last Win columns to either remain False or change to True

from collections import defaultdict

won_last = defaultdict(int)   # The default value of int is 0

# Inefficient method to deterime previous wins
for index, row in df.iterrows():
    Home= row['Home']
    Away = row['Away']
    row['Home Last Win'] = won_last[Home]
    row['Visitor Last Win'] = won_last[Away]
    # Set current win
    won_last[home_team] = row['Home Win']
    won_last[visitor_team] = not row['Home Win']
df.loc[20:25]

Unnamed: 0,Rank,Home,Away,W_PCT,REB,TOV,PLUS_MINUS,OFF_RATING,DEF_RATING,TS_PCT,Result,Date,Home Win,Home Last Win,Visitor Last Win
20,20,Philadelphia 76ers,Cleveland Cavaliers,-3.191055,-1.727048,2.927207,-3.238842,-3.549449,0.599018,-1.84154,0,2016-11-27,True,False,False
21,21,Los Angeles Lakers,Atlanta Hawks,-0.809592,-1.372782,-0.996496,-1.254839,0.877163,3.751743,0.640536,1,2016-11-27,True,False,False
22,22,Phoenix Suns,Denver Nuggets,-0.425825,-2.081315,-0.186843,-0.610462,-0.632373,-0.031527,-0.080067,0,2016-11-27,True,False,False
23,23,Brooklyn Nets,Sacramento Kings,-0.567766,-0.309983,1.183339,-0.898736,-0.877163,0.157636,-0.080067,0,2016-11-27,True,False,False
24,24,Orlando Magic,Milwaukee Bucks,-0.283883,1.195649,-0.311405,-0.983523,-1.366742,-0.252218,-1.801506,0,2016-11-27,True,False,False
25,25,Portland Trail Blazers,Houston Rockets,-0.657137,-1.151366,-2.242116,-1.203967,-0.611974,1.292617,-0.720603,0,2016-11-27,True,False,False


In [41]:
df.loc[90:95][['Date', 'Home', 'Away', 'Home', 'PLUS_MINUS', 
               'TS_PCT', 'Result', 'Home Win', 'Home Last Win', 'Visitor Last Win']]

Unnamed: 0,Date,Home,Away,Home.1,PLUS_MINUS,TS_PCT,Result,Home Win,Home Last Win,Visitor Last Win
90,2016-12-06,Miami Heat,New York Knicks,Miami Heat,0.055359,-1.059045,0,False,False,False
91,2016-12-06,Utah Jazz,Phoenix Suns,Utah Jazz,2.177462,1.567386,1,False,False,False
92,2016-12-06,Minnesota Timberwolves,San Antonio Spurs,Minnesota Timberwolves,-1.070278,-0.635427,0,True,False,False
93,2016-12-06,Washington Wizards,Orlando Magic,Washington Wizards,0.295249,1.397939,0,True,False,False
94,2016-12-06,Memphis Grizzlies,Philadelphia 76ers,Memphis Grizzlies,1.679229,-0.211809,1,False,False,False
95,2016-12-07,New York Knicks,Cleveland Cavaliers,New York Knicks,-1.294385,-1.655125,0,True,False,False


In [42]:
# Basic Decision Tree Classifier set up
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state = 14) # Remove random_state to get non-replicable results

In [83]:
from sklearn.model_selection import cross_val_score

# Use selected features as input for the classifier (target)
X_previouswins = df[['Home Last Win', 'Visitor Last Win']].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_previouswins, y_true, scoring = scorer)

# Print results
print("Using just the last result from the home and visitor teams")
print('F1: {0:.4f}%'.format(np.mean(scores) * 100))

Using just the last result from the home and visitor teams
F1: 34.4786%


In [75]:
# Taking into consideration winning streaks - What are the teams' win streaks coming into the game?
df["Home Win Streak"] = 0
df["Away Win Streak"] = 0

# Did the home and visitor teams win their last game?
from collections import defaultdict
win_streak = defaultdict(int)

for index, row in df.iterrows():  # Note that this is not the most efficient method
    Home = row["Home"]
    Away = row["Away"]
    row["Home Win Streak"] = win_streak[Home]
    row["Away Win Streak"] = win_streak[Away]
    df.loc[index] = row    
    
    # Set current win streak
    if row["Home Win"]:
        win_streak[Home] += 1
        win_streak[Away] = 0
    else:
        win_streak[Home] = 0
        win_streak[Away] += 1

In [76]:
df.loc[50:60][['Date', 'Away', 'Home', 
               'Home Win', 'Home Win Streak', 'Away Win Streak']]

Unnamed: 0,Date,Away,Home,Home Win,Home Win Streak,Away Win Streak
50,2016-12-01,Houston Rockets,Golden State Warriors,False,0,1
51,2016-12-01,Milwaukee Bucks,Brooklyn Nets,True,3,1
52,2016-12-01,LA Clippers,Cleveland Cavaliers,True,0,0
53,2016-12-01,Miami Heat,Utah Jazz,False,0,0
54,2016-12-01,Dallas Mavericks,Charlotte Hornets,False,0,3
55,2016-12-01,Orlando Magic,Memphis Grizzlies,False,1,3
56,2016-12-02,Orlando Magic,Philadelphia 76ers,True,3,4
57,2016-12-02,Washington Wizards,San Antonio Spurs,False,0,1
58,2016-12-02,Cleveland Cavaliers,Chicago Bulls,False,0,1
59,2016-12-02,Minnesota Timberwolves,New York Knicks,True,4,0


In [94]:
# Use selected features as input for the classifier (target)
from sklearn.model_selection import cross_val_score
# Use selected features as input for the classifier (target)
X_winstreak = df[["Home Last Win", "Visitor Last Win", "Home Win Streak", "Visitor Win Streak"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_winstreak, y_true, scoring = scorer)

# Print results
print("Using whether the home team is ranked higher")
print("When taking into consideration whether a team is on a win streak, the model's results show 66.5576%")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))



Using whether the home team is ranked higher
When taking into consideration whether a team is on a win streak, the model's results show 66.5576%
F1: 66.5576%


In [105]:
# Identify which team is higher in the standings, based on the previous year's regular season final standings
# Load the standings data file

rank = pd.read_csv('ranking.csv', index_col = 'TEAM')

In [106]:
rank.head(3)

Unnamed: 0_level_0,TEAM_ID,LEAGUE_ID,SEASON_ID,STANDINGSDATE,CONFERENCE,G,W,L,W_PCT,HOME_RECORD,ROAD_RECORD,RETURNTOPLAY
TEAM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Golden State,1610612744,0,22021,2021-11-17,West,14,12,2,0.857,8-1,4-1,
Phoenix,1610612756,0,22021,2021-11-17,West,14,11,3,0.786,6-2,5-1,
Dallas,1610612742,0,22021,2021-11-17,West,14,9,5,0.643,6-1,3-4,


In [108]:
# Rename columns in the rank DataFrame
rank.columns = ['TEAM_ID', 'TEAM', 'SEASON_ID', 'STANDINGSDATE', 'G', 'W', 'L', 'HOME_RECORD', 'ROAD_RECORD','CONFERENCE', 
                'A', 'W_PCT']

In [109]:
rank.head(3)


Unnamed: 0_level_0,TEAM_ID,TEAM,SEASON_ID,STANDINGSDATE,G,W,L,HOME_RECORD,ROAD_RECORD,CONFERENCE,A,W_PCT
TEAM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Golden State,1610612744,0,22021,2021-11-17,West,14,12,2,0.857,8-1,4-1,
Phoenix,1610612756,0,22021,2021-11-17,West,14,11,3,0.786,6-2,5-1,
Dallas,1610612742,0,22021,2021-11-17,West,14,9,5,0.643,6-1,3-4,


In [118]:
# Create a new feature -> Home Team Ranks Higher
# First create a function that iterates through the df to determine if the home team has a higher rank based on rank df

def home_team_ranks_higher(row):
    HOME_RECORD = row["Home Team"]
    ROAD_RECORD = row["Visitor Team"]
    
    
    home_rank = rank.loc[home_team]["Rank"]
    visitor_rank = rank.loc[visitor_team]["Rank"]
    
    return home_rank < visitor_rank   # The higher ranking will be the lower number

In [120]:
# Decision Tree Classifier based on if Home Team has a Higher Ranking

# Use selected features as input for the classifier (target)
X_homehigher = df[["Home Last Win", "Visitor Last Win", "Home Team Ranks Higher"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_homehigher, y_true, scoring = scorer)

# Print results
print("Using whether the home team is ranked higher")
print("When taking into consideration whether the home team has a higher rank in the previous season's standings, the model's results show 59.45%")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

KeyError: "['Home Team Ranks Higher'] not in index"

In [121]:
# Build a function that determines whether a team won the last matchup between the 2 teams
# This does not take into consideration the home/visitor teams

last_game_winner = defaultdict(int)

def home_team_won_last(row):
    home_team = row['Home Team']
    visitor_team = row['Visistor Team']
    
    # Sort for a consistent ordering
    teams = tuple(sorted([home_team, visitor_team]))
    # Parse the row for which team won the last matchup, then add a 1 if the Home Team won
    result = 1 if last_game_winner[teams] == row['Home Team'] else 0
    
    # Update record for next matchup
    winner = row['Home Team'] if row['Home Win'] else row['Visitor Team']
    last_game_winner[teams] = winner
    
    return result

# Function works, but is not applying across DataFrame (see cell below)

In [123]:
 Display the selected columns on the sliced DataFrame (10 random rows)
df[90:200][['Date', 'Visitor Team', 'Visitor Score', 'Home Team', 'Home Score', 'Home Win', 'Home Team Won Last']]

SyntaxError: invalid syntax (771572806.py, line 1)

In [124]:
# Use selected features as input for the classifier (target)
X_home_higher = df[['Home Last Win', 'Visitor Last Win', "Home Team Ranks Higher", "Home Team Won Last"]].values

# Decision Tree Classifier
dtc = DecisionTreeClassifier(random_state = 14)
scores = cross_val_score(dtc, X_home_higher, y_true, scoring = scorer)

# Print results
print("Using whether the home team won the last matchup")
print("F1: {0:.4f}%".format(np.mean(scores) * 100))

KeyError: "['Home Team Ranks Higher', 'Home Team Won Last'] not in index"