In [1]:
import os
import numpy as np
import pandas as pd
home_folder = os.getcwd()
data_folder = os.path.join(home_folder, "data")
data_filename = os.path.join(data_folder,'nba', "nba.csv")

In [2]:
results = pd.read_csv(data_filename)
results.head()

Unnamed: 0,Date,Start (ET),Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 6,Unnamed: 7,Attend.,Notes
0,Wed Jan 1 2014,6:00p,Dallas Mavericks,87,Washington Wizards,78,Box Score,,15713,
1,Wed Jan 1 2014,7:00p,Indiana Pacers,82,Toronto Raptors,95,Box Score,,18271,
2,Wed Jan 1 2014,8:00p,New Orleans Pelicans,112,Minnesota Timberwolves,124,Box Score,,14002,
3,Wed Jan 1 2014,9:00p,Philadelphia 76ers,114,Denver Nuggets,102,Box Score,,16006,
4,Wed Jan 1 2014,9:30p,Charlotte Bobcats,85,Los Angeles Clippers,112,Box Score,,19160,


In [3]:
results = pd.read_csv(data_filename, parse_dates=["Date"])
# Fix the name of the columns
results.columns = ["Date", "Start Time" , "Visitor Team",
"VisitorPts", "Home Team", "HomePts", "Score Type", "OT?","Attend.", "Notes"]

results.head()

Unnamed: 0,Date,Start Time,Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes
0,2014-01-01,6:00p,Dallas Mavericks,87,Washington Wizards,78,Box Score,,15713,
1,2014-01-01,7:00p,Indiana Pacers,82,Toronto Raptors,95,Box Score,,18271,
2,2014-01-01,8:00p,New Orleans Pelicans,112,Minnesota Timberwolves,124,Box Score,,14002,
3,2014-01-01,9:00p,Philadelphia 76ers,114,Denver Nuggets,102,Box Score,,16006,
4,2014-01-01,9:30p,Charlotte Bobcats,85,Los Angeles Clippers,112,Box Score,,19160,


In [4]:
results["HomeWin"] = results["VisitorPts"] < results["HomePts"]
# "Class values"
y_true = results["HomeWin"].values
results.head()

Unnamed: 0,Date,Start Time,Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes,HomeWin
0,2014-01-01,6:00p,Dallas Mavericks,87,Washington Wizards,78,Box Score,,15713,,False
1,2014-01-01,7:00p,Indiana Pacers,82,Toronto Raptors,95,Box Score,,18271,,True
2,2014-01-01,8:00p,New Orleans Pelicans,112,Minnesota Timberwolves,124,Box Score,,14002,,True
3,2014-01-01,9:00p,Philadelphia 76ers,114,Denver Nuggets,102,Box Score,,16006,,False
4,2014-01-01,9:30p,Charlotte Bobcats,85,Los Angeles Clippers,112,Box Score,,19160,,True


In [5]:
print("Home Win percentage: {0:.1f}%".format(100 * results["HomeWin"].sum() / results["HomeWin"].count()))

Home Win percentage: 57.9%


In [6]:
results["HomeLastWin"] = False
results["VisitorLastWin"] = False

In [7]:
# Did the home and visitor teams win their last game?
from collections import defaultdict
won_last = defaultdict(int)

for index, row in results.sort_values('Date').iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    results.iloc[index] = row    
    # Set current win
    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]
results.iloc[20:25]

Unnamed: 0,Date,Start Time,Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin
20,2014-01-03,10:30p,Utah Jazz,99,Los Angeles Lakers,110,Box Score,,18997,,True,False,True
21,2014-01-04,7:00p,New Orleans Pelicans,82,Indiana Pacers,99,Box Score,,18165,,True,False,True
22,2014-01-04,7:00p,Miami Heat,110,Orlando Magic,94,Box Score,,18846,,False,False,False
23,2014-01-04,7:30p,Cleveland Cavaliers,82,Brooklyn Nets,89,Box Score,,17732,,True,True,True
24,2014-01-04,8:00p,Atlanta Hawks,84,Chicago Bulls,91,Box Score,,21539,,True,True,False


In [8]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)

In [9]:
from sklearn.model_selection import cross_val_score

# Create a dataset with just the neccessary information
X_previouswins = results[["HomeLastWin", "VisitorLastWin"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print("Using just the last result from the home and visitor teams")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using just the last result from the home and visitor teams
Accuracy: 57.4%




In [10]:
# What about win streaks?
results["HomeWinStreak"] = 0
results["VisitorWinStreak"] = 0
from collections import defaultdict
win_streak = defaultdict(int)

for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeWinStreak"] = win_streak[home_team]
    row["VisitorWinStreak"] = win_streak[visitor_team]
    results.iloc[index] = row    
    # Set current win
    if row["HomeWin"]:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1

In [11]:
clf = DecisionTreeClassifier(random_state=14)
X_winstreak =  results[["HomeLastWin", "VisitorLastWin", "HomeWinStreak", "VisitorWinStreak"]].values
scores = cross_val_score(clf, X_winstreak, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 56.5%




In [12]:
ladder_filename = os.path.join(data_folder, 'nba',"standing.csv")
ladder = pd.read_csv(ladder_filename, skiprows=1)
ladder.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Miami Heat,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
1,2,Oklahoma City Thunder,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
2,3,San Antonio Spurs,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6
3,4,Denver Nuggets,57-25,38-3,19-22,19-11,38-14,5-5,10-0,4-6,...,24-4,11-7,28-8,0-1,8-8,9-6,12-3,8-4,13-2,7-1
4,5,Los Angeles Clippers,56-26,32-9,24-17,21-9,35-17,7-3,8-2,6-4,...,17-9,3-5,38-12,1-0,8-6,16-0,9-7,8-5,7-7,7-1


In [13]:
results["HomeTeamRanksHigher"] = 0
for index, row in results.sort_values('Home Team').iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    elif visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    home_rank = ladder[ladder["Team"] == home_team]["Rk"].values[0]
    visitor_rank = ladder[ladder["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
    results.iloc[index] = row
results[:5]

Unnamed: 0,Date,Start Time,Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeWinStreak,VisitorWinStreak,HomeTeamRanksHigher
0,2014-01-01,6:00p,Dallas Mavericks,87,Washington Wizards,78,Box Score,,15713,,False,True,True,0,0,1
1,2014-01-01,7:00p,Indiana Pacers,82,Toronto Raptors,95,Box Score,,18271,,True,True,True,0,0,1
2,2014-01-01,8:00p,New Orleans Pelicans,112,Minnesota Timberwolves,124,Box Score,,14002,,True,False,True,0,0,0
3,2014-01-01,9:00p,Philadelphia 76ers,114,Denver Nuggets,102,Box Score,,16006,,False,False,True,0,0,0
4,2014-01-01,9:30p,Charlotte Bobcats,85,Los Angeles Clippers,112,Box Score,,19160,,True,False,False,0,0,0


In [14]:
X_homehigher =  results[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy', cv=5)
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 59.2%


In [15]:
from sklearn.model_selection import GridSearchCV

parameter_space = {
                   "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
                   }
clf = DecisionTreeClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_homehigher, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))



Accuracy: 60.4%


In [16]:
# Who won the last match?
last_match_winner = defaultdict(int)
results["HomeTeamWonLast"] = 0

for index, row in results.sort_values('Home Team').iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))
    # Set in the row, who won the last encounter
    row["HomeTeamWonLast"] = 1 if last_match_winner[teams] == row["Home Team"] else 0
    results.iloc[index] = row
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner
results.iloc[:5]

Unnamed: 0,Date,Start Time,Visitor Team,VisitorPts,Home Team,HomePts,Score Type,OT?,Attend.,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeWinStreak,VisitorWinStreak,HomeTeamRanksHigher,HomeTeamWonLast
0,2014-01-01,6:00p,Dallas Mavericks,87,Washington Wizards,78,Box Score,,15713,,False,True,True,0,0,1,0
1,2014-01-01,7:00p,Indiana Pacers,82,Toronto Raptors,95,Box Score,,18271,,True,True,True,0,0,1,0
2,2014-01-01,8:00p,New Orleans Pelicans,112,Minnesota Timberwolves,124,Box Score,,14002,,True,False,True,0,0,0,0
3,2014-01-01,9:00p,Philadelphia 76ers,114,Denver Nuggets,102,Box Score,,16006,,False,False,True,0,0,0,0
4,2014-01-01,9:30p,Charlotte Bobcats,85,Los Angeles Clippers,112,Box Score,,19160,,True,False,False,0,0,0,0


In [17]:
X_home_higher =  results[["HomeTeamRanksHigher", "HomeTeamWonLast"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_home_higher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Using whether the home team is ranked higher
Accuracy: 61.1%




#### Sometimes a team performs better against a particular team

In [18]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
encoding = LabelEncoder()
encoding.fit(results["Home Team"].values)
home_teams = encoding.transform(results["Home Team"].values)
visitor_teams = encoding.transform(results["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T

onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()

clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 58.9%


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [19]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Using full team labels is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))



Using full team labels is ranked higher
Accuracy: 57.8%


In [20]:
X_all = np.hstack([X_home_higher, X_teams])
print(X_all.shape)

(1319, 62)


In [21]:
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))



Using whether the home team is ranked higher
Accuracy: 61.0%




In [22]:
parameter_space = {
                   "max_features": [2, 10, 'auto'],
                   "n_estimators": [100,],
                   "criterion": ["gini", "entropy"],
                   "min_samples_leaf": [2, 4, 6],
                   }
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)



Accuracy: 65.3%
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=14, verbose=0,
                       warm_start=False)
