In [1]:
import numpy as np
import pandas as pd

filename = 'nba_2016_games.csv'
results = pd.read_csv(filename, parse_dates=['Date'])
results.head()

Unnamed: 0,Date,Visitor/Neutral,PTS,Home/Neutral,PTS.1,Unnamed: 5,Unnamed: 6,Attend.,Notes
0,2016-10-25,New York Knicks,88,Cleveland Cavaliers,117,Box Score,,20562,
1,2016-10-25,San Antonio Spurs,129,Golden State Warriors,100,Box Score,,19596,
2,2016-10-25,Utah Jazz,104,Portland Trail Blazers,113,Box Score,,19446,
3,2016-10-26,Brooklyn Nets,117,Boston Celtics,122,Box Score,,18624,
4,2016-10-26,Dallas Mavericks,121,Indiana Pacers,130,Box Score,OT,17923,


In [2]:
results.columns = ['Date', 'VisitorTeam', 'VisitorPts', 'HomeTeam', 'HomePts',
           'ScoreType', 'OT?', 'Attend', 'Notes']
results.head()

Unnamed: 0,Date,VisitorTeam,VisitorPts,HomeTeam,HomePts,ScoreType,OT?,Attend,Notes
0,2016-10-25,New York Knicks,88,Cleveland Cavaliers,117,Box Score,,20562,
1,2016-10-25,San Antonio Spurs,129,Golden State Warriors,100,Box Score,,19596,
2,2016-10-25,Utah Jazz,104,Portland Trail Blazers,113,Box Score,,19446,
3,2016-10-26,Brooklyn Nets,117,Boston Celtics,122,Box Score,,18624,
4,2016-10-26,Dallas Mavericks,121,Indiana Pacers,130,Box Score,OT,17923,


In [3]:
results['HomeWin'] = results['HomePts'] > results['VisitorPts']
y_true = results['HomeWin'].values
y_true

array([ True, False,  True, ..., False,  True,  True])

In [4]:
results.head()

Unnamed: 0,Date,VisitorTeam,VisitorPts,HomeTeam,HomePts,ScoreType,OT?,Attend,Notes,HomeWin
0,2016-10-25,New York Knicks,88,Cleveland Cavaliers,117,Box Score,,20562,,True
1,2016-10-25,San Antonio Spurs,129,Golden State Warriors,100,Box Score,,19596,,False
2,2016-10-25,Utah Jazz,104,Portland Trail Blazers,113,Box Score,,19446,,True
3,2016-10-26,Brooklyn Nets,117,Boston Celtics,122,Box Score,,18624,,True
4,2016-10-26,Dallas Mavericks,121,Indiana Pacers,130,Box Score,OT,17923,,True


In [5]:
HomePct = 100 * results['HomeWin'].sum() / results['HomeWin'].count()
print('Home win percentage is: %.1f' % HomePct + '%')

Home win percentage is: 58.3%


In [6]:
results['HomeWin'].sum()

763

In [7]:
results['HomeLastWin'] = False
results['VisitorLastWin'] = False
results.head()

Unnamed: 0,Date,VisitorTeam,VisitorPts,HomeTeam,HomePts,ScoreType,OT?,Attend,Notes,HomeWin,HomeLastWin,VisitorLastWin
0,2016-10-25,New York Knicks,88,Cleveland Cavaliers,117,Box Score,,20562,,True,False,False
1,2016-10-25,San Antonio Spurs,129,Golden State Warriors,100,Box Score,,19596,,False,False,False
2,2016-10-25,Utah Jazz,104,Portland Trail Blazers,113,Box Score,,19446,,True,False,False
3,2016-10-26,Brooklyn Nets,117,Boston Celtics,122,Box Score,,18624,,True,False,False
4,2016-10-26,Dallas Mavericks,121,Indiana Pacers,130,Box Score,OT,17923,,True,False,False


In [8]:
from collections import defaultdict
# 创建字典，存储球队上次比赛的结果
won_last = defaultdict(int)

for index, row in results.iterrows():
    home_team = row['HomeTeam']
    visitor_team = row['VisitorTeam']
    row['HomeLastWin'] = won_last[home_team]
    row['VisitorLastWin'] = won_last[visitor_team]
    results.iloc[index] = row
    
    won_last[home_team] = row['HomeWin']
    won_last[visitor_team] = not row['HomeWin']
results.iloc[20:25]

Unnamed: 0,Date,VisitorTeam,VisitorPts,HomeTeam,HomePts,ScoreType,OT?,Attend,Notes,HomeWin,HomeLastWin,VisitorLastWin
20,2016-10-28,Charlotte Hornets,97,Miami Heat,91,Box Score,,19600,,False,True,True
21,2016-10-28,Golden State Warriors,122,New Orleans Pelicans,114,Box Score,,18217,,False,False,False
22,2016-10-28,Phoenix Suns,110,Oklahoma City Thunder,113,Box Score,OT,18203,,True,True,False
23,2016-10-28,Cleveland Cavaliers,94,Toronto Raptors,91,Box Score,,19800,,False,True,True
24,2016-10-28,Los Angeles Lakers,89,Utah Jazz,96,Box Score,,19911,,True,False,True


In [9]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)

In [10]:
from sklearn.model_selection import cross_val_score

x_lastwins = results[['HomeLastWin', 'VisitorLastWin']].values
scores = cross_val_score(clf, x_lastwins, y_true, scoring='accuracy')
print('Using just the last result from the home and visitor teams')
print('Accuracy: %.1f' % (np.mean(scores)*100) + '%')

Using just the last result from the home and visitor teams
Accuracy: 58.3%


In [11]:
filename2 = 'nba_2015_standings.csv'
ladder = pd.read_csv(filename2, skiprows=[0,])
ladder.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Golden State Warriors,73-9,39-2,34-7,27-3,46-6,9-1,8-2,10-0,...,25-5,7-2,44-5,3-0,16-0,11-2,14-2,9-1,15-2,5-2
1,2,San Antonio Spurs,67-15,40-1,27-14,24-6,43-9,9-1,7-3,8-2,...,22-7,4-4,44-6,1-1,13-3,14-2,11-2,11-1,13-3,4-3
2,3,Cleveland Cavaliers,57-25,33-8,24-17,35-17,22-8,14-4,8-8,13-5,...,19-11,4-7,32-8,2-1,11-3,8-5,13-3,8-5,11-5,4-3
3,4,Toronto Raptors,56-26,32-9,24-17,39-13,17-13,14-2,11-7,14-4,...,21-9,6-6,28-10,2-0,9-7,9-6,12-2,7-4,11-5,6-2
4,5,Oklahoma City Thunder,55-27,32-9,23-18,18-12,37-15,6-4,4-6,8-2,...,15-13,8-6,33-5,2-0,9-7,12-3,13-3,6-5,11-5,2-4


In [12]:
results['HomeTeamRankHigher'] = 0
for index, row in results.iterrows():
    home_team = row['HomeTeam']
    visitor_team = row['VisitorTeam']
    home_rank = ladder[ladder['Team'] == home_team]['Rk'].values[0]
    visitor_rank = ladder[ladder['Team'] == visitor_team]['Rk'].values[0]
    row['HomeTeamRankHigher'] = int(home_rank > visitor_rank)
    results.iloc[index] = row
results.head()

Unnamed: 0,Date,VisitorTeam,VisitorPts,HomeTeam,HomePts,ScoreType,OT?,Attend,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRankHigher
0,2016-10-25,New York Knicks,88,Cleveland Cavaliers,117,Box Score,,20562,,True,0,0,0
1,2016-10-25,San Antonio Spurs,129,Golden State Warriors,100,Box Score,,19596,,False,0,0,0
2,2016-10-25,Utah Jazz,104,Portland Trail Blazers,113,Box Score,,19446,,True,0,0,0
3,2016-10-26,Brooklyn Nets,117,Boston Celtics,122,Box Score,,18624,,True,0,0,0
4,2016-10-26,Dallas Mavericks,121,Indiana Pacers,130,Box Score,OT,17923,,True,0,0,0


In [13]:
ladder[ladder['Team'] == 'San Antonio Spurs']['Rk'].values

array([2])

In [14]:
x_homehigher = results[['HomeLastWin', 'VisitorLastWin', 'HomeTeamRankHigher']].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, x_homehigher, y_true, scoring='accuracy')
print('Accuracy: %.1f' % (np.mean(scores) * 100) + '%')

Accuracy: 63.3%


In [15]:
last_match_winner = defaultdict(int)
results['HomeTeamWonLast'] = 0

for index, row in results.iterrows():
    home_team = row['HomeTeam']
    visitor_team = row['VisitorTeam']
    teams = tuple(sorted([home_team, visitor_team]))
    row['HomeTeamWonLast'] == 1 if last_match_winner[teams] == row['HomeTeam'] else 0
    results.iloc[index] = row
    winner = row['HomeTeam'] if row['HomeWin'] else row['VisitorTeam']
    last_match_winner[teams] = winner
results.iloc[44:52]

Unnamed: 0,Date,VisitorTeam,VisitorPts,HomeTeam,HomePts,ScoreType,OT?,Attend,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeTeamRankHigher,HomeTeamWonLast
44,2016-10-31,Denver Nuggets,102,Toronto Raptors,105,Box Score,,19800,,True,False,False,0,0
45,2016-11-01,Houston Rockets,120,Cleveland Cavaliers,128,Box Score,,20562,,True,True,True,0,0
46,2016-11-01,New York Knicks,89,Detroit Pistons,102,Box Score,,13087,,True,True,True,0,0
47,2016-11-01,Los Angeles Lakers,108,Indiana Pacers,115,Box Score,,15348,,True,False,False,0,0
48,2016-11-01,Sacramento Kings,96,Miami Heat,108,Box Score,OT,19612,,True,False,False,0,0
49,2016-11-01,Memphis Grizzlies,80,Minnesota Timberwolves,116,Box Score,,14774,,True,False,True,1,0
50,2016-11-01,Milwaukee Bucks,117,New Orleans Pelicans,113,Box Score,,15515,,False,False,False,1,0
51,2016-11-01,Orlando Magic,103,Philadelphia 76ers,101,Box Score,,12529,,False,False,False,1,0


In [16]:
x_lastwinner = results[['HomeTeamRankHigher', 'HomeTeamWonLast']].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, x_lastwinner, y_true, scoring='accuracy')
print('Accuracy: %.1f' % (np.mean(scores) * 100) + '%')

Accuracy: 64.0%


In [17]:
from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()
encoding.fit(results['HomeTeam'].values)
home_teams = encoding.transform(results['HomeTeam'].values)
visitor_teams = encoding.transform(results['VisitorTeam'].values)
x_teams = np.vstack([home_teams, visitor_teams]).T
x_teams

array([[ 5, 19],
       [ 9, 26],
       [24, 28],
       ...,
       [ 5,  9],
       [ 5,  9],
       [ 9,  5]])

In [18]:
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
x_teams_expanded = onehot.fit_transform(x_teams).todense()
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, x_teams_expanded, y_true, scoring='accuracy')
print('Accuracy: %.1f' % (np.mean(scores) * 100) + '%')

Accuracy: 58.5%


In [19]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, x_teams_expanded, y_true, scoring='accuracy')
print('Accuracy: %.1f' % (np.mean(scores) * 100) + '%')

Accuracy: 59.5%


In [20]:
x_all = np.hstack([x_homehigher, x_teams_expanded])
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, x_all, y_true, scoring='accuracy')
print(x_all.shape)
print('Accuracy: %.1f' % (np.mean(scores) * 100) + '%')

(1309, 63)
Accuracy: 60.2%


In [21]:
from sklearn.model_selection import GridSearchCV

parameter_space = {
    'max_features': [2, 10, 'auto'],
    'n_estimators': [100,],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [2, 4, 6],
    }
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(x_all, y_true)
print('Accuracy: %.1f' % (grid.best_score_ * 100) + '%')
print(grid.best_estimator_)

Accuracy: 64.8%
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=14, verbose=0, warm_start=False)
