### 决策树
剪枝: 先创建一棵完整的树，再对其进行修剪，去掉对整个过程没有提供太多信息的节点<br>
- min_samples_split：指定创建一个新节点至少需要的个体数量。<br>
- min_samples_leaf：指定为了保留节点，每个节点至少应该包含的个体数量。<br>

##### 决策树的另一个参数是创建决策的标准，常用的有以下两个。
- 基尼不纯度（Gini impurity）：用于衡量决策节点错误预测新个体类别的比例。<br>
- 信息增益（Information gain）：用信息论中的熵来表示决策节点提供多少新信息。<br>

In [None]:
import numpy as np
from collections import defaultdict
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [34]:
data = pd.read_csv('./data/leagues_NBA_2014_games_games.zip', parse_dates=["Date"], compression='zip')

In [35]:
data.describe()

Unnamed: 0,VisitorPts,HomePts
count,1230.0,1230.0
mean,99.710569,102.307317
std,11.738712,11.844457
min,66.0,67.0
25%,92.0,94.0
50%,99.0,102.0
75%,107.0,110.0
max,145.0,143.0


In [36]:
data['HomeWin'] = data['HomePts'] > data['VisitorPts']
data['HomeLastWin'] = False
data['VisitorLastWin'] = False
y_true = data['HomeWin'].values

In [37]:
win_last = defaultdict(int)
for index, row in data.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    row['HomeLastWin'] = win_last[home_team]
    row['VisitorLastWin'] = win_last[visitor_team]
    data.iloc[index] = row
    win_last[home_team] = row['HomeWin']
    win_last[visitor_team] = not row['HomeWin']

In [38]:
data.iloc[20:25]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes,HomeWin,HomeLastWin,VisitorLastWin
20,2013-11-01,Box Score,Milwaukee Bucks,105,Boston Celtics,98,,,False,False,False
21,2013-11-01,Box Score,Miami Heat,100,Brooklyn Nets,101,,,True,False,False
22,2013-11-01,Box Score,Cleveland Cavaliers,84,Charlotte Bobcats,90,,,True,False,True
23,2013-11-01,Box Score,Portland Trail Blazers,113,Denver Nuggets,98,,,False,False,False
24,2013-11-01,Box Score,Dallas Mavericks,105,Houston Rockets,113,,,True,True,True


In [39]:
clf = DecisionTreeClassifier(random_state=2021)
X_previous_wins = data[['HomeLastWin', 'VisitorLastWin']].values
scores = cross_val_score(clf, X_previous_wins, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 59.2%


In [40]:
data['HomeWithStreak'] = 0
data['VisitorWithStreak'] = 0
win_streak = defaultdict(int)
for index, row in data.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    row['HomeWithStreak'] = win_streak[home_team]
    row['VisitorWithStreak'] = win_streak[visitor_team]
    data.iloc[index] = row
    if row['HomeWin']:
        win_streak[home_team] += 1
        win_streak[visitor_team] += 0
    else:
        win_streak[home_team] += 0
        win_streak[visitor_team] += 1

In [41]:
X_win_streak = data[['HomeLastWin', 'VisitorLastWin', 'HomeWithStreak', 'VisitorWithStreak']].values
scores = cross_val_score(clf, X_win_streak, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))

Accuracy: 54.8%


In [42]:
ladder = pd.read_csv('./data/leagues_NBA_2013_standings_expanded-standings.zip', compression='zip')
ladder.head()

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Miami Heat,66-16,37-4,29-12,41-11,25-5,14-4,12-6,15-1,...,30-2,9-3,39-8,1-0,10-3,10-5,8-5,12-1,17-1,8-1
1,2,Oklahoma City Thunder,60-22,34-7,26-15,21-9,39-13,7-3,8-2,6-4,...,21-8,3-6,44-6,,13-4,11-2,11-5,7-4,12-5,6-2
2,3,San Antonio Spurs,58-24,35-6,23-18,25-5,33-19,8-2,9-1,8-2,...,16-12,9-5,31-10,1-0,12-4,12-4,12-3,8-3,10-4,3-6
3,4,Denver Nuggets,57-25,38-3,19-22,19-11,38-14,5-5,10-0,4-6,...,24-4,11-7,28-8,0-1,8-8,9-6,12-3,8-4,13-2,7-1
4,5,Los Angeles Clippers,56-26,32-9,24-17,21-9,35-17,7-3,8-2,6-4,...,17-9,3-5,38-12,1-0,8-6,16-0,9-7,8-5,7-7,7-1


In [43]:
data['HomeTeamRankHigher'] = 0
for index, row in data.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    elif visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    home_rank = ladder[ladder['Team'] == home_team]['Rk'].values[0]
    visitor_rank = ladder[ladder['Team'] == visitor_team]['Rk'].values[0]
    row['HomeTeamRankHigher'] = int(home_rank > visitor_rank)
    data.iloc[index] = row
data[:5]

Unnamed: 0,Date,Score Type,Visitor Team,VisitorPts,Home Team,HomePts,OT?,Notes,HomeWin,HomeLastWin,VisitorLastWin,HomeWithStreak,VisitorWithStreak,HomeTeamRankHigher
0,2013-10-29,Box Score,Orlando Magic,87,Indiana Pacers,97,,,True,0,0,0,0,0
1,2013-10-29,Box Score,Los Angeles Clippers,103,Los Angeles Lakers,116,,,True,0,0,0,0,1
2,2013-10-29,Box Score,Chicago Bulls,95,Miami Heat,107,,,True,0,0,0,0,0
3,2013-10-30,Box Score,Brooklyn Nets,94,Cleveland Cavaliers,98,,,True,0,0,0,0,1
4,2013-10-30,Box Score,Atlanta Hawks,109,Dallas Mavericks,118,,,True,0,0,0,0,1


In [44]:
X_home_higher = data[["HomeLastWin", "VisitorLastWin", "HomeTeamRankHigher"]].values
scores = cross_val_score(clf, X_home_higher, y_true, scoring='accuracy')
print("准确率: {0:.1f}%".format(np.mean(scores) * 100))

准确率: 60.3%


In [49]:
from sklearn.model_selection import GridSearchCV

In [50]:
parameter_space = {"max_depth": list(range(1, 21))}
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_home_higher, y_true)
print("准确率: {0:.1f}%".format(grid.best_score_ * 100))

准确率: 60.6%


In [56]:
last_match_winner = defaultdict(int)
data['HomeTeamWinLast'] = 0
for index, row in data.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    teams = tuple(sorted([home_team, visitor_team]))
    row['HomeTeamWinLast'] = 1 if last_match_winner[teams] == home_team else 0
    data.ix[index] = row
    winner = home_team if row['HomeWin'] else visitor_team
    last_match_winner[teams] = winner

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  data.ix[index] = row


In [60]:
X_home_higher = data[["HomeTeamRankHigher", "HomeTeamWinLast"]].values
scores = cross_val_score(clf, X_home_higher, y_true, scoring='accuracy')
print("准确率: {0:.1f}%".format(np.mean(scores) * 100))

准确率: 60.6%


In [61]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
encoding = LabelEncoder()
encoding.fit(data['Home Team'].values)
home_teams = encoding.transform(data['Home Team'].values)
visitor_teams = encoding.transform(data['Visitor Team'].values)
X_teams = np.vstack([home_teams, visitor_teams]).T
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).todense()
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("准确率: {0:.1f}%".format(np.mean(scores) * 100))

准确率: 60.0%


In [62]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=2021)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("准确率: {0:.1f}%".format(np.mean(scores) * 100))

准确率: 61.5%


In [67]:
X_all = np.hstack([X_home_higher, X_teams])
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("准确率: {0:.1f}%".format(np.mean(scores) * 100))

准确率: 63.3%


In [71]:
parameter_space = {
    "max_features": [2, 10, 'auto'], 
    "n_estimators": [100, ],
    "criterion": ['gini', 'entropy'], 
    "min_samples_leaf": [2, 4, 6]
}
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("准确率: {0:.1f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)

准确率: 65.6%
RandomForestClassifier(max_features=2, min_samples_leaf=2, random_state=2021)


In [70]:
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

In [108]:
teamRk = defaultdict(int)
data['HomeTeamRank'] = 0
data['VisitorTeamRank'] = 0

for index, row in data.iterrows():
    home_team = row['Home Team']
    visitor_team = row['Visitor Team']
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    elif visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    row['HomeTeamRank'] = ladder[ladder['Team'] == home_team]['Rk'].values[0]
    row['VisitorTeamRank'] = ladder[ladder['Team'] == visitor_team]['Rk'].values[0]
    data.iloc[index] = row

In [124]:
X_all_features = data[['Home Team', 'Visitor Team', 'HomeLastWin', 'VisitorLastWin', 'HomeWithStreak', 
                       'VisitorWithStreak', 'HomeTeamRankHigher', 
                       'HomeTeamWinLast', 'HomeTeamRank', 'VisitorTeamRank']]

In [126]:
X_all_features = pd.get_dummies(X_all_features, columns=['Home Team', 'Visitor Team'], drop_first=True)
clf = DecisionTreeClassifier(random_state=2021)
scores = cross_val_score(clf, X_all_features, y_true, scoring='accuracy')
print("准确率: {0:.1f}%".format(np.mean(scores) * 100))

准确率: 57.4%


In [127]:
clf = RandomForestClassifier(random_state=2021)
scores = cross_val_score(clf, X_all_features, y_true, scoring='accuracy')
print("准确率: {0:.1f}%".format(np.mean(scores) * 100))

准确率: 63.3%


In [128]:
parameter_space = {
    "max_features": [2, 10, 'auto'], 
    "n_estimators": [100, ],
    "criterion": ['gini', 'entropy'], 
    "min_samples_leaf": [2, 4, 6]
}
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all_features, y_true)
print("准确率: {0:.1f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)

准确率: 66.0%
RandomForestClassifier(criterion='entropy', max_features=2, min_samples_leaf=4,
                       random_state=2021)
