### NBA 경기 결과 예측

In [1]:
import os
import pandas as pd
import numpy as np

from collections import defaultdict

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
home_folder = os.getcwd()
data_folder = os.path.join(home_folder, '../Data/NBA')

In [3]:
data_files = ['nba_2015_10.csv', 'nba_2015_11.csv', 'nba_2015_12.csv',
              'nba_2016_01.csv', 'nba_2016_02.csv', 'nba_2016_03.csv', 
              'nba_2016_04.csv', 'nba_2016_05.csv', 'nba_2016_06.csv']

In [4]:
csv_objs = []   # pandas 데이터프레임 객체들(각 월별 데이터 표)의 리스트
for f in data_files:
    month_data = os.path.join(data_folder, f)
    csv_objs.append(pd.read_csv(month_data))

In [5]:
season_result = pd.concat(csv_objs, ignore_index=True)   # ignore_index=True로 하면 새롭게 index가 설정됨
season_result.columns = ["Date", "StartTime", "VisitorTeam",
                         "VisitorPts", "HomeTeam", "HomePts",
                         "ScoreType", "Overtime", "Attend",
                         "Notes"]

In [6]:
season_result.head(3)

Unnamed: 0,Date,StartTime,VisitorTeam,VisitorPts,HomeTeam,HomePts,ScoreType,Overtime,Attend,Notes
0,Tue Oct 27 2015,8:00p,Detroit Pistons,106,Atlanta Hawks,94,Box Score,,19187,
1,Tue Oct 27 2015,8:00p,Cleveland Cavaliers,95,Chicago Bulls,97,Box Score,,21957,
2,Tue Oct 27 2015,10:30p,New Orleans Pelicans,95,Golden State Warriors,111,Box Score,,19596,


In [7]:
print(len(season_result))

1316


In [8]:
season_result["HomeWin"] = season_result["VisitorPts"] < season_result["HomePts"]

In [9]:
season_result.tail(3)

Unnamed: 0,Date,StartTime,VisitorTeam,VisitorPts,HomeTeam,HomePts,ScoreType,Overtime,Attend,Notes,HomeWin
1313,Mon Jun 13 2016,9:00p,Cleveland Cavaliers,112,Golden State Warriors,97,Box Score,,19596,,False
1314,Thu Jun 16 2016,9:00p,Golden State Warriors,101,Cleveland Cavaliers,115,Box Score,,20562,,True
1315,Sun Jun 19 2016,8:00p,Cleveland Cavaliers,93,Golden State Warriors,89,Box Score,,19596,,False


In [10]:
score = 100 * season_result["HomeWin"].sum()/season_result["HomeWin"].count()
print("Home Win percentage : %.1f%%"%score)

Home Win percentage : 59.4%


In [11]:
# 2014-2015 시즌 결과
standing_file = os.path.join(data_folder, "nba_2014_2015_standing.csv")
standing_result = pd.read_csv(standing_file, skiprows=[0])

In [12]:
standing_result.head(3)

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,Post,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr
0,1,Golden State Warriors,67-15,39-2,28-13,25-5,42-10,9-1,7-3,9-1,...,25-6,5-3,45-9,1-0,13-2,11-3,12-3,8-3,16-2,6-2
1,2,Atlanta Hawks,60-22,35-6,25-16,38-14,22-8,12-6,14-4,12-4,...,17-11,6-4,30-10,0-1,9-5,14-2,17-0,7-4,9-7,4-3
2,3,Houston Rockets,56-26,30-11,26-15,23-7,33-19,9-1,8-2,6-4,...,20-9,8-4,31-14,2-0,11-4,9-5,11-6,7-3,10-6,6-2


In [13]:
player_file = os.path.join(data_folder, "nba_2016_player_stat.csv")
player_result = pd.read_csv(player_file)

In [14]:
player_result["PLAYER"][:3]

0       Stephen Curry GS
1       Kevin Durant OKC
2    Boban Marjanovic SA
Name: PLAYER, dtype: object

In [15]:
print(len(player_result))   # 시즌에 참여한 선수 수

350


In [16]:
player_result.head(6)  # 공동 4위인 Russell Westbrook과 LeBron James에 주목

Unnamed: 0,RK,PLAYER,GP,MPG,TS%,AST,TO,USG,ORR,DRR,REBR,PER,VA,EWA
0,1.0,Stephen Curry GS,79,34.2,0.669,20.6,10.2,31.6,2.9,13.6,8.6,31.56,828.6,27.6
1,2.0,Kevin Durant OKC,72,35.8,0.634,16.3,11.3,30.2,2.0,21.8,12.4,28.25,682.8,22.8
2,3.0,Boban Marjanovic SA,54,9.4,0.662,7.7,10.6,21.0,16.9,26.2,21.7,27.77,130.2,4.3
3,4.0,Russell Westbrook OKC,80,34.4,0.554,29.0,11.9,33.3,6.1,18.1,12.4,27.64,683.0,22.8
4,,LeBron James CLE,76,35.6,0.588,21.5,10.4,31.1,4.7,18.8,11.8,27.64,692.9,23.1
5,6.0,Chris Paul LAC,74,32.7,0.575,33.7,8.9,28.1,1.8,12.0,7.0,26.31,552.8,18.4


In [17]:
team_name = {"GS" : "Golden State Warriors",
            "SA" : "San Antonio Spurs",
            "CLE" : "Cleveland Cavaliers",
            "TOR" : "Toronto Raptors",
            "OKC" : "Oklahoma City Thunder",
            "LAC" : "Los Angeles Clippers",
            "ATL" : "Atlanta Hawks",
            "BOS" : "Boston Celtics",
            "CHA" : "Charlotte Hornets",
            "MIA" : "Miami Heat",
            "IND" : "Indiana Pacers",
            "DET" : "Detroit Pistons",
            "POR" : "Portland Trail Blazers",
            "DAL" : "Dallas Mavericks",
            "MEM" : "Memphis Grizzlies",
            "CHI" : "Chicago Bulls",
            "HOU" : "Houston Rockets",
            "WSH" : "Washington Wizards",
            "UTAH" : "Utah Jazz",
            "ORL" : "Orlando Magic",
            "DEN" : "Denver Nuggets",
            "MIL" : "Milwaukee Bucks",
            "SAC" : "Sacramento Kings",
            "NY" : "New York Knicks",
            "NO" : "New Orleans Pelicans",
            "MIN" : "Minnesota Timberwolves",
            "PHX" : "Phoenix Suns",
            "BKN" : "Brooklyn Nets",
            "LAL" : "Los Angeles Lakers",
            "PHI" : "Philadelphia 76ers",
}

In [18]:
team_per = {}
for key, value in team_name.items():
    team_per[value] = []

for idx, row in player_result.iterrows():   # 각 row의 참조법!
    player = row["PLAYER"]
    per = row["PER"]
    team_list = player.split()[-1].split('/')   # 이적한 선수는 팀이 두개로 표기되어 있다. ex) DEN/PHI
    for team in team_list:
        team_per[team_name[team]].append(per)

In [19]:
print("Golden State Warriors: Sum of PER: %.2f / Mean of PER: %.2f"
      %(np.sum(team_per["Golden State Warriors"]),np.mean(team_per["Golden State Warriors"])))
print("Philadelphia 76ers: Sum of PER: %.2f / Mean of PER: %.2f"
      %(np.sum(team_per["Philadelphia 76ers"]),np.mean(team_per["Philadelphia 76ers"])))

Golden State Warriors: Sum of PER: 192.37 / Mean of PER: 16.03
Philadelphia 76ers: Sum of PER: 165.17 / Mean of PER: 13.76


- 잘 하는 팀들은 연전연승의 횟수가 많을 것이라고 가정

In [20]:
season_result["VisitorWinStreak"] = 0
season_result["HomeWinStreak"] = 0

winning_streak = defaultdict(int)

for index, row in season_result.iterrows():
    home = row["HomeTeam"]
    visitor = row["VisitorTeam"]
    row["HomeWinStreak"] = winning_streak[home]   # home을 key로 갖고 0을 value로 갖는 쌍이 생성됨
    row["VisitorWinStreak"] = winning_streak[visitor]  # visitor를 key로 갖고 0을 value로 갖는 쌍이 생성됨
    season_result.loc[index] = row   # ix 대신에 loc 쓸것
    
    if row["HomeWin"]:
        winning_streak[home] += 1
        winning_streak[visitor] = 0
    else:
        winning_streak[home] = 0
        winning_streak[visitor] += 1
        

In [21]:
season_result.tail()

Unnamed: 0,Date,StartTime,VisitorTeam,VisitorPts,HomeTeam,HomePts,ScoreType,Overtime,Attend,Notes,HomeWin,VisitorWinStreak,HomeWinStreak
1311,Wed Jun 8 2016,9:00p,Golden State Warriors,90,Cleveland Cavaliers,120,Box Score,,20562,,True,5,0
1312,Fri Jun 10 2016,9:00p,Golden State Warriors,108,Cleveland Cavaliers,97,Box Score,,20562,,False,0,1
1313,Mon Jun 13 2016,9:00p,Cleveland Cavaliers,112,Golden State Warriors,97,Box Score,,19596,,False,0,1
1314,Thu Jun 16 2016,9:00p,Golden State Warriors,101,Cleveland Cavaliers,115,Box Score,,20562,,True,0,1
1315,Sun Jun 19 2016,8:00p,Cleveland Cavaliers,93,Golden State Warriors,89,Box Score,,19596,,False,2,0


In [22]:
clf = DecisionTreeClassifier(random_state=7)
x_test = season_result[["VisitorWinStreak","HomeWinStreak"]].values  # numpy.array로 변환
y_test = season_result["HomeWin"].values
scores = cross_val_score(clf, x_test, y_test, scoring="accuracy", cv=3)

In [23]:
season_result[["VisitorWinStreak","HomeWinStreak"]].values

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 1],
       [0, 1],
       [2, 0]])

- VisitorWinStreak과 HomeWinStreak만 고려했을때

In [24]:
print("Accuracy: %.1f%% (+/- %.2f%%)"%(np.mean(scores)*100, np.std(scores)))

Accuracy: 56.1% (+/- 0.02%)


In [25]:
print(x_test, y_test)

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 1]
 [0 1]
 [2 0]] [False  True  True ... False  True False]


In [26]:
standing_result["PER_Sum"] = 0
for idx, row in standing_result.iterrows():
    team = row["Team"]
    row["PER_Sum"] = np.sum(team_per[team][:10])   # 각 팀별 PER 상위 10명의 PER을 모두 더한다.
    standing_result.loc[idx] = row

In [27]:
# 각 팀에서 PER 상위 10명의 PER 합을 비교
season_result["HomePERHigh"] = 0
for idx, row in season_result.iterrows():
    home = row["HomeTeam"]
    visitor = row["VisitorTeam"]
    
    home_per = standing_result[standing_result["Team"] == home]["PER_Sum"].values[0]
    visitor_per = standing_result[standing_result["Team"] == visitor]["PER_Sum"].values[0]
    row["HomePERHigh"] = int(home_per > visitor_per)
    season_result.loc[idx] = row

In [28]:
standing_result[standing_result["Team"] == "Golden State Warriors"]  # 특정 row 참조법

Unnamed: 0,Rk,Team,Overall,Home,Road,E,W,A,C,SE,...,≤3,≥10,Oct,Nov,Dec,Jan,Feb,Mar,Apr,PER_Sum
0,1,Golden State Warriors,67-15,39-2,28-13,25-5,42-10,9-1,7-3,9-1,...,5-3,45-9,1-0,13-2,11-3,12-3,8-3,16-2,6-2,170.68


- HomePERHigh만 고려했을때

In [29]:
x_test = season_result[["HomePERHigh"]].values
scores = cross_val_score(clf, x_test, y_test, scoring="accuracy", cv=3)
print("Accuracy: %.1f%% (+/- %.2f)"%(np.mean(scores)*100, np.std(scores)))

Accuracy: 63.4% (+/- 0.02)


In [30]:
season_result[["HomePERHigh"]].values

array([[1],
       [0],
       [1],
       ...,
       [1],
       [0],
       [1]])

- HomeWinStreak, VisitorWinStreak, HomePERHigh 셋 다 고려했을때

In [31]:
x_test = season_result[["HomeWinStreak", "VisitorWinStreak", "HomePERHigh"]].values
scores = cross_val_score(clf, x_test, y_test, scoring="accuracy", cv=4)
print("Accuracy: %.1f%% (+/- %.2f)"%(np.mean(scores)*100, np.std(scores)))

Accuracy: 61.1% (+/- 0.02)


In [32]:
name_encoding = LabelEncoder()
name_encoding.fit(season_result["HomeTeam"].values)  # 팀 이름을 숫자에 대응시킴

print(name_encoding.transform(["Golden State Warriors", "Cleveland Cavaliers"]))
print(name_encoding.inverse_transform([9,5]))

[9 5]
['Golden State Warriors' 'Cleveland Cavaliers']


In [33]:
home_teams = name_encoding.transform(season_result["HomeTeam"].values)
visitor_teams = name_encoding.transform(season_result["VisitorTeam"].values)
team_match = np.vstack([home_teams, visitor_teams]).T

In [34]:
home_teams

array([0, 4, 9, ..., 9, 5, 9])

In [35]:
visitor_teams

array([ 8,  5, 18, ...,  5,  9,  5])

In [36]:
np.vstack([home_teams, visitor_teams])

array([[ 0,  4,  9, ...,  9,  5,  9],
       [ 8,  5, 18, ...,  5,  9,  5]])

In [37]:
team_match

array([[ 0,  8],
       [ 4,  5],
       [ 9, 18],
       ...,
       [ 9,  5],
       [ 5,  9],
       [ 9,  5]])

In [38]:
onehot = OneHotEncoder(categories='auto')
x_test = onehot.fit_transform(team_match).todense()

In [39]:
x_test[0]

matrix([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [40]:
scores = cross_val_score(clf, x_test, y_test, scoring="accuracy", cv=3)
print("Accuracy: %.1f%% (+/- %.2f)"%(np.mean(scores)*100, np.std(scores)))

Accuracy: 63.0% (+/- 0.01)


In [41]:
season_result["HomeRankHigh"] = 0
for idx, row in season_result.iterrows():
    home = row["HomeTeam"]
    visitor = row["VisitorTeam"]
    
    home_rank = standing_result[standing_result["Team"] == home]["Rk"].values[0]
    visitor_rank = standing_result[standing_result["Team"] == visitor]["Rk"].values[0]
    row["HomeRankHigh"] = int(home_rank > visitor_rank)
    season_result.loc[idx] = row

- HomeRankHigh만 고려했을때

In [42]:
x_test = season_result[["HomeRankHigh"]].values
scores = cross_val_score(clf, x_test, y_test, scoring="accuracy", cv=5)
print("Accuracy: %.1f%% (+/- %.2f)"%(np.mean(scores)*100, np.std(scores)))

Accuracy: 62.0% (+/- 0.03)


- HomeWinStreak, VisitorWinStreak, HomePERHigh, HomeRankHigh를 다 고려했을때

In [43]:
x_test = season_result[["HomeRankHigh", "HomePERHigh", "HomeWinStreak", "VisitorWinStreak",]].values
scores = cross_val_score(clf, x_test, y_test, scoring="accuracy", cv=4)
print("Accuracy: %.1f%% (+/- %.2f)"%(np.mean(scores)*100, np.std(scores)))

Accuracy: 62.7% (+/- 0.02)
