In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [4]:
with open('nba_stats.csv', 'r') as f:
    data = pd.read_csv(f)

print("Dataset size: {}".format(data.shape))
print("Dataset attributes: {}".format(data.keys()))

Dataset size: (45920, 24)
Dataset attributes: Index(['TEAM', 'DATE', 'MATCHUP', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%',
       '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', '+/-'],
      dtype='object')


<font size=3, color='black'>
    <strong>
Here we print all the attributes, while some of them are used in the classification step, some are just labels or uninterested information.<br>
The reference of each attribute is as follows:<br>
<font>
<font size=2>
    <strong>
TEAM: name of the team which the statistics belong to  <br>
DATE: match date <br>
MATCHUP: team matchup, (A@B suggests A is Away Team and B is Home Team) <br>
W/L: win or lose <br>
MIN: total minutes spent <br>
PTS: achieved points <br>
FGM: filed goals made <br>
FGA: filed goal attemped <br>
FG%: filed goal percentage <br>
3PM: three-point filed goals made <br>
3PA: three-point filed goals attemped <br>
3P%: three-point filed goals precentagte <br>
FTM: free throws made <br>
FTA: free throws attemped <br>
FT%: free throws percentage <br>
OREB: offensive rebound <br>
DREB: defensive rebound <br>
REB: rebounds (OREB + REB) <br>
AST: assists <br>
STL: steals <br>
BLK: blocked shots <br>
TOV: turnovers <br>
PF: personal fouls <br>
+/-: scoring margin <br>
    <strong>
<font>

In [5]:
# preprocess data, for classification purpose, assign win to 1 and lose to 0
data['W/L'] = [1 if i == 'W' else 0 for i in data['W/L']]
# randomly split data into 9:1 where 9 for training and 1 for testing
data = data.sample(frac=1.0)  
cut_idx = int(round(0.2 * data.shape[0]))
data_test, data_train = data.iloc[:cut_idx], data.iloc[cut_idx:]

drop_list = ['TEAM', 'DATE', 'MATCHUP', 'PTS', '+/-']
data_test = data_test.drop(drop_list, axis=1)
data_train = data_train.drop(drop_list, axis=1)
data_test.to_csv("testset.csv", index=False, header=None)
data_train.to_csv("trainset.csv", index=False, header=None)
print("Testset shape: {}\t Trainingset shape: {}".format(data_test.shape, data_train.shape))

Testset shape: (9184, 19)	 Trainingset shape: (36736, 19)


<font size=4, color='green'>
For test case we run a simple random forest algorithm implemented in sklearn:
<font>

In [6]:
from sklearn.ensemble import RandomForestClassifier
# first divide raw data into statistics and labels
train_set = data_train.drop('W/L', axis=1)
train_label = data_train['W/L']
test_set = data_test.drop('W/L', axis=1)
test_label = data_test['W/L']

clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(train_set, train_label)
print(clf.predict(test_set));
print(clf.score(test_set, test_label))

  from numpy.core.umath_tests import inner1d


[0 1 1 ... 1 1 0]
0.7986716027874564


<font face=consola, size=4>
This time we try KNN classifier:
<font>

In [51]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_set, train_label)
print(neigh.predict(test_set))
print(neigh.score(test_set, test_label))

[0 0 0 ... 1 1 1]
0.7525043554006968


<font face=consola, size=4>
Again we try Decision Tree classifier:
<font>

In [53]:
from sklearn.tree import DecisionTreeClassifier
decision_clf = DecisionTreeClassifier(random_state=0)
decision_clf.fit(train_set, train_label)
print(decision_clf.predict(test_set))
print(decision_clf.score(test_set, test_label))

[1 1 1 ... 1 0 1]
0.739329268292683


<font face=consola, size=4>
Logistic Regression classifier:
<font>

In [54]:
from sklearn.linear_model import LogisticRegression
logic_cls = LogisticRegression(random_state=0).fit(train_set, train_label)
print(logic_cls.predict(test_set))
print(logic_cls.score(test_set, test_label))

[0 1 0 ... 1 0 1]
0.8422256097560976


<font face=consola, size=4>
Support Vector Machine:
<font>

In [57]:
from sklearn.svm import SVC
svm_cls = SVC(gamma='auto').fit(train_set, train_label)
print(svm_cls.predict(test_set))
print(svm_cls.score(test_set, test_label))

[1 1 1 ... 1 1 1]
0.6821646341463414


<font face=consola, size=4>
Here we try multi-layer preceptron implemented by pytorch, we leverage Adam optimizer and CrossEntropy loss to deal with dataset of batch 256. However, such neural network method seems not better than logistic regression.
<font>

In [35]:
import torch
#import torch.utils.data as data
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, n_feat, n_classes):
        super(MLP, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(n_feat, 10),
            nn.ReLU(),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(10, 10),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(10, n_classes)
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        output = self.classifier(x)
        return output

    
class nba_data(torch.utils.data.Dataset):
    def __init__(self, raw_data):
        self.samples, self.labels = raw_data
    
    def __getitem__(self, index):
        samples = self.samples[index]
        labels = self.labels[index]
        
        return samples, labels
    
    def __len__(self):
        return self.samples.shape[0]

train_set = np.array(train_set, dtype=np.float32)
train_label = np.array(train_label)
test_set = np.array(test_set, dtype=np.float32)
test_label = np.array(test_label)
trainset = nba_data((train_set, train_label))
train_loader = torch.utils.data.DataLoader(trainset, batch_size=256, num_workers=4, shuffle=True)
testset = nba_data((test_set, test_label))
test_loader = torch.utils.data.DataLoader(testset, batch_size=256, num_workers=4, shuffle=False)

mlp = MLP(n_feat=train_set.shape[1], n_classes=2)
optim = torch.optim.Adam(mlp.parameters(), lr = 1e-3)
criterion = torch.nn.CrossEntropyLoss()

# training epochs
for epoch in range(10):
    losses = []
    correct = 0.0
    mlp.train() # set netowrk in training mode
    for i, (data,labels) in enumerate(train_loader):
        optim.zero_grad()
        
        data = torch.FloatTensor(data)
        labels = torch.LongTensor(labels)
        output = mlp(data)
        
        _, pred = output.max(1)
        correct += (pred==labels).sum()
        loss = criterion(output, labels)
        losses.append(loss.item())
        loss.backward()
        optim.step()
        
    avg_loss = sum(losses) / len(losses)
    train_acc = float(correct) / train_set.shape[0]
    print("Epoch{}\tLoss:{:.4f}\tAcc:{:.4f}%".format(epoch, avg_loss, train_acc*100))


test_loss = []
correct = 0.0
mlp.eval()
for i, (data,labels) in enumerate(test_loader):
        
        data = torch.FloatTensor(data)
        labels = torch.LongTensor(labels)
        output = mlp(data)
        
        loss = criterion(output, labels)
        test_loss.append(loss.item())
        
        _, pred = output.max(1)
        correct += (pred == labels).sum() 
        
avg_loss = sum(test_loss) / len(test_loss)
avg_acc = float(correct) / test_set.shape[0]
print("TestAcc:{:.4f}%\tTestLoss:{:.4f}\t".format(avg_acc*100, avg_loss))

Epoch0	Loss:0.6662	Acc:59.5792%
Epoch1	Loss:0.5872	Acc:69.6810%
Epoch2	Loss:0.5248	Acc:74.8176%
Epoch3	Loss:0.4745	Acc:78.0079%
Epoch4	Loss:0.4413	Acc:79.6385%
Epoch5	Loss:0.4187	Acc:80.7029%
Epoch6	Loss:0.4066	Acc:81.4405%
Epoch7	Loss:0.3974	Acc:81.9686%
Epoch8	Loss:0.3904	Acc:82.2953%
Epoch9	Loss:0.3860	Acc:82.5784%
TestAcc:82.98127177700349%	TestLoss:0.3770	
