In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [47]:
with open('nba_stats.csv', 'r') as f:
    data = pd.read_csv(f)

print("Dataset size: {}".format(data.shape))
print("Dataset attributes: {}".format(data.keys()))

Dataset size: (45920, 24)
Dataset attributes: Index(['TEAM', 'DATE', 'MATCHUP', 'W/L', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%',
       '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST',
       'STL', 'BLK', 'TOV', 'PF', '+/-'],
      dtype='object')


In [48]:
# preprocess data, for classification purpose, assign win to 1 and lose to 0
data['W/L'] = [1 if i == 'W' else 0 for i in data['W/L']]
# randomly split data into 9:1 where 9 for training and 1 for testing
data = data.sample(frac=1.0)  
cut_idx = int(round(0.2 * data.shape[0]))
data_test, data_train = data.iloc[:cut_idx], data.iloc[cut_idx:]

drop_list = ['TEAM', 'DATE', 'MATCHUP', 'PTS', '+/-']
data_test = data_test.drop(drop_list, axis=1)
data_train = data_train.drop(drop_list, axis=1)
data_test.to_csv("testset.csv", index=False, header=None)
data_train.to_csv("trainset.csv", index=False, header=None)
print("Testset shape: {}\t Trainingset shape: {}".format(data_test.shape, data_train.shape))

Testset shape: (9184, 19)	 Trainingset shape: (36736, 19)


<font face=consola, size=4>
For test case we run a simple random forest algorithm implemented in sklearn:
<font>

In [55]:
from sklearn.ensemble import RandomForestClassifier
# first divide raw data into statistics and labels
train_set = data_train.drop('W/L', axis=1)
train_label = data_train['W/L']
test_set = data_test.drop('W/L', axis=1)
test_label = data_test['W/L']

clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(train_set, train_label)
print(clf.predict(test_set));
print(clf.score(test_set, test_label))

[0 1 0 ... 1 1 1]
0.7969294425087108


<font face=consola, size=4>
This time we try KNN classifier:
<font>

In [51]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_set, train_label)
print(neigh.predict(test_set))
print(neigh.score(test_set, test_label))

[0 0 0 ... 1 1 1]
0.7525043554006968


<font face=consola, size=4>
Again we try Decision Tree classifier:
<font>

In [53]:
from sklearn.tree import DecisionTreeClassifier
decision_clf = DecisionTreeClassifier(random_state=0)
decision_clf.fit(train_set, train_label)
print(decision_clf.predict(test_set))
print(decision_clf.score(test_set, test_label))

[1 1 1 ... 1 0 1]
0.739329268292683


<font face=consola, size=4>
Logistic Regression classifier:
<font>

In [54]:
from sklearn.linear_model import LogisticRegression
logic_cls = LogisticRegression(random_state=0).fit(train_set, train_label)
print(logic_cls.predict(test_set))
print(logic_cls.score(test_set, test_label))

[0 1 0 ... 1 0 1]
0.8422256097560976


<font face=consola, size=4>
Support Vector Machine:
<font>

In [57]:
from sklearn.svm import SVC
svm_cls = SVC(gamma='auto').fit(train_set, train_label)
print(svm_cls.predict(test_set))
print(svm_cls.score(test_set, test_label))

[1 1 1 ... 1 1 1]
0.6821646341463414


In [59]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, n_feat, n_classes):
        super(MLP, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(n_feat, 10),
            nn.ReLU(),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(10, 10),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(10, n_classes)
    
    def forward(x):
        x = self.layer1(x)
        x = self.layer2(x)
        output = self.classifier(x)
        return output

mlp = MLP(n_feat=train_set.shape[1], n_classes=2)
optim = torch.optim.SGD(mlp.parameters(), lr=0.01, momentum=0.9)
criterion = torch.nn.CrossEntropyLoss()

batch_size = 256