In [324]:
import numpy as np
import pandas as pd
import lightgbm as lgbm
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import os

In [325]:
# Load one per-season CSV for each season 2014..2021. Each file is read exactly
# once and the frames are concatenated in a single call (growing a DataFrame
# with pd.concat inside a loop re-copies all earlier rows on every iteration).
season_frames = {
    year: pd.read_csv(f'./NBA_DATA3-5/{year}_TeamGameStats.csv')
    for year in range(2014, 2022)
}

# Training frame: all seasons stacked with a fresh 0..n-1 index.
train_data = pd.concat(list(season_frames.values()), axis=0, ignore_index=True)

# Test frame: the 2021 season only (an independent copy).
# NOTE(review): the 2021 rows are also part of train_data, so any evaluation on
# test_data is performed on games the training frame already contains -- confirm
# whether this overlap is intended.
test_data = season_frames[2021].copy()

# Untouched snapshot of the raw training rows, taken before any preprocessing.
base_train_data = train_data.copy()
train_data.head()

Unnamed: 0,gmDate,gmTime,seasType,offLNm1,offFNm1,offLNm2,offFNm2,OffLNm3,offFNm3,teamAbbr,...,opptFIC,opptFIC40,opptOrtg,opptDrtg,opptEDiff,opptPlay_RATE,opptAR,opptASTDividedByTO,opptSTLDividedByTO,opptFouls
0,2014/10/28,20:00,Regular,,,,,,,Orlando Magic,...,,,106.1,88.2,,,,2.22,111.11,17
1,2014/10/28,20:00,Regular,,,,,,,New Orleans Pelicans,...,,,88.2,106.1,,,,0.94,27.78,25
2,2014/10/28,20:00,Regular,,,,,,,Dallas Mavericks,...,,,117.3,116.1,,,,1.15,25.0,20
3,2014/10/28,20:00,Regular,,,,,,,San Antonio Spurs,...,,,116.1,117.3,,,,1.7,90.0,20
4,2014/10/28,22:30,Regular,,,,,,,Houston Rockets,...,,,99.0,118.8,,,,1.45,63.64,32


# Preparing for analysis

In [326]:
# Date processing
# Parse the game date and tip-off time; values that cannot be parsed become NaT.
date_value = pd.to_datetime(train_data['gmDate'], errors='coerce')
time_value = pd.to_datetime(train_data['gmTime'], errors='coerce')

# Expand the parsed timestamps into separate numeric columns, in this order.
for part, parsed in (
    ('year', date_value),
    ('month', date_value),
    ('day', date_value),
    ('hour', time_value),
    ('minute', time_value),
):
    train_data[part] = getattr(parsed.dt, part)

# The raw string columns are no longer needed once their parts are extracted.
train_data.drop(columns=['gmDate', 'gmTime'], inplace=True)
train_data

Unnamed: 0,seasType,offLNm1,offFNm1,offLNm2,offFNm2,OffLNm3,offFNm3,teamAbbr,teamConf,teamLoc,...,opptPlay_RATE,opptAR,opptASTDividedByTO,opptSTLDividedByTO,opptFouls,year,month,day,hour,minute
0,Regular,,,,,,,Orlando Magic,,Away,...,,,2.22,111.11,17,2014,10,28,20,0
1,Regular,,,,,,,New Orleans Pelicans,,Home,...,,,0.94,27.78,25,2014,10,28,20,0
2,Regular,,,,,,,Dallas Mavericks,,Away,...,,,1.15,25.00,20,2014,10,28,20,0
3,Regular,,,,,,,San Antonio Spurs,,Home,...,,,1.70,90.00,20,2014,10,28,20,0
4,Regular,,,,,,,Houston Rockets,,Away,...,,,1.45,63.64,32,2014,10,28,22,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16809,Regular,,,,,,,Portland Trail Blazers,,Home,...,,,2.58,50.00,24,2022,1,3,22,0
16810,Regular,,,,,,,Minnesota Timberwolves,,Away,...,,,1.10,60.00,10,2022,1,3,22,30
16811,Regular,,,,,,,Los Angeles Clippers,,Home,...,,,2.00,52.94,14,2022,1,3,22,30
16812,Regular,,,,,,,Memphis Grizzlies,,Away,...,,,2.55,18.18,14,2022,1,4,19,0


In [327]:
# Mapping of teamRslt column: 'Win' -> 1, 'Loss' -> 2
mapping = {'Loss': 2, 'Win': 1}
result_encoding = {'teamRslt': mapping}

# Apply the same encoding to both frames so their labels stay comparable.
train_data, test_data = (
    frame.replace(result_encoding) for frame in (train_data, test_data)
)

In [328]:
# Drop columns with missing values
# The list is computed from train_data only, then applied to both frames.
cols_with_missing = train_data.columns[train_data.isnull().any()].tolist()
train_data = train_data.drop(columns=cols_with_missing)

# errors='ignore': train_data carries derived columns (year/month/day/hour/minute)
# that test_data never receives. If one of them contains a missing value it ends
# up in cols_with_missing, and a strict drop on test_data would raise KeyError.
test_data = test_data.drop(columns=cols_with_missing, errors='ignore')

In [329]:
# Categorical data processing
# Any column whose dtype is not one of these is treated as categorical.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = train_data.columns.values.tolist()
categorical_columns = [
    name for name in features if train_data[name].dtype not in numerics
]

# For each categorical column, remember its unique values in order of first
# appearance; a value's position in that index becomes its integer code.
indexer = {
    name: pd.factorize(train_data[name])[1] for name in categorical_columns
}

# Replace every categorical value with its integer code.
for col in categorical_columns:
    train_data[col] = indexer[col].get_indexer(train_data[col])

In [330]:
# Preview the first rows of train_data to inspect the result of the preprocessing so far.
train_data.head()

Unnamed: 0,seasType,teamAbbr,teamLoc,teamRslt,teamMin,teamPTS,teamAST,teamTO,teamSTL,teamBLK,...,opptOrtg,opptDrtg,opptASTDividedByTO,opptSTLDividedByTO,opptFouls,year,month,day,hour,minute
0,0,0,0,2,240,84,17,18,5,9,...,106.1,88.2,2.22,111.11,17,2014,10,28,20,0
1,0,1,1,1,240,101,20,9,10,17,...,88.2,106.1,0.94,27.78,25,2014,10,28,20,0
2,0,2,0,2,240,100,17,10,9,3,...,117.3,116.1,1.15,25.0,20,2014,10,28,20,0
3,0,3,1,1,240,101,23,20,5,3,...,116.1,117.3,1.7,90.0,20,2014,10,28,20,0
4,0,4,0,1,240,108,22,13,7,3,...,99.0,118.8,1.45,63.64,32,2014,10,28,22,30


In [332]:
# Separate the target (teamRslt) from the feature columns.
# DataFrame.drop returns a new frame here instead of mutating train_data in
# place, so this cell can be re-run without raising KeyError on the second run,
# and X is no longer an alias of train_data.
columns_to_delete = ['teamRslt']

y = train_data['teamRslt']
X = train_data.drop(columns=columns_to_delete)

In [309]:
# Target / feature split that tolerates either state of train_data.
# If 'teamRslt' has already been removed from train_data, an unconditional
# train_data['teamRslt'] lookup would raise KeyError; in that case the existing
# y is kept and only X is rebuilt.
if 'teamRslt' in train_data.columns:
    y = train_data['teamRslt']
X = train_data.drop(columns='teamRslt', errors='ignore')
X

Unnamed: 0,seasType,teamAbbr,teamLoc,teamMin,teamPTS,teamAST,teamTO,teamSTL,teamBLK,teamFGA,...,opptOrtg,opptDrtg,opptASTDividedByTO,opptSTLDividedByTO,opptFouls,year,month,day,hour,minute
0,0,0,0,240,84,17,18,5,9,84,...,106.1,88.2,2.22,111.11,17,2014,10,28,20,0
1,0,1,1,240,101,20,9,10,17,101,...,88.2,106.1,0.94,27.78,25,2014,10,28,20,0
2,0,2,0,240,100,17,10,9,3,78,...,117.3,116.1,1.15,25.00,20,2014,10,28,20,0
3,0,3,1,240,101,23,20,5,3,70,...,116.1,117.3,1.70,90.00,20,2014,10,28,20,0
4,0,4,0,240,108,22,13,7,3,73,...,99.0,118.8,1.45,63.64,32,2014,10,28,22,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16809,0,27,1,240,136,29,9,10,3,89,...,133.8,138.9,2.58,50.00,24,2022,1,3,22,0
16810,0,18,0,240,122,34,17,9,4,93,...,104.1,122.2,1.10,60.00,10,2022,1,3,22,30
16811,0,29,1,240,104,22,20,12,3,78,...,122.2,104.1,2.00,52.94,14,2022,1,3,22,30
16812,0,19,0,240,110,21,9,6,8,90,...,111.9,116.1,2.55,18.18,14,2022,1,4,19,0


In [333]:
# Restrict the feature matrix to the columns the model is trained on.
feature_columns = [
    'opptPTS',
    'teamDrtg',
    'teamTO',
    'teamORB',
    'teamFGM',
]
X = X.loc[:, feature_columns]

X.head()

Unnamed: 0,opptPTS,teamDrtg,teamTO,teamORB,teamFGM
0,101,106.1,18,16,32
1,84,88.2,9,26,41
2,101,117.3,10,9,38
3,100,116.1,20,9,37
4,90,99.0,13,14,31


In [334]:
# Hold out 20% of the rows for validation; random_state=0 fixes the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [335]:
# Gradient boosting with 100 depth-1 trees and learning rate 1.0.
clfgtb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clfgtb.fit(X_train, y_train)

# Validation accuracy as a percentage; `results` collects the scores.
result = clfgtb.score(X_valid, y_valid) * 100
results = [result]
print(result)

84.8647041332144


In [336]:
# Evaluate the classifier on every row of test_data.
# For each (team, opponent) pairing, the model input is the mean of the selected
# feature columns over all rows of base_train_data with that same pairing.
# NOTE(review): these averages are built from final box-score columns (e.g.
# opptPTS) of rows in base_train_data -- confirm that base_train_data does not
# contain the very games being scored here.
corr = 0   # number of correct predictions
total = 0  # number of scored matches (named `total` so the builtin sum() is not shadowed)

# 'records' is the full orient name; the abbreviated 'r' is deprecated in pandas.
matches = test_data[['teamAbbr', 'opptAbbr', 'teamRslt']].to_dict('records')
for match in matches:
    # teamAbbr is the team the row describes; it is not necessarily the home side.
    team = match['teamAbbr']
    opponent = match['opptAbbr']

    same_pairing = (
        (base_train_data['teamAbbr'] == team)
        & (base_train_data['opptAbbr'] == opponent)
    )
    prev_matches = base_train_data.loc[same_pairing, feature_columns]

    # Without any earlier meeting the feature means would all be NaN, which
    # cannot be scored, so the match is skipped and not counted.
    if prev_matches.empty:
        continue

    # One-row DataFrame with the training column names, so the model receives
    # the same feature names it was fitted with.
    avg_prev = prev_matches.mean().to_frame().T
    pred = clfgtb.predict(avg_prev)
    prob = clfgtb.predict_proba(avg_prev)

    total += 1
    print(team + ' vs ' + opponent)
    print(pred, match['teamRslt'])
    print(prob)

    if pred[0] == match['teamRslt']:
        corr += 1

    print('-------------------------------\n')

# Fraction of scored matches predicted correctly (nan when nothing was scored).
print(corr / total if total else float('nan'))

  matches = test_data[['teamAbbr','opptAbbr','teamRslt']].to_dict('r')


Brooklyn Nets vs Milwaukee Bucks
[2] 2
[[0.09742894 0.90257106]]
-------------------------------

Milwaukee Bucks vs Brooklyn Nets
[1] 1
[[0.93364485 0.06635515]]
-------------------------------

Golden State Warriors vs Los Angeles Lakers
[1] 1
[[0.82240936 0.17759064]]
-------------------------------

Los Angeles Lakers vs Golden State Warriors
[2] 2
[[0.46315991 0.53684009]]
-------------------------------

Indiana Pacers vs Charlotte Hornets
[1] 2
[[0.87449742 0.12550258]]
-------------------------------

Charlotte Hornets vs Indiana Pacers
[2] 1
[[0.20440812 0.79559188]]
-------------------------------

Chicago Bulls vs Detroit Pistons
[1] 1
[[0.67652076 0.32347924]]
-------------------------------

Detroit Pistons vs Chicago Bulls
[1] 2
[[0.67652076 0.32347924]]
-------------------------------

Boston Celtics vs New York Knicks
[1] 2
[[0.73150386 0.26849614]]
-------------------------------

New York Knicks vs Boston Celtics
[2] 1
[[0.23549867 0.76450133]]
-----------------------