In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import sklearn as skl

In [2]:
raw_df = pd.read_csv('nba_rankings_2014-2020.csv', index_col=False)
raw_df.head()

Unnamed: 0.1,Unnamed: 0,PLAYER,TEAM,AGE,SEASON,GP,W,L,MIN,PTS,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,rankings
0,0,Aaron Gordon,ORL,24,2019-20,62,30,32,32.5,14.4,...,3.7,1.6,0.8,0.6,2.0,31.9,20.0,1.0,-1.1,80.0
1,1,Aaron Holiday,IND,23,2019-20,66,42,24,24.5,9.5,...,3.4,1.3,0.8,0.2,1.8,19.3,3.0,0.0,1.7,76.0
2,2,Abdel Nader,OKC,26,2019-20,55,37,18,15.8,6.3,...,0.7,0.8,0.4,0.4,1.4,11.1,0.0,0.0,-1.5,71.0
3,3,Adam Mokoka,CHI,21,2019-20,11,3,8,10.2,2.9,...,0.4,0.2,0.4,0.0,1.5,5.5,0.0,0.0,4.5,68.0
4,4,Admiral Schofield,WAS,23,2019-20,33,9,24,11.2,3.0,...,0.5,0.2,0.2,0.1,1.5,6.3,0.0,0.0,-1.7,71.0


In [3]:
season_list = raw_df["SEASON"].nunique()
enc = OneHotEncoder(sparse=False)
encode_df = pd.DataFrame(enc.fit_transform(raw_df.SEASON.values.reshape(-1,1)))


In [4]:
encode_df.columns = enc.get_feature_names(["SEASON"])
encode_df.head()



Unnamed: 0,SEASON_2014-15,SEASON_2015-16,SEASON_2016-17,SEASON_2017-18,SEASON_2018-19,SEASON_2019-20
0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
new_df = raw_df.merge(encode_df, left_index=True, right_index=True).drop("SEASON", 1)
new_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0.1,Unnamed: 0,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,...,DD2,TD3,+/-,rankings,SEASON_2014-15,SEASON_2015-16,SEASON_2016-17,SEASON_2017-18,SEASON_2018-19,SEASON_2019-20
0,0,Aaron Gordon,ORL,24,62,30,32,32.5,14.4,5.4,...,20.0,1.0,-1.1,80.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,Aaron Holiday,IND,23,66,42,24,24.5,9.5,3.5,...,3.0,0.0,1.7,76.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,Abdel Nader,OKC,26,55,37,18,15.8,6.3,2.2,...,0.0,0.0,-1.5,71.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,Adam Mokoka,CHI,21,11,3,8,10.2,2.9,1.1,...,0.0,0.0,4.5,68.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,Admiral Schofield,WAS,23,33,9,24,11.2,3.0,1.1,...,0.0,0.0,-1.7,71.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
X = new_df.copy()
X = X.drop("rankings", axis=1)
X = X.drop("PLAYER", axis=1)
X = X.drop("TEAM", axis=1)
X = X.drop("TD3", axis=1)
X = X.drop("DD2", axis=1)
X.head()

Unnamed: 0.1,Unnamed: 0,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,...,BLK,PF,FP,+/-,SEASON_2014-15,SEASON_2015-16,SEASON_2016-17,SEASON_2017-18,SEASON_2018-19,SEASON_2019-20
0,0,24,62,30,32,32.5,14.4,5.4,12.4,43.7,...,0.6,2.0,31.9,-1.1,0.0,0.0,0.0,0.0,0.0,1.0
1,1,23,66,42,24,24.5,9.5,3.5,8.5,41.4,...,0.2,1.8,19.3,1.7,0.0,0.0,0.0,0.0,0.0,1.0
2,2,26,55,37,18,15.8,6.3,2.2,4.8,46.8,...,0.4,1.4,11.1,-1.5,0.0,0.0,0.0,0.0,0.0,1.0
3,3,21,11,3,8,10.2,2.9,1.1,2.5,42.9,...,0.0,1.5,5.5,4.5,0.0,0.0,0.0,0.0,0.0,1.0
4,4,23,33,9,24,11.2,3.0,1.1,2.8,38.0,...,0.1,1.5,6.3,-1.7,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
y = new_df["rankings"].ravel()
y[:5]

array([80., 76., 71., 68., 71.])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [9]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
rf_model = RandomForestClassifier(n_estimators=128, random_state=2)

In [11]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [14]:
predictions = rf_model.predict(X_test_scaled)
predictions

array([71., 77., 72., 72., 77., 75., 79., 94., 74., 74., 69., 76., 94.,
       70., 73., 79., 78., 72., 77., 74., 74., 77., 75., 75., 76., 70.,
       77., 86., 79., 75., 76., 68., 75., 75., 68., 72., 76., 70., 82.,
       76., 81., 73., 74., 71., 74., 69., 72., 76., 70., 73., 84., 76.,
       76., 75., 72., 79., 73., 70., 76., 76., 72., 80., 75., 76., 72.,
       77., 96., 73., 70., 73., 79., 79., 69., 77., 77., 72., 74., 75.,
       96., 74., 84., 72., 89., 85., 72., 72., 74., 67., 75., 72., 74.,
       70., 75., 72., 74., 75., 73., 73., 69., 74., 65., 68., 77., 73.,
       75., 72., 76., 79., 94., 70., 76., 74., 87., 68., 80., 87., 73.,
       85., 75., 72., 73., 75., 74., 78., 76., 69., 76., 76., 75., 68.,
       84., 91., 74., 72., 76., 76., 78., 75., 77., 76., 74., 67., 76.,
       79., 72., 70., 74., 76., 77., 77., 77., 68., 76., 73., 75., 77.,
       69., 70., 70., 96., 73., 75., 76., 85., 76., 76., 76., 77., 72.,
       74., 76., 74., 80., 75., 79., 81., 76., 70., 73., 72., 75

In [15]:
accuracy_score(y_test, predictions)

0.20398009950248755

In [16]:
comparison_df = pd.DataFrame(predictions, columns=["predictions"])
comparison_df["real"] = y_test
comparison_df.head(20)

Unnamed: 0,predictions,real
0,71.0,70.0
1,77.0,78.0
2,72.0,74.0
3,72.0,69.0
4,77.0,79.0
5,75.0,76.0
6,79.0,80.0
7,94.0,88.0
8,74.0,73.0
9,74.0,76.0


In [23]:
raw_df.head()
season_df = raw_df[raw_df['SEASON'] == "2019-20"]
season_df.head()


Unnamed: 0.1,Unnamed: 0,PLAYER,TEAM,AGE,SEASON,GP,W,L,MIN,PTS,...,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,rankings
0,0,Aaron Gordon,ORL,24,2019-20,62,30,32,32.5,14.4,...,3.7,1.6,0.8,0.6,2.0,31.9,20.0,1.0,-1.1,80.0
1,1,Aaron Holiday,IND,23,2019-20,66,42,24,24.5,9.5,...,3.4,1.3,0.8,0.2,1.8,19.3,3.0,0.0,1.7,76.0
2,2,Abdel Nader,OKC,26,2019-20,55,37,18,15.8,6.3,...,0.7,0.8,0.4,0.4,1.4,11.1,0.0,0.0,-1.5,71.0
3,3,Adam Mokoka,CHI,21,2019-20,11,3,8,10.2,2.9,...,0.4,0.2,0.4,0.0,1.5,5.5,0.0,0.0,4.5,68.0
4,4,Admiral Schofield,WAS,23,2019-20,33,9,24,11.2,3.0,...,0.5,0.2,0.2,0.1,1.5,6.3,0.0,0.0,-1.7,71.0


In [40]:
#new features for 2020 model 
new_X = season_df.copy()
new_X = new_X.drop("rankings", axis=1)
new_X = new_X.drop("PLAYER", axis=1)
new_X = new_X.drop("TEAM", axis=1)
new_X = new_X.drop("TD3", axis=1)
new_X = new_X.drop("DD2", axis=1)
new_X = new_X.drop("SEASON", axis=1)
new_X.columns

Index(['Unnamed: 0', 'AGE', 'GP', 'W', 'L', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%',
       '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST',
       'TOV', 'STL', 'BLK', 'PF', 'FP', '+/-'],
      dtype='object')

In [41]:
new_y = season_df["rankings"].ravel()
new_y[:5]

array([80., 76., 71., 68., 71.])

In [42]:
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, random_state=2)

In [43]:
rf_model = RandomForestClassifier(n_estimators=128, random_state=2)

In [44]:
rf_model = rf_model.fit(X_train, y_train)

In [45]:
predictions = rf_model.predict(X_test)
predictions

array([74., 78., 77., 76., 78., 76., 78., 68., 73., 76., 72., 75., 73.,
       85., 76., 74., 68., 83., 76., 77., 77., 76., 85., 72., 87., 73.,
       71., 75., 73., 71., 71., 68., 76., 68., 71., 71., 84., 76., 83.,
       79., 84., 85., 73., 77., 68., 77., 74., 72., 75., 74., 76., 72.,
       68., 72., 79., 75., 80., 73., 71., 87., 84., 68., 73., 75., 87.,
       78., 76., 71., 73., 83., 75., 76., 68., 72., 73., 83., 76., 78.,
       68., 68., 74., 85., 77., 94., 71., 77., 72., 87., 74., 71., 76.,
       71., 84., 74., 73., 78., 75., 79., 68., 87., 78., 68., 72., 75.,
       78., 73., 70., 79., 78., 69., 76., 76., 77., 67., 72., 80., 76.,
       68., 84., 76., 79., 72., 67., 68., 76., 75., 78.])

In [46]:
accuracy_score(y_test, predictions)

0.2204724409448819

In [48]:
comparison_df = pd.DataFrame(predictions, columns=['predictions'])
comparison_df['real'] = y_test
comparison_df.head(20)

Unnamed: 0,predictions,real
0,74.0,76.0
1,78.0,78.0
2,77.0,75.0
3,76.0,75.0
4,78.0,77.0
5,76.0,78.0
6,78.0,77.0
7,68.0,68.0
8,73.0,73.0
9,76.0,76.0
