In [2]:
import pandas as pd
import bs4
import requests
import re
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, ParameterGrid, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, OneClassSVM
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, f1_score, balanced_accuracy_score
from sklearn.ensemble import IsolationForest

In [4]:
players_df = pd.read_csv('data/player_data.csv')
players_df.drop([423, 1191, 2063, 3146], inplace=True)
players_df.head()

Unnamed: 0,name,year_start,year_end,position,height,weight,birth_date,college
0,Alaa Abdelnaby,1991,1995,F-C,6-10,240.0,"June 24, 1968",Duke University
1,Zaid Abdul-Aziz,1969,1978,C-F,6-9,235.0,"April 7, 1946",Iowa State University
2,Kareem Abdul-Jabbar,1970,1989,C,7-2,225.0,"April 16, 1947","University of California, Los Angeles"
3,Mahmoud Abdul-Rauf,1991,2001,G,6-1,162.0,"March 9, 1969",Louisiana State University
4,Tariq Abdul-Wahad,1998,2003,F,6-6,223.0,"November 3, 1974",San Jose State University


In [5]:
seasons_df = pd.read_csv('data/Seasons_stats.csv')
seasons_df.drop('Unnamed: 0', axis=1, inplace=True)
seasons_df.dropna(inplace=True, how='all')
seasons_df['Player'] = seasons_df['Player'].apply(lambda x: re.sub('\*', "", x))
seasons_df.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,...,0.705,,,,176.0,,,,217.0,458.0
1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,...,0.708,,,,109.0,,,,99.0,279.0
2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,...,0.698,,,,140.0,,,,192.0,438.0
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,...,0.559,,,,20.0,,,,29.0,63.0
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,...,0.548,,,,20.0,,,,27.0,59.0


In [6]:
agg_dict = {'PER': ['mean'], 'TS%': ['mean'], '3PAr': ['mean'], 'FTr': ['mean'], 'ORB%': ['mean'], 'DRB%': ['mean'], 
            'TRB%':['mean'], 'AST%': ['mean'], 'STL%': ['mean'], 'BLK%': ['mean'], 'TOV%': ['mean'], 'USG%': ['mean'],
            'WS/48': ['mean'], 'OBPM': ['mean'], 'DBPM': ['mean'], 'BPM': ['mean'], 'FG%':['mean'], '3P%': ['mean'], 
            '2P%': ['mean'], 'eFG%': ['mean'], 'FT%': ['mean'], 'G': ['sum'], 'GS': ['sum'], 'MP': ['sum'], 
            'OWS': ['sum'], 'DWS': ['sum'], 'WS': ['sum'], 'FG': ['sum'], 'FGA': ['sum'], '3P': ['sum'], '3PA': ['sum'],
            '2P': ['sum'], '2PA': ['sum'], 'FT': ['sum'], 'FTA': ['sum'], 'ORB': ['sum'], 'DRB': ['sum'], 'TRB': ['sum'],
            'AST': ['sum'], 'STL': ['sum'], 'BLK': ['sum'], 'TOV': ['sum'], 'PF': ['sum'], 'PTS': ['sum']}

In [7]:
seasons_agg_df = seasons_df.groupby(['Player']).agg(agg_dict)
seasons_agg_df.columns = seasons_agg_df.columns.droplevel(1)
seasons_agg_df.head()


Unnamed: 0_level_0,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.C. Green,13.872222,0.543556,0.049,0.437111,10.388889,18.811111,14.655556,5.216667,1.438889,0.85,...,4447.0,3576.0,6553.0,10129.0,1469.0,1103.0,562.0,1508.0,2581.0,12928.0
A.J. Bramlett,-0.4,0.19,0.0,0.0,21.7,18.5,20.1,0.0,0.8,0.0,...,0.0,12.0,10.0,22.0,0.0,1.0,0.0,3.0,13.0,8.0
A.J. English,11.55,0.48,0.047,0.241,4.9,6.25,5.55,15.85,0.9,0.45,...,333.0,140.0,175.0,315.0,320.0,57.0,24.0,203.0,287.0,1502.0
A.J. Guyton,4.366667,0.324,0.371,0.068333,1.366667,3.8,2.533333,23.466667,2.466667,0.5,...,45.0,22.0,58.0,80.0,147.0,20.0,12.0,62.0,58.0,442.0
A.J. Hammons,8.4,0.472,0.238,0.476,5.4,20.9,12.8,3.8,0.3,7.2,...,20.0,8.0,28.0,36.0,4.0,1.0,13.0,10.0,21.0,48.0


In [8]:
edited_player_df = players_df[['name', 'year_start', 'year_end', 'position']]
edited_player_df.rename(columns={'name': 'Player'}, inplace=True)
edited_player_df.set_index('Player', inplace=True)
edited_player_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0_level_0,year_start,year_end,position
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alaa Abdelnaby,1991,1995,F-C
Zaid Abdul-Aziz,1969,1978,C-F
Kareem Abdul-Jabbar,1970,1989,C
Mahmoud Abdul-Rauf,1991,2001,G
Tariq Abdul-Wahad,1998,2003,F


In [9]:
dataset_df = edited_player_df.join(seasons_agg_df, on='Player', how='inner')
dataset_df

Unnamed: 0_level_0,year_start,year_end,position,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaa Abdelnaby,1991,1995,F-C,9.833333,0.475889,0.004222,0.244000,9.788889,19.700000,14.766667,...,472.0,446.0,851.0,1297.0,125.0,111.0,107.0,389.0,777.0,2299.0
Zaid Abdul-Aziz,1969,1978,C-F,13.750000,0.450071,,0.345857,15.171429,25.400000,19.210000,...,1536.0,522.0,1155.0,4524.0,648.0,134.0,208.0,28.0,1264.0,4978.0
Kareem Abdul-Jabbar,1970,1989,C,24.085000,0.591500,0.001800,0.325100,7.618750,21.125000,15.305263,...,9304.0,2975.0,9394.0,17440.0,5660.0,1160.0,3189.0,2527.0,4657.0,38387.0
Mahmoud Abdul-Rauf,1991,2001,G,14.877778,0.497111,0.158778,0.134667,1.477778,6.111111,3.744444,...,1161.0,219.0,868.0,1087.0,2079.0,487.0,46.0,963.0,1106.0,8553.0
Tariq Abdul-Wahad,1998,2003,F,9.930000,0.410100,0.033200,0.296900,7.310000,12.120000,9.700000,...,755.0,428.0,723.0,1151.0,388.0,263.0,121.0,442.0,688.0,2662.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Stephen Zimmerman,2017,2017,C,7.300000,0.346000,0.000000,0.161000,10.800000,24.900000,17.600000,...,5.0,11.0,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0
Paul Zipser,2017,2018,G-F,6.900000,0.503000,0.448000,0.181000,1.900000,14.200000,8.000000,...,40.0,15.0,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0
Jim Zoet,1983,1983,C,-0.800000,0.200000,0.000000,0.000000,10.200000,17.700000,13.900000,...,0.0,3.0,5.0,8.0,1.0,1.0,3.0,4.0,9.0,2.0
Bill Zopf,1971,1971,G,9.600000,0.391000,,0.267000,,,5.500000,...,36.0,0.0,0.0,46.0,73.0,0.0,0.0,0.0,34.0,118.0


In [10]:
hof_url = 'https://basketball.realgm.com/nba/hall-of-fame'
hof_page = requests.get(hof_url)
hof_soup = bs4.BeautifulSoup(hof_page.content, 'html.parser')
hof_players = []
for player in hof_soup.find_all(href=re.compile("/player/")):
    hof_players.append(player.text)

In [11]:
def hall_of_fame(name):
    if name in hof_players:
        return 'Yes'
    else:
        return 'No'

dataset_df['Hall of Fame'] = [hall_of_fame(name) for name in dataset_df.index]

In [14]:
all_star_url = "https://en.wikipedia.org/wiki/List_of_NBA_All-Stars"
all_star_page = requests.get(all_star_url)
all_star_soup = bs4.BeautifulSoup(all_star_page.content, 'html.parser')
rows = all_star_soup.find_all('tr')


In [15]:
dataset_df['All Star Games'] = 0
for row in rows:
    if row.find_all("span", attrs={"class": "fn"}):
        name = re.sub("\s", " ", row.find("span", attrs={"class": "fn"}).string)
        if name in dataset_df.index:
            dataset_df['All Star Games'][name] = row.contents[3].string
            if "DeAndre Jordan" in name:
                print(name, row.contents[3].string, dataset_df['All Star Games'][name])


DeAndre Jordan 1 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_df['All Star Games'][name] = row.contents[3].string


In [16]:
for row in rows:
    if row.find_all("span", attrs={"class": "fn"}):
        name = re.sub("\s", " ", row.find("span", attrs={"class": "fn"}).string)
        if name in dataset_df.index:
            dataset_df['All Star Games'][name] = row.contents[3].string

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_df['All Star Games'][name] = row.contents[3].string


In [17]:
dataset_df.loc["Mike Dunleavy"]

Unnamed: 0_level_0,year_start,year_end,position,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Hall of Fame,All Star Games
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Mike Dunleavy,1977,1990,G,13.5,0.549594,0.407481,0.278375,2.209375,10.865625,6.51875,...,4508.0,5456.0,4214.0,1156.0,379.0,2271.0,3281.0,15921.0,No,0
Mike Dunleavy,2003,2017,F-G,13.5,0.549594,0.407481,0.278375,2.209375,10.865625,6.51875,...,4508.0,5456.0,4214.0,1156.0,379.0,2271.0,3281.0,15921.0,No,0


In [18]:
dataset_df.loc["Michael Jordan"]

year_start             1985
year_end               2003
position                G-F
PER                 27.3733
TS%                0.559467
3PAr              0.0709333
FTr                   0.354
ORB%                    4.7
DRB%                  14.08
TRB%                   9.38
AST%                24.8067
STL%                    3.1
BLK%                1.46667
TOV%                9.29333
USG%                   33.6
WS/48                0.2366
OBPM                6.45333
DBPM               0.966667
BPM                    7.46
FG%                0.488267
3P%                0.283933
2P%                0.500667
eFG%                    0.5
FT%                0.830933
G                      1072
GS                     1039
MP                    41011
OWS                   149.9
DWS                    64.2
WS                    213.9
FG                    12192
FGA                   24537
3P                      581
3PA                    1778
2P                    11611
2PA                 

In [19]:
dataset_df

Unnamed: 0_level_0,year_start,year_end,position,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Hall of Fame,All Star Games
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alaa Abdelnaby,1991,1995,F-C,9.833333,0.475889,0.004222,0.244000,9.788889,19.700000,14.766667,...,851.0,1297.0,125.0,111.0,107.0,389.0,777.0,2299.0,No,0
Zaid Abdul-Aziz,1969,1978,C-F,13.750000,0.450071,,0.345857,15.171429,25.400000,19.210000,...,1155.0,4524.0,648.0,134.0,208.0,28.0,1264.0,4978.0,No,0
Kareem Abdul-Jabbar,1970,1989,C,24.085000,0.591500,0.001800,0.325100,7.618750,21.125000,15.305263,...,9394.0,17440.0,5660.0,1160.0,3189.0,2527.0,4657.0,38387.0,Yes,19
Mahmoud Abdul-Rauf,1991,2001,G,14.877778,0.497111,0.158778,0.134667,1.477778,6.111111,3.744444,...,868.0,1087.0,2079.0,487.0,46.0,963.0,1106.0,8553.0,No,0
Tariq Abdul-Wahad,1998,2003,F,9.930000,0.410100,0.033200,0.296900,7.310000,12.120000,9.700000,...,723.0,1151.0,388.0,263.0,121.0,442.0,688.0,2662.0,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Stephen Zimmerman,2017,2017,C,7.300000,0.346000,0.000000,0.161000,10.800000,24.900000,17.600000,...,24.0,35.0,4.0,2.0,5.0,3.0,17.0,23.0,No,0
Paul Zipser,2017,2018,G-F,6.900000,0.503000,0.448000,0.181000,1.900000,14.200000,8.000000,...,110.0,125.0,36.0,15.0,16.0,40.0,78.0,240.0,No,0
Jim Zoet,1983,1983,C,-0.800000,0.200000,0.000000,0.000000,10.200000,17.700000,13.900000,...,5.0,8.0,1.0,1.0,3.0,4.0,9.0,2.0,No,0
Bill Zopf,1971,1971,G,9.600000,0.391000,,0.267000,,,5.500000,...,0.0,46.0,73.0,0.0,0.0,0.0,34.0,118.0,No,0
