In [1]:
# Combine the previously collected player, draft and salary CSVs, then clean and impute them
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import time
import pickle

from sklearn.linear_model import LinearRegression

In [2]:
# Load the draft-info and player-measurement tables; the first CSV column
# becomes the index of each frame.
draft_df = pd.read_csv('data/draft_info.csv', index_col=0)
player_measurement = pd.read_csv('data/player_measurement.csv', index_col=0)

In [3]:
# Load the two "additional" tables; the first CSV column becomes the index.
add_draft_data = pd.read_csv('data/additional_draft_data.csv', index_col=0)
add_data = pd.read_csv('data/additional_data.csv', index_col=0)

In [4]:
# Load the salary table; the first CSV column becomes the index.
salary_df = pd.read_csv(
    'data/salary_data.csv',
    index_col=0,
)

# Dealing with players who share the same name

In [5]:
def add_name_index(df):
    """Number repeated names so that same-named rows can be told apart.

    Adds (or overwrites) a ``name_index`` column on ``df`` in place. The
    first row carrying a given ``name`` gets 1.0, the second 2.0, the third
    3.0, and so on, in row order. Together with ``name`` this forms a
    composite key for merging tables.

    The earlier shift-based implementation gave *every* occurrence after the
    first the value 2, so a name that appeared three or more times produced
    duplicate ``(name, name_index)`` keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``name`` column. Modified in place.

    Returns
    -------
    None
    """
    # cumcount() is 0-based within each name group; shift it to 1-based.
    occurrence = df.groupby('name').cumcount() + 1
    # Rows with a missing name are not grouped consistently across pandas
    # versions; pin them to 1, which is what the original code produced.
    occurrence = occurrence.where(df['name'].notnull(), 1)
    # Keep the float dtype the column has always had, so keys built by older
    # runs (e.g. values read back from CSV) still compare equal.
    df['name_index'] = occurrence.astype(float)

In [6]:
# Tag repeated names on both tables, then left-join the measurements onto
# the draft rows using the (name, name_index) composite key.
for table in (draft_df, player_measurement):
    add_name_index(table)
merged_df = draft_df.merge(player_measurement, how='left', on=['name', 'name_index'])

In [7]:
# Same treatment for the additional tables: number repeated names, then
# left-join on the (name, name_index) composite key.
for table in (add_data, add_draft_data):
    add_name_index(table)
merged_add_df = add_data.merge(add_draft_data, how='left', on=['name', 'name_index'])

In [8]:
# Show column dtypes and non-null counts of the merged additional data.
merged_add_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 938 entries, 0 to 937
Data columns (total 37 columns):
college          938 non-null object
draft_rank       938 non-null int64
draft_year       938 non-null int64
height_in_x      0 non-null float64
name             938 non-null object
reach_in         0 non-null float64
weight_lb_x      0 non-null float64
wingspan_in      0 non-null float64
name_index       938 non-null float64
per              934 non-null float64
no_of_seasons    934 non-null float64
start_age        934 non-null float64
end_age          934 non-null float64
g                934 non-null float64
mp               866 non-null float64
fg               933 non-null float64
fga              933 non-null float64
fg3              461 non-null float64
fg3a             461 non-null float64
ft               933 non-null float64
fta              933 non-null float64
orb              0 non-null float64
ast              615 non-null float64
stl              742 non-null float64

In [9]:
# Show column dtypes and non-null counts of the merged draft/measurement data.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 807 entries, 0 to 806
Data columns (total 35 columns):
name             807 non-null object
per              807 non-null object
no_of_seasons    807 non-null float64
start_age        807 non-null int64
end_age          807 non-null int64
g                807 non-null int64
mp               754 non-null float64
fg               807 non-null int64
fga              807 non-null int64
fg3              807 non-null int64
fg3a             807 non-null int64
ft               807 non-null int64
fta              807 non-null int64
orb              298 non-null float64
ast              807 non-null int64
stl              798 non-null float64
blk              798 non-null float64
tov              639 non-null float64
pf               556 non-null float64
pts              807 non-null int64
fg_pct           807 non-null float64
fg3_pct          753 non-null float64
ft_pct           807 non-null float64
mp_per_g         754 non-null float64
pts_per

In [10]:
# Example of a shared name: the matching rows are distinguished by name_index.
player_measurement.loc[player_measurement['name'] == 'Marcus Williams']

Unnamed: 0,college,draft_rank,draft_year,height_in,name,reach_in,weight_lb,wingspan_in,name_index
361,University of Connecticut,22,2006,,Marcus Williams,,,,1.0
413,University of Arizona,33,2007,,Marcus Williams,,,,2.0


In [11]:
# Stack the two merged tables into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat
# is the supported equivalent. Columns present in only one of the tables are
# filled with NaN for the other table's rows.
final_df = pd.concat([merged_df, merged_add_df], ignore_index=True)

In [12]:
# Show column dtypes and non-null counts of the combined frame.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1745 entries, 0 to 1744
Data columns (total 39 columns):
ast              1422 non-null float64
ast_per_g        1422 non-null float64
blk              1537 non-null float64
college          1735 non-null object
draft_rank       1735 non-null float64
draft_year       1735 non-null float64
end_age          1741 non-null float64
fg               1740 non-null float64
fg3              1268 non-null float64
fg3_pct          1214 non-null float64
fg3a             1268 non-null float64
fg_pct           1740 non-null float64
fga              1740 non-null float64
ft               1740 non-null float64
ft_pct           1740 non-null float64
fta              1740 non-null float64
g                1741 non-null float64
height_in        601 non-null float64
height_in_x      0 non-null float64
height_in_y      934 non-null float64
mp               1620 non-null float64
mp_per_g         1620 non-null float64
name             1745 non-null object
nam

In [13]:
# Keep only the first row for each distinct (name, salary) pair.
salary_df = salary_df.drop_duplicates(subset=['name', 'salary'])

In [14]:
# Number repeated names in the salary table (adds name_index in place) so it
# can be joined on the (name, name_index) key.
add_name_index(salary_df)

In [15]:
# Left join: every row of final_df is kept; salary columns are NaN where no
# (name, name_index) match exists.
df = final_df.merge(salary_df, how='left', on=['name', 'name_index'])

In [16]:
# Where the primary height/weight is missing but the *_y column has a value,
# copy the *_y value across.
height_gap = df['height_in'].isnull() & df['height_in_y'].notnull()
df.loc[height_gap, 'height_in'] = df['height_in_y']

weight_gap = df['weight_lb'].isnull() & df['weight_lb_y'].notnull()
df.loc[weight_gap, 'weight_lb'] = df['weight_lb_y']

In [17]:

# Columns with too little data to be useful, plus the leftover _x/_y
# height and weight columns.
sparse_or_redundant = [
    'orb', 'tov', 'pf',
    'height_in_x', 'height_in_y',
    'weight_lb_x', 'weight_lb_y',
]
df = df.drop(sparse_or_redundant, axis=1)

In [18]:
# Convert PER to a numeric dtype; entries that cannot be parsed become NaN.
# 'force' is not a valid `errors` option (only 'raise' and 'coerce', plus the
# deprecated 'ignore') and current pandas rejects it with a ValueError;
# 'coerce' is the option that turns unparseable values into NaN.
df['per'] = pd.to_numeric(df['per'], errors='coerce')

In [19]:
# Blank out weight entries whose text is not made up solely of numeric
# characters. Non-string entries yield NaN from .str.isnumeric() and are
# therefore left untouched.
not_numeric_text = df['weight_lb'].str.isnumeric().eq(False)
df.loc[not_numeric_text, 'weight_lb'] = np.nan

In [20]:
# Convert weight to a numeric dtype. With errors='ignore' a single
# unparseable entry made to_numeric hand back the column unchanged (still
# object dtype) without any signal, and that option is deprecated in recent
# pandas. 'coerce' always yields a numeric column, with NaN for bad entries.
df['weight_lb'] = pd.to_numeric(df['weight_lb'], errors='coerce')

In [21]:
# A measurement of exactly 0 is treated as missing.
for measurement in ('height_in', 'wingspan_in', 'reach_in'):
    df.loc[df[measurement] == 0, measurement] = np.nan

In [22]:
# Save the merged frame (index included) before any imputation is applied.
# NOTE(review): this is written to the working directory, whereas every other
# file in this notebook lives under data/ — confirm that is intended.
df.to_csv('final_df.csv')

In [23]:
# Pairwise correlations between the numeric columns. Older pandas dropped
# non-numeric columns from corr() silently; pandas >= 2.0 raises when the
# frame still holds string columns (name, college, ...), so select the
# numeric/bool columns explicitly. This works on old and new versions alike.
cor = df.select_dtypes(include=['number', 'bool']).corr()
# Top three correlates of standing reach; the first entry is reach_in itself.
cor['reach_in'].sort_values(ascending=False)[:3]

reach_in       1.000000
height_in      0.913513
wingspan_in    0.903681
Name: reach_in, dtype: float64

In [24]:
# Simple linear model predicting wingspan from height, fitted on the rows
# where both values are known.
df_wingspan = df[['height_in', 'wingspan_in']].dropna()
# scikit-learn wants a 2-D feature matrix. Indexing a Series with
# [:, np.newaxis] was deprecated in pandas 1.x and raises in 2.0, so reshape
# the underlying NumPy array instead.
X = df_wingspan['height_in'].values.reshape(-1, 1)
y = df_wingspan['wingspan_in']

lr_wingspan = LinearRegression()
lr_wingspan.fit(X, y)
# In-sample R^2: scored on the same rows the model was fitted on.
lr_wingspan.score(X, y)

0.68962092699941246

In [25]:
# Simple linear model predicting standing reach from height, fitted on the
# rows where both values are known.
df_reach = df[['height_in', 'reach_in']].dropna()
# scikit-learn wants a 2-D feature matrix. Indexing a Series with
# [:, np.newaxis] was deprecated in pandas 1.x and raises in 2.0, so reshape
# the underlying NumPy array instead.
X = df_reach['height_in'].values.reshape(-1, 1)
y = df_reach['reach_in']

lr_reach = LinearRegression()
lr_reach.fit(X, y)
# In-sample R^2: scored on the same rows the model was fitted on.
lr_reach.score(X, y)

0.83450571240710936

In [28]:
# Top three correlates of minutes played; the first entry is mp itself.
cor['mp'].sort_values(ascending=False).head(3)

mp     1.000000
g      0.740903
pts    0.728444
Name: mp, dtype: float64

In [30]:
# Simple linear model predicting minutes played from games played, fitted on
# the rows where both values are known.
df_mp = df[['mp', 'g']].dropna()
# scikit-learn wants a 2-D feature matrix. Indexing a Series with
# [:, np.newaxis] was deprecated in pandas 1.x and raises in 2.0, so reshape
# the underlying NumPy array instead.
X = df_mp['g'].values.reshape(-1, 1)
y = df_mp['mp']

lr_mp = LinearRegression()
lr_mp.fit(X, y)
# In-sample R^2: scored on the same rows the model was fitted on.
lr_mp.score(X, y)

0.54893782820715997

In [31]:
# Fill missing wingspan from height using the fitted regression, for rows
# that have a height but no wingspan.
mask = df.height_in.notnull() & df.wingspan_in.isnull()
# predict() raises on an empty feature matrix, so skip when nothing is missing.
if mask.any():
    # Series[:, np.newaxis] raises in pandas 2.0; reshape the NumPy array instead.
    wingspan_predictions = lr_wingspan.predict(df.loc[mask, 'height_in'].values.reshape(-1, 1))
    df.loc[mask, 'wingspan_in'] = wingspan_predictions

In [32]:
# Fill missing standing reach from height using the fitted regression, for
# rows that have a height but no reach.
mask = df.height_in.notnull() & df.reach_in.isnull()
# predict() raises on an empty feature matrix, so skip when nothing is missing.
if mask.any():
    # Series[:, np.newaxis] raises in pandas 2.0; reshape the NumPy array instead.
    reach_predictions = lr_reach.predict(df.loc[mask, 'height_in'].values.reshape(-1, 1))
    df.loc[mask, 'reach_in'] = reach_predictions

In [33]:
# Fill missing minutes played from games played using the fitted regression,
# for rows that have a games count but no minutes.
mask = df.g.notnull() & df.mp.isnull()
# predict() raises on an empty feature matrix, so skip when nothing is missing.
if mask.any():
    # Series[:, np.newaxis] raises in pandas 2.0; reshape the NumPy array instead.
    mp_predictions = lr_mp.predict(df.loc[mask, 'g'].values.reshape(-1, 1))
    df.loc[mask, 'mp'] = mp_predictions

In [35]:
def fill_null_with_average(column, frame=None):
    """Replace missing values in one column with that column's mean.

    Parameters
    ----------
    column : str
        Name of the column to fill.
    frame : pandas.DataFrame, optional
        Frame to modify in place. When omitted, the notebook-level ``df`` is
        used (looked up at call time), which is what the original
        one-argument form always operated on.

    Returns
    -------
    None
    """
    if frame is None:
        frame = df
    # Series.mean() skips NaN, so only observed values contribute.
    mean = frame[column].mean()
    frame.loc[frame[column].isnull(), column] = mean

# Whatever is still missing in these columns is replaced by the column mean.
columns_to_fill = [
    'height_in', 'weight_lb', 'mp', 'fg3_pct', 'mp_per_g',
    'blk', 'stl', 'reach_in', 'wingspan_in',
]
for col in columns_to_fill:
    fill_null_with_average(col)



In [36]:
# PER above 30 is not possible, so rows reporting one are discarded.
# The original blanked every column of such rows with NaN and relied on the
# draft_year filter below to drop them (NaN < 2016 is False). Excluding them
# directly keeps the same rows without wiping names or upcasting dtypes.
per_is_plausible = ~(df.per > 30)
# we're getting inconsistent PER with the last nba picks, so keep only
# players drafted before 2016. .copy() makes df an independent frame so the
# assignment below does not write into a slice of the old one.
df = df.loc[per_is_plausible & (df.draft_year < 2016)].copy()

# With fewer than 5 made three-pointers, replace fg3_pct with the mean
# fg3_pct of the remaining rows.
# NOTE(review): the original comment also mentioned fg3_pct == 1, but no code
# handles that case — confirm whether it is still wanted.
df.loc[df.fg3 < 5, 'fg3_pct'] = df.fg3_pct.mean()

In [38]:
# Names of the players whose shooting hand is missing.
df.loc[df['shoot'].isnull(), 'name']

410     Marcus Williams
558    Tristan Thompson
655        Nerlens Noel
Name: name, dtype: object

# Research
- Researched the players listed above online; a shooting hand is recorded below for two of them (Marcus Williams is left unfilled).
- Tristan Thompson switched from his left to his right hand.
- Nerlens Noel can shoot with both hands.

In [39]:
# Record the researched shooting hand for each player, matched by name.
researched_hands = {'Tristan Thompson': 'switch', 'Nerlens Noel': 'both'}
for player_name, hand in researched_hands.items():
    df.loc[df['name'] == player_name, 'shoot'] = hand

In [41]:
# Save the cleaned frame (index included) to data/data_salary.csv.
df.to_csv('data/data_salary.csv')