In [6]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression as Lin_Reg

import matplotlib.cm as cmx
import matplotlib.colors as colors

from bs4 import BeautifulSoup
import urllib

### Reading in the data:

In [7]:
dd = pd.read_csv("NBA_hothands_data.csv",delimiter = ',')
def_name = dd.copy()

unique_check = pd.unique(def_name.CLOSEST_DEFENDER)
defenders = pd.DataFrame(unique_check, columns = ['full_name']) # create dataframe of defenders

unique_check2 = pd.unique(def_name.player_name)
shooters = pd.DataFrame(unique_check2,columns = ['full_name']) # create dataframe of shooters

---

# Defender Data Scraping:

In [8]:
# All the players have a similar pattern in their biopage URLs
# example: James Harden's URL --> http://www.basketball-reference.com/players/h/hardeja01.html
url1 = "http://www.basketball-reference.com/players/"
url2 = "01.html"

defenders['bio_url'] = 'x'

# Grabbing defender data
cnt = 0
for t in range(len(defenders)):
    name = defenders.full_name[t].split(", ") # splitting the name into first and last name
    
    # if the name fits the normal first and last name format:
    if len(name)==2:
        xx0 = name[0]
        xx1 = name[1]
        
        # use the first and last name to put together the unique URL
        bio_page_url = url1 + xx0[0].lower() + '/' + xx0[0:5].lower() + xx1[0:2].lower() + url2
        
        # store the URL
        defenders['bio_url'][t] = bio_page_url
    else:
        cnt = cnt+1
        print name
        print t

#Nene has no listed last name so his url is added manually
defenders.bio_url[46] = 'http://www.basketball-reference.com/players/h/hilarne01.html'

['Nene']
46


***initial height, weight, and birth year data scraping:***

In [9]:
# initialize defender data to be scraped
defenders['height'] = 'x'
defenders['weight'] = 'x'
defenders['birth_year'] = 'x'

cnt = 0

# looping through all defenders
for t in range(len(defenders)):
    if defenders.bio_url[t]!='x':             # if there the url for the player's bio exists
        site = defenders.bio_url[t]           # this is the site
        page = urllib.urlopen(site)           # this is the page
        soup = BeautifulSoup(page,"lxml")     # this is the soup for the page
        p = soup.findAll("span")
        
        # height
        if soup.find(itemprop='height')!=None:
            defenders['height'][t] = soup.find(itemprop='height').get_text()[0:3]
            
        # weight
        if soup.find(itemprop='weight')!=None:
            defenders['weight'][t] = int(soup.find(itemprop='weight').get_text()[0:3])
            
        # birth year
        if soup.find(itemprop="birthDate")!=None:
            temp_year = soup.find(itemprop="birthDate").get_text()[-5:]
            defenders['birth_year'][t] = int(temp_year[0:-1])
    if t%20==0:
            print t


0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460


***converting strings to integers and checking for players misidentified as older players of the same name:***

In [10]:
# many players have the same name as a previous player. The ages of the players belie whether or not they have 
# been misidentified


# this section of the code also converts the height into inches, the weight into lbs (from a string), and the age into
# years (from the birth date string)

defenders['height_in'] = 'x'
defenders['weight_lbs'] = 'x'
defenders['age_years'] = 'x'
url3 = "02.html"

# these two variables help with debugging the code and making sure it is picking up the duplicates
duplicate_count=0
duplicate_locs=[]

# looping through the defender data
for t in range(len(defenders)):
    
    if defenders.height[t]!='x': # missing data given 'x' as placeholder
        h = defenders.height[t].split("-")              # height string
    else:
        h = [3,2,1]
    h_test = defenders.height[t].split(" ") 
    
    w = defenders.weight[t]                         # weight string
    
    if (len(h)==2) and (len(h_test)==1): # if there is no error in the formatting of the height string:
        defenders['height_in'][t] = int(h[0])*12 + int(h[1]) # converting the height string (in feet and inches) into inches
    if (w!='x'):
        defenders['weight_lbs'][t] = w
    if (defenders.birth_year[t]!='x'):
        y = 2013-defenders.birth_year[t]                # age in years
        # if player is too old, most likely duplicate name. Get new url and recalculate height, weight, age
        if y<45: # if player age is below 45 years
            Done=1
        else:          # if the player is too old, record as a duplicate
            Done=0
            duplicate_count+=1
            duplicate_locs.append(t)
            print t

            
        url_num=1    
        
        # the following while loop continues until the player is a reasonable age. There are very very few duplicate names
        # among currently active players, so picking a player with the correct age is a good proxy
        while Done!=1:
            url_num+=1 # counter
            url_new = "0"+str(url_num)+".html" # create a new url string containing the next count number (pattern in the URLs)
            
            name = defenders.full_name[t].split(" ")
            xx0 = name[0]
            xx1 = name[1]
            
            bio_page_url2 = url1 + xx0[0].lower() + '/' + xx0[0:5].lower() + xx1[0:2].lower() + url_new
            
            defenders['bio_url'][t] = bio_page_url2 # replace the bio_url listed with the current URL to test
            page2 = urllib.urlopen(bio_page_url2)
            soup2 = BeautifulSoup(page2,"lxml") # open the page and get the soup
            
            # grab the new height, store temporarily as the correct height in the dataframe
            if soup2.find(itemprop='height')!=None:
                temp_h1 = soup2.find(itemprop='height').get_text()[0:3]
                temp_h2 = defenders.height[t].split("-")
                if len(temp_h2)==2:
                    defenders['height'][t] = temp_h1
                    defenders['height_in'][t] = int(temp_h2[0])*12 + int(temp_h2[1])
                    
            # store the new weight temporarily as the correct weight in the dataframe
            if soup2.find(itemprop='weight')!=None:
                defenders['weight'][t] = soup2.find(itemprop='weight').get_text()
                defenders['weight_lbs'][t] = int(soup2.find(itemprop='weight').get_text()[0:3])
                defenders['weight'][t] = int(soup.find(itemprop='weight').get_text()[0:3])
            # store the new age in temporarily in the dataframe as the correct age
            if soup2.find(itemprop="birthDate")!=None:
                temp_year = soup2.find(itemprop="birthDate").get_text()[-5:]
                defenders['birth_year'][t] = int(temp_year[0:-1])
                y = 2013-defenders.birth_year[t]
                
            # if new age is less than 45 years old, or it has continued 3 times, then loop is done
            # otherwise, the count is increased and the loop repeats
            if y<45 or url_num>3:
                Done=1
            if url_num>3:
                y = 2013-defenders.birth_year[t] 
        defenders['age_years'][t] = y


16
26
72
91
115
126
140
148
151
183
188
206
244
270
277
279
283
289
300
334
349
353
354
361
388
409
437
446
463


---

In [12]:
defenders

Unnamed: 0,full_name,bio_url,height,weight,birth_year,height_in,weight_lbs,age_years
0,"Anderson, Alan",http://www.basketball-reference.com/players/a/...,6-6,220,1982,78,220,31
1,"Bogdanovic, Bojan",http://www.basketball-reference.com/players/b/...,6-6,205,1992,78,205,21
2,"Brown, Markel",http://www.basketball-reference.com/players/b/...,6-3,185,1974,75,185,39
3,"Young, Thaddeus",http://www.basketball-reference.com/players/y/...,6-8,221,1988,80,221,25
4,"Williams, Deron",http://www.basketball-reference.com/players/w/...,6-3,200,1984,75,200,29
5,"Jack, Jarrett",http://www.basketball-reference.com/players/j/...,6-3,200,1983,75,200,30
6,"Plumlee, Mason",http://www.basketball-reference.com/players/p/...,6-1,245,1990,73,245,23
7,"Morris, Darius",http://www.basketball-reference.com/players/m/...,6-4,190,1991,76,190,22
8,"Ellington, Wayne",http://www.basketball-reference.com/players/e/...,6-4,200,1987,76,200,26
9,"Lin, Jeremy",http://www.basketball-reference.com/players/l/...,6-3,200,1988,75,200,25


# Shooter Data Scraping:

In [13]:
# Grabbing shooter data
shooters['bio_url'] = 'x'

for t in range(len(shooters)):
    name = shooters.full_name[t].split(" ")
    xx0 = name[-1]
    xx1 = name[0]
    bio_page_url = url1 + xx0[0].lower() + '/' + xx0[0:5].lower() + xx1[0:2].lower() + url2
    shooters['bio_url'][t] = bio_page_url
    # the following if statement is used to check players with more than a simple first and last name
#     if len(name)!=2:
#         cnt = cnt+1
#         print name
#         print t

# because of the format of these players names, their bio url's were parsed incorrectly
shooters.bio_url[160] = 'http://www.basketball-reference.com/players/m/mbahalu01.html'
shooters.bio_url[188] = 'http://www.basketball-reference.com/players/h/hardati02.html'
shooters.bio_url[7] = 'http://www.basketball-reference.com/players/k/kiddgmi01.html'
print "The total number of shooters in the dataset is:  %d" % shooters.shape[0]

shooters.head(5)

The total number of shooters in the dataset is:  281


Unnamed: 0,full_name,bio_url
0,brian roberts,http://www.basketball-reference.com/players/r/...
1,bismack biyombo,http://www.basketball-reference.com/players/b/...
2,al jefferson,http://www.basketball-reference.com/players/j/...
3,cody zeller,http://www.basketball-reference.com/players/z/...
4,gary neal,http://www.basketball-reference.com/players/n/...


***initial scrape of height and weight data:***

In [14]:
# SHOOTERS
shooters['height'] = 'x'
shooters['weight'] = 'x'
shooters['birth_year'] = 'x'

cnt = 0
for t in range(len(shooters)):
    if shooters.bio_url[t]!='x':             #if there the url for the player's bio exists
        site = shooters.bio_url[t]           # this is the site
        page = urllib.urlopen(site)           # this is the page
        soup = BeautifulSoup(page,"lxml")     # this is the soup for the page
        
        # height
        if soup.find(itemprop='height')!=None:
            shooters['height'][t] = soup.find(itemprop='height').get_text()[0:3]
            
        # weight
        if soup.find(itemprop='weight')!=None:
            shooters['weight'][t] = int(soup.find(itemprop='weight').get_text()[0:3])
            
        # birth year
        if soup.find(itemprop="birthDate")!=None:
            temp_year = soup.find(itemprop="birthDate").get_text()[-5:]
            shooters['birth_year'][t] = int(temp_year[0:-1])
        print cnt
        cnt+=1


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

***converting strings to integers and checking for players misidentified as older players of the same name:***

In [15]:
shooters['height_in'] = 'x'
shooters['weight_lbs'] = 'x'
shooters['age_years'] = 'x'
url3 = "02.html"
duplicate_count=0
duplicate_locs=[]
for t in range(len(shooters)):
    
    if shooters.height[t]!='x':
        h = shooters.height[t].split("-")              # height string
    else:
        h = [3,2,1]
    h_test = shooters.height[t].split(" ") 
    
    w = shooters.weight[t]                         # weight string
    
    if (len(h)==2) and (len(h_test)==1): # if there is no error in the formatting of the height string:
        shooters['height_in'][t] = int(h[0])*12 + int(h[1])
    if (w!='x'):
        shooters['weight_lbs'][t] = w
    if (shooters.birth_year[t]!='x'):
        y = 2013-shooters.birth_year[t]                # age in years
        # if player is too old, most likely duplicate name. Get new url and recalculate height, weight, age
        if y<45: # if player age is below 45 years
            Done=1
        else: 
            Done=0
            duplicate_count+=1
            duplicate_locs.append(t)
            print t

            
        url_num=1    
        while Done!=1:
            url_num+=1
            url_new = "0"+str(url_num)+".html"
            
            name = shooters.full_name[t].split(" ")
            xx0 = name[1]
            xx1 = name[0]
            
            bio_page_url2 = url1 + xx0[0].lower() + '/' + xx0[0:5].lower() + xx1[0:2].lower() + url_new
            
            shooters['bio_url'][t] = bio_page_url2
            page2 = urllib.urlopen(bio_page_url2)
            soup2 = BeautifulSoup(page2,"lxml")
            
            # new height
            if soup2.find(itemprop='height')!=None:
                temp_h1 = soup2.find(itemprop='height').get_text()[0:3]
                temp_h2 = shooters.height[t].split("-")
                if len(temp_h2)==2:
                    shooters['height'][t] = temp_h1
                    shooters['height_in'][t] = int(temp_h2[0])*12 + int(temp_h2[1])
                    
            # new weight       
            if soup2.find(itemprop='weight')!=None:
                shooters['weight'][t] = soup2.find(itemprop='weight').get_text()
                shooters['weight_lbs'][t] = int(soup2.find(itemprop='weight').get_text()[0:3])
            
            # new age
            if soup2.find(itemprop="birthDate")!=None:
                temp_year = soup2.find(itemprop="birthDate").get_text()[-5:]
                shooters['birth_year'][t] = int(temp_year[0:-1])
                y = 2013-shooters.birth_year[t]
            # if new age is less than 45 years old, then loop is done
            if y<45 or url_num>3:
                Done=1
            if url_num>3:
                y = 2013-shooters.birth_year[t] 
        shooters['age_years'][t] = y

5
6
29
64
81
87
123
130
148
149
152
215
232
236
255
277


In [17]:
shooters.to_csv('shooter_bio_data.csv',delimiter=',',index=False)
defenders.to_csv('defender_bio_data.csv',delimiter=',',index=False)