In [1]:
#importing libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def download_player_html(playerID):
    last_initial = playerID[0]
    url_base = "https://www.basketball-reference.com/players/{}/{}.html"
    url = url_base.format(last_initial, playerID)
    data = requests.get(url)
     
    #page is saved as an html and placed in Players folder
    with open("Players/{}.html".format(playerID), "w+", encoding = "utf-8") as f:
        f.write(data.text)

In [3]:
def sort_rookies_FTA(csv_file):
    rookie = pd.read_csv(csv_file)
    rookie_FTA = rookie[rookie["FTA"] > 0]
    rookie_FTA_sorted = rookie_FTA.sort_values("FTA", ascending=False)
    rookie_FTA_top_100 = rookie_FTA_sorted.head(100)
    return rookie_FTA_top_100

In [4]:
def get_3PT_percentage_5yrs(playerID):
    text_list = []
        
#   open and read Player HTML page    
    with open("Players/{}.html".format(playerID), encoding = "utf-8") as r:
        page = r.read()
    
# create beautiful soup object from HTML
    soup = BeautifulSoup(page, "html.parser")
 
    #returning 5 seasons worth of "fg3_pct" = tr[4]td[13]
    for i in range(5):
        out = [tr.get_text() for tr in soup.select("tr.full_table")[i]]
        if out[13] != '':
            text_list.append(float(out[13]))
    r.close()
    return text_list

## Import and Sort DataFrames 

### Import Rookie Seasonal Data from Archived CSV

In [5]:
top_100_rookies_FTA = sort_rookies_FTA('Spreadsheets/NBA_Rookies_90-16.csv')

### Filter top FTA rookies with at least one 3PT make

In [6]:
#of the top 100 rookies with the most FTA, which made least one made three point shot
top_FTA_rookies_3PT = top_100_rookies_FTA[top_100_rookies_FTA["3P%"] > 0]

### Clean up DataFrame

In [7]:
top_FTA_rookies_3PT = top_FTA_rookies_3PT.reset_index(drop=True)

### Print DataFrame and verify attributes

In [8]:
print(top_FTA_rookies_3PT)
print(type(top_FTA_rookies_3PT))
print(len(top_FTA_rookies_3PT))

                 Player                     Debut  Age  Yrs   G    MP   FG  \
0         Blake Griffin  Oct 27, '10, LAC vs. POR   21    1  82  3112  696   
1    Christian Laettner     Nov 6, '92, MIN @ BOS   23    1  81  2823  503   
2        Allen Iverson*   Nov 1, '96, PHI vs. MIL   21    1  76  3045  625   
3       Carmelo Anthony  Oct 29, '03, DEN vs. SAS   19    1  82  2995  624   
4   Shareef Abdur-Rahim   Nov 1, '96, VAN vs. POR   20    1  80  2802  550   
..                  ...                       ...  ...  ...  ..   ...  ...   
74         Isaiah Rider     Nov 5, '93, MIN @ DET   22    1  79  2415  522   
75    Richard Jefferson  Oct 30, '01, NJN vs. IND   21    1  79  1917  270   
76        Lamond Murray   Nov 4, '94, LAC vs. POR   21    1  81  2556  439   
77         Marcus Camby   Nov 1, '96, TOR vs. NYK   22    1  63  1897  375   
78        Nick Anderson   Nov 4, '89, ORL vs. NJN   22    1  81  1785  372   

     FGA   3P  3PA  ...   PTS    FG%    3P%    FT%  MP.1  PTS.1

In [9]:
#getting player 3PT data five years
print(top_FTA_rookies_3PT[["Player","FTA", "FT%", "3P%", "BR ID", "Year"]])
print(type(top_FTA_rookies_3PT[["Player","FTA", "FT%", "3P%", "BR ID", "Year"]]))

                 Player  FTA    FT%    3P%      BR ID  Year
0         Blake Griffin  695  0.642  0.292  griffbl01  2011
1    Christian Laettner  553  0.835  0.100  laettch01  1993
2        Allen Iverson*  544  0.702  0.341  iversal01  1997
3       Carmelo Anthony  525  0.777  0.322  anthoca01  2004
4   Shareef Abdur-Rahim  519  0.746  0.259  abdursh01  1997
..                  ...  ...    ...    ...        ...   ...
74         Isaiah Rider  265  0.811  0.360  rideris01  1994
75    Richard Jefferson  265  0.713  0.232  jefferi01  2002
76        Lamond Murray  264  0.754  0.298  murrala01  1995
77         Marcus Camby  264  0.693  0.143  cambyma01  1997
78        Nick Anderson  264  0.705  0.059  anderni01  1990

[79 rows x 6 columns]
<class 'pandas.core.frame.DataFrame'>


## Downloading Player Pages

In [22]:
#segmenting "BR ID" series into thirds to avoid site restrictions
top_third = top_FTA_rookies_3PT["BR ID"].iloc[0:28]
mid_third = top_FTA_rookies_3PT["BR ID"].iloc[28:56]
bottom_third = top_FTA_rookies_3PT["BR ID"].iloc[56:]

In [21]:
#printing the segmented series' "BR ID"s
print("Top Third:")
print(top_third)
print(type(top_third))

print("Middle Third:")
print(mid_third)

print("Bottom Third:")
print(bottom_third)

Top Third:
0     griffbl01
1     laettch01
2     iversal01
3     anthoca01
4     abdursh01
5     stackje01
6      hillgr01
7     robingl01
8     stoudam01
9     gasolpa01
10    wiggian01
11    evansty01
12     paulch01
13    jamesle01
14    duranke01
15    colemde01
16    mashbja01
17    simmoli01
18    westbru01
19     odomla01
20    bakervi01
21    johnsla02
22    weathcl01
23    marcisa01
24    couside01
25     walljo01
26    smithjo02
27    butleca01
Name: BR ID, dtype: object
<class 'pandas.core.series.Series'>
Middle Third:
28     mingya01
29    kirilan01
30    cartemi01
31    walkean02
32    francst01
33    grantbr01
34    gordoer01
35    fortsda01
36    marbust01
37     loveke01
38     maystr01
39    hardaan01
40    douglsh01
41    finlemi01
42    lillada01
43    augmost01
44    oladivi01
45    ellisla01
46    anderde01
47     wadedw01
48    owensbi01
49    sabonar01
50    vanhoke01
51    williwa02
52    ilgauzy01
53    willier01
54      dayto01
55    stoudda01
Name: BR ID, dty

In [23]:
#for loop to download first third of player pages
for playerIDs in top_third:
    print(playerIDs)
    download_player_html(playerIDs)

griffbl01
laettch01
iversal01
anthoca01
abdursh01
stackje01
hillgr01
robingl01
stoudam01
gasolpa01
wiggian01
evansty01
paulch01
jamesle01
duranke01
colemde01
mashbja01
simmoli01
westbru01
odomla01
bakervi01
johnsla02
weathcl01
marcisa01
couside01
walljo01
smithjo02
butleca01


In [None]:
#for loop to download middle third of player pages
for playerIDs in mid_third:
    print(playerIDs)
    download_player_html(playerIDs)

In [None]:
#for loop to download last third of player pages
for playerIDs in bottom_third:
    print(playerIDs)
    download_player_html(playerIDs)

## Creating BR ID and 5 year 3P% Dataframe

### for loop

In [13]:
#single list object soon to house other lists
BR_ID_and_3PT_list =[]

In [14]:
for playerID in top_FTA_rookies_3PT["BR ID"]:
    temp_list = get_3PT_percentage_5yrs(playerID)
    temp_list.insert(0, playerID)
    BR_ID_and_3PT_list.append(temp_list)

In [15]:
#checking to see that this list is indeed list of lists: outerList[[innerList_1], [innerList_2],...[innerList_n]]
BR_ID_and_3PT_list

[['griffbl01', 0.292, 0.125, 0.179, 0.273, 0.4],
 ['laettch01', 0.1, 0.24, 0.325, 0.231, 0.352],
 ['iversal01', 0.341, 0.298, 0.291, 0.341, 0.32],
 ['anthoca01', 0.322, 0.266, 0.243, 0.268, 0.354],
 ['abdursh01', 0.259, 0.412, 0.306, 0.302, 0.188],
 ['stackje01', 0.318, 0.298, 0.241, 0.278, 0.288],
 ['hillgr01', 0.148, 0.192, 0.303, 0.143, 0.0],
 ['robingl01', 0.321, 0.342, 0.35, 0.385, 0.392],
 ['stoudam01', 0.2, 0.2, 0.188, 0.0, 0.0],
 ['gasolpa01', 0.2, 0.1, 0.267, 0.167, 0.25],
 ['wiggian01', 0.31, 0.3, 0.356, 0.331, 0.339],
 ['evansty01', 0.255, 0.291, 0.202, 0.338, 0.221],
 ['paulch01', 0.282, 0.35, 0.369, 0.364, 0.409],
 ['jamesle01', 0.29, 0.351, 0.335, 0.319, 0.315],
 ['duranke01', 0.288, 0.422, 0.365, 0.35, 0.387],
 ['colemde01', 0.342, 0.303, 0.232, 0.314, 0.233],
 ['mashbja01', 0.284, 0.328, 0.343, 0.325, 0.303],
 ['simmoli01', 0.273, 0.2, 0.091, 0.353, 0.375],
 ['westbru01', 0.271, 0.221, 0.33, 0.316, 0.323],
 ['odomla01', 0.36, 0.316, 0.19, 0.326, 0.298],
 ['bakervi01', 0

In [16]:
#create new dataframe with rookie ID and 5 years worth of 3P% data
rookie_df = pd.DataFrame(BR_ID_and_3PT_list, columns=['BR ID', 'Yr1', 'Yr2', 'Yr3', 'Yr4', 'Yr5'])

In [17]:
#verify the database infromation
rookie_df

Unnamed: 0,BR ID,Yr1,Yr2,Yr3,Yr4,Yr5
0,griffbl01,0.292,0.125,0.179,0.273,0.400
1,laettch01,0.100,0.240,0.325,0.231,0.352
2,iversal01,0.341,0.298,0.291,0.341,0.320
3,anthoca01,0.322,0.266,0.243,0.268,0.354
4,abdursh01,0.259,0.412,0.306,0.302,0.188
...,...,...,...,...,...,...
74,rideris01,0.360,0.351,0.371,0.385,0.321
75,jefferi01,0.232,0.250,0.364,0.337,0.319
76,murrala01,0.298,0.319,0.341,0.353,0.330
77,cambyma01,0.143,0.000,0.500,0.125,


## Plotting Using Seaborn

In [18]:
tips = sns.load_dataset("tips")

In [19]:
tips.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [20]:
sns.relplot(data=rookie_df, x="total_bil", y="tip", hue="day")

ValueError: Could not interpret value `total_bil` for parameter `x`

In [None]:
FTA_3Pper_plot = sns.relplot(x=rookie_FTA_top_30["3P%"]>0, y=rookie_FTA_top_30["FTA"])

In [None]:
FTA_3Pper_plot = sns.relplot(data=rookie_FTA_top_30, x="3P%", y="FTA")

In [None]:
FTA_plot = sns.relplot(x="Player", y="FTA", data=rookie_FTA_top_30)

In [None]:
FTA_plot = FTA_plot.set