#### Package Import

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from collections import OrderedDict
import random

#### Input Parameters

In [None]:
# Season to scrape; this value is interpolated into the basketball-reference
# URLs built in the cells below (e.g. ".../leagues/NBA_<year>_per_game.html").
year = 2021

## 1. League URL Link Scrape

In [None]:
# Download the league-wide per-game page for `year` and parse it into `soup`.

# Target web page:
url = "https://www.basketball-reference.com/leagues/NBA_" + str(year) + "_per_game.html"

# Establishing the connection to the web page.
# A timeout is set so a stalled connection cannot hang the notebook forever.
response = requests.get(url, timeout=30)

print('Status Code: ',response.status_code)

# Every later cell parses this page; stop here on an HTTP error instead of
# silently parsing an error page (e.g. a rate-limit response).
response.raise_for_status()

# Pull the HTML string out of requests and convert it to a Python string.
html = response.text

soup = BeautifulSoup(html, features="lxml")

## 1a. Scraping the NBA Links

In [None]:
# master block
#
# Build `links_with_text`: the per-player game-log paths found on the league
# per-game page held in `soup`.

# href of every anchor that has visible text, in document order.
links_with_text = [a['href'] for a in soup.find_all('a', href=True) if a.text]

# Drop everything up to and including the THIRD occurrence of
# '/leagues/NBA_<year>_adj_shooting.html' (presumably the last navigation link
# before the stats table -- verify against the live page). If the link occurs
# fewer than three times the list is left untouched.
shooting_link = '/leagues/NBA_' + str(year) + '_adj_shooting.html'
shooting_positions = [pos for pos, href in enumerate(links_with_text)
                      if href == shooting_link]
if len(shooting_positions) >= 3:
    links_with_text = links_with_text[shooting_positions[2] + 1:]

# Cut the list just before the first remaining '/leagues/NBA_<year>.html'
# link; if it is absent the list is left untouched.
season_link = '/leagues/NBA_' + str(year) + '.html'
if season_link in links_with_text:
    links_with_text = links_with_text[:links_with_text.index(season_link)]

# De-duplicate while preserving order, discard team links, and turn each
# remaining '....html' path into its '.../gamelog/' counterpart.
links_with_text = [href.replace('.html', '/gamelog/')
                   for href in OrderedDict.fromkeys(links_with_text)
                   if not href.startswith('/teams/')]

In [None]:
# Bare expression: shows the current link list as the cell output for a manual check.
links_with_text

## 1b. Scraping the NBA Player Names

In [None]:
# Build `player_list`: the unique player names, in page order, read from the
# table rows of the league per-game page held in `soup`.

header_row = ['Player', 'Pos', 'Age', 'Tm', 'G', 'GS','MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

# Skip the first <tr> (presumably the table header) and collect the text of
# every <td> cell of each remaining row. Rows without <td> cells become empty
# lists, which turn into all-NaN rows and are dropped below.
player_names = soup.findAll('tr')[1:]
player_names_data = [[td.getText() for td in tr.findAll('td')]
                     for tr in player_names]

df = pd.DataFrame(player_names_data, columns=header_row)
df.dropna(how='all', inplace=True)

# Remove all * from HoF.
# regex=False is passed explicitly: '*' is a regex metacharacter and the
# default of `regex` differs between pandas versions, so ask for a literal
# replacement instead of depending on the installed version.
df['Player'] = df['Player'].str.replace('*', '', regex=False)

# Unique names, first occurrence wins, original order kept.
player_list = list(OrderedDict.fromkeys(df['Player'].tolist()))

In [None]:
# Bare expression: shows the current player name list as the cell output for a manual check.
player_list

## 1c. Verification Size Check

In [None]:
# Sanity check: the player link list and the player name list must have the
# same length, because the scraping loop pairs them up by position.
len(links_with_text) == len(player_list)

## 1d. Scrape Individual Game Stats

In [None]:
# Column labels of a single player's game-log table, in page order.
# The two '' entries are unnamed columns and are kept as-is.
header = [
    'G', 'Date', 'Age', 'Tm', '', 'Opp', '', 'GS', 'MP',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'GmSc', '+/-',
]

# The combined table carries the same columns, with 'Player' and 'Year' in
# front and 'Link' at the end.
master_header = ['Player', 'Year'] + header + ['Link']

# Empty accumulator that the scraping loop appends each player's rows to.
master_df = pd.DataFrame(columns=master_header)
master_df

In [None]:
# Scrape every player's game log for `year` and append it to `master_df`.
#
# `links_with_text` and `player_list` are paired by position. zip() stops at
# the shorter list, so a length mismatch is reported up front instead of
# ending in an IndexError part-way through the run.
if len(links_with_text) != len(player_list):
    print('Warning: ', len(links_with_text), 'links vs', len(player_list),
          'player names - names may be paired with the wrong link')

for link, player in zip(links_with_text, player_list):

    # Target web page:
    url = "https://www.basketball-reference.com" + link + str(year)

    # Establishing the connection to the web page.
    # A timeout is set so one stalled request cannot hang the whole run.
    response = requests.get(url, timeout=30)

    # You can use status codes to understand how the target server responds to your request.
    # Ex., 200 = OK, 400 = Bad Request, 403 = Forbidden, 404 = Not Found.
    print('Status Code for Link', link, ': ', response.status_code)

    # Only parse successful responses; an error page has no game-log table.
    if response.status_code == 200:
        # Pull the HTML string out of requests and convert it to a Python string.
        html = response.text

        soup = BeautifulSoup(html, features="lxml")

        # Text of every <td> cell, one list per <tr>. Rows without <td>
        # cells become empty lists.
        rows_data = [[td.getText() for td in tr.findAll('td')]
                     for tr in soup.findAll('tr')]

        # Data rows are everything after the first row without <td> cells.
        # This is recomputed from scratch for every player so that a page
        # with no such row can never reuse the previous player's rows.
        first_empty = next((pos for pos, cells in enumerate(rows_data) if not cells), None)
        updated_rows_data = [] if first_empty is None else rows_data[first_empty + 1:]

        if updated_rows_data:
            df = pd.DataFrame(updated_rows_data, columns=header)
            df.dropna(how='all', inplace=True)
            df.reset_index(drop=True, inplace=True)
            df.insert(loc=0, column='Player', value=player)
            df.insert(loc=1, column='Year', value=year)
            df['Link'] = link + str(year)

            # Appended per player (rather than once at the end) so partial
            # results survive an interrupted run.
            master_df = pd.concat([master_df, df])
        else:
            print('No game rows found for', link)

    # Random 1-6 second pause between requests.
    time.sleep(random.randrange(1, 7, 1))

## LOOP SETUP

In [None]:
# Season used by the consolidated "LOOP SETUP" cells below; this rebinds the
# `year` set in the "Input Parameters" cell.
year = 2018

In [None]:
#part 3
# Column labels of a single player's game-log table, in page order.
# The two '' entries are unnamed columns and are kept as-is.
header = [
    'G', 'Date', 'Age', 'Tm', '', 'Opp', '', 'GS', 'MP',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'GmSc', '+/-',
]

# Same columns for the combined table, framed by 'Player'/'Year' and 'Link'.
master_header = ['Player', 'Year'] + header + ['Link']

# Fresh, empty accumulator for the consolidated loop.
master_df = pd.DataFrame(columns=master_header)
master_df

In [None]:
# Consolidated scrape of one season: collect the player links and names from
# the league per-game page, then download every player's game log and append
# it to `master_df`.
#
# Expects `year`, `header` and `master_df` to be defined by earlier cells.
#
# Parts 1 and 2 run ONCE, before the per-player loop: the league page is the
# same for every player, and building the link list inside the loop reused
# the loop index `i` for its own scans, so the player lookup afterwards read
# the wrong entry.

#part 1
url = "https://www.basketball-reference.com/leagues/NBA_" + str(year) + "_per_game.html"
# A timeout is set so a stalled connection cannot hang the notebook forever.
response = requests.get(url, timeout=30)
print('Status Code: ',response.status_code)
# Without the league page there is nothing to iterate over; stop on HTTP errors.
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, features="lxml")

# href of every anchor that has visible text, in document order.
links_with_text = [a['href'] for a in soup.find_all('a', href=True) if a.text]

# Drop everything up to and including the THIRD occurrence of
# '/leagues/NBA_<year>_adj_shooting.html'; untouched if it occurs fewer
# than three times.
shooting_link = '/leagues/NBA_' + str(year) + '_adj_shooting.html'
shooting_positions = [pos for pos, href in enumerate(links_with_text)
                      if href == shooting_link]
if len(shooting_positions) >= 3:
    links_with_text = links_with_text[shooting_positions[2] + 1:]

# Cut the list just before the first remaining '/leagues/NBA_<year>.html'.
season_link = '/leagues/NBA_' + str(year) + '.html'
if season_link in links_with_text:
    links_with_text = links_with_text[:links_with_text.index(season_link)]

# De-duplicate (order preserved), drop team links, point at the game-log pages.
links_with_text = [href.replace('.html', '/gamelog/')
                   for href in OrderedDict.fromkeys(links_with_text)
                   if not href.startswith('/teams/')]

#part 2
header_row = ['Player', 'Pos', 'Age', 'Tm', 'G', 'GS','MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']
# Skip the first <tr> (presumably the table header); rows without <td> cells
# become all-NaN rows and are dropped below.
player_names = soup.findAll('tr')[1:]
player_names_data = [[td.getText() for td in tr.findAll('td')]
                     for tr in player_names]
df = pd.DataFrame(player_names_data, columns=header_row)
df.dropna(how='all', inplace=True)
# Remove all * from HoF. regex=False: '*' is a regex metacharacter and the
# default of `regex` differs between pandas versions.
df['Player'] = df['Player'].str.replace('*', '', regex=False)
player_list = list(OrderedDict.fromkeys(df['Player'].tolist()))

# Links and names are paired by position below; zip() stops at the shorter
# list, so report a mismatch instead of silently mislabelling rows.
if len(links_with_text) != len(player_list):
    print('Warning: ', len(links_with_text), 'links vs', len(player_list),
          'player names - names may be paired with the wrong link')

time.sleep(4) #to make sure we dont get soft blocked

# Per-player game logs.
for link, player in zip(links_with_text, player_list):

    # Target web page:
    url = "https://www.basketball-reference.com" + link + str(year)

    # Establishing the connection to the web page:
    response = requests.get(url, timeout=30)

    print('Status Code for Player', ': ',response.status_code)

    # Only parse successful responses; an error page has no game-log table.
    if response.status_code == 200:
        # Pull the HTML string out of requests and convert it to a Python string.
        html = response.text

        player_soup = BeautifulSoup(html, features="lxml")

        # Text of every <td> cell, one list per <tr>.
        rows_data = [[td.getText() for td in tr.findAll('td')]
                     for tr in player_soup.findAll('tr')]

        # Data rows are everything after the first row without <td> cells.
        # This is recomputed for every player so a page with no such row can
        # never reuse the previous player's rows.
        first_empty = next((pos for pos, cells in enumerate(rows_data) if not cells), None)
        updated_rows_data = [] if first_empty is None else rows_data[first_empty + 1:]

        if updated_rows_data:
            df = pd.DataFrame(updated_rows_data, columns=header)
            df.dropna(how='all', inplace=True)
            df.reset_index(drop=True, inplace=True)
            df.insert(loc=0, column='Player', value=player)
            df.insert(loc=1, column='Year', value=year)
            df['Link'] = link + str(year)

            # Appended per player so partial results survive an interrupted run.
            master_df = pd.concat([master_df, df])
        else:
            print('No game rows found for', link)

    # Pause between requests to avoid being soft blocked.
    time.sleep(5)