# Sports Analytics Final Project
### John Eastman & Jack Flores

# Does Defense Still Win Championships?

### Offense Stats in Finals -- creating figures

1. Differential in shooting between winner and loser
2. Differential in true shooting between winner and loser
3. Differential in 3p shooting between winner and loser
4. Number of 3-pointers attempted over time in finals

In [1]:
import pandas as pd
import requests, re, math
from bs4 import BeautifulSoup, Comment
import time
import matplotlib.pyplot as plt
import numpy as np

# globals
# 1991 to 2023
YEARS = [y for y in range(1998, 2024)]
PRELINK = "https://www.basketball-reference.com"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def make_link (year):
    return f"https://www.basketball-reference.com/playoffs/NBA_{year}.html"

### Get the links to all Finals series from all years in YEARS

In [5]:
FINALS = []
for year in YEARS:
    page = requests.get(make_link(year))
    time.sleep(2)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find("table", {"id": "all_playoffs"})
    link = table.find('a', string='Series Stats')['href']
    finals = PRELINK + link
    FINALS.append(finals)

In [69]:
finals_data = []
advanced_stats = pd.DataFrame()
stats = pd.DataFrame()

for i, final in enumerate(FINALS):
    page = requests.get(final)
    time.sleep(2)
    soup = BeautifulSoup(page.content, "html.parser")
    comments = soup.find_all(string=lambda text: isinstance(text, Comment) and 'sortable stats_table' in text)
    comments = comments[1:]
    
    # Parse the content of the comment as HTML
    if comments:
        for comment in comments:
            comment_soup = BeautifulSoup(comment, 'html.parser')
            
            # Extract the table
            table = comment_soup.find('table', {'class': 'sortable stats_table'})

            # Extract table headers
            headers = [header.text for header in table.find_all('th')]
            # Extract the headers we actually want (bs4 doesn't get it exact)
            new_headers = headers[5:27]
            new_headers += ["MPPerGame", "PTSPerGame", "TRBPerGame", "ASTPerGame", "STLPerGame", "BLKPerGame"]

            # Extract table data
            data = []
            for row in table.find_all('tr'):
                row_data = []
                for cell in row.find_all(['td']):
                    row_data.append(cell.text)
                if row_data != []:
                    data.append(row_data)
            print(new_headers)
            try:
                df = pd.DataFrame(data[1:], columns=new_headers, index=None)
            except ValueError:
                new_headers = headers[4:22]
                df = pd.DataFrame(data[1:], columns=new_headers, index=None)
            df["year"] = YEARS[i]
            
            
            try:
                stats = pd.concat([df.reset_index(drop=True), stats.reset_index(drop=True)], ignore_index=True, axis=0)
            except ValueError:
                advanced_stats = pd.concat([df.reset_index(drop=True), advanced_stats.reset_index(drop=True)], ignore_index=True, axis=0)
            print(df)


['Player', 'Age', 'G', 'MP', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '3P%', 'FT%', 'MPPerGame', 'PTSPerGame', 'TRBPerGame', 'ASTPerGame', 'STLPerGame', 'BLKPerGame']
             Player Age  G    MP   FG  FGA  3P 3PA   FT  FTA  ...   FG%   3P%  \
0    Scottie Pippen  32  6   237   34   83   6  26   20   24  ...  .410  .231   
1        Toni Kukoč  29  6   222   38   76   7  23    8   13  ...  .500  .304   
2        Ron Harper  34  6   173   12   33   1   6    7   12  ...  .364  .167   
3       Luc Longley  29  6   131   12   27   0   0    6    8  ...  .444         
4        Steve Kerr  32  6   125    7   20   5  13    4    4  ...  .350  .385   
5     Scott Burrell  27  6    83    9   22   1   4    2    3  ...  .409  .250   
6     Dennis Rodman  36  6   182    6   13   0   0    8   12  ...  .462         
7      Jud Buechler  29  6    30    3    5   2   3    0    0  ...  .600  .667   
8   Bill Wennington  34  3    13    2

In [70]:
advanced_stats.to_csv("../data/Finals_Advanced_Stats.csv", index=False)
stats.to_csv("../data/Finals_Stats.csv", index=False)
print(advanced_stats.head())

Empty DataFrame
Columns: []
Index: []


In [65]:
totals = ['Player', 'Age', 'G', 'MP', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '3P%', 'FT%', 'MP', 'PTS', 'TRB', 'AST', 'STL', 'BLK']
totals = totals[:22]
print(totals)
["MPPerGame", "PTSPerGame", "TRBPerGame", "ASTPerGame", "STLPerGame", "BLKPerGame"]

['Player', 'Age', 'G', 'MP', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '3P%', 'FT%']


['MPPerGame',
 'PTSPerGame',
 'TRBPerGame',
 'ASTPerGame',
 'STLPerGame',
 'BLKPerGame']