# Sports Analytics Final Project
### John Eastman & Jack Flores

# Does Defense Still Win Championships?

### Offense Stats in Finals -- creating figures

1. Differential in shooting between winner and loser
2. Differential in true shooting between winner and loser
3. Differential in 3p shooting between winner and loser
4. Number of 3-pointers attempted over time in finals

In [1]:
import pandas as pd
import requests, re, math
from bs4 import BeautifulSoup, Comment
import time
import matplotlib.pyplot as plt
import numpy as np

# globals
# Seasons to scrape: 1998 through 2023 inclusive (the range end is exclusive).
YEARS = list(range(1998, 2024))
# Site root that relative hrefs scraped from the pages are appended to.
PRELINK = "https://www.basketball-reference.com"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
def make_link(year):
    """Build the basketball-reference playoffs page URL for one season.

    Parameters
    ----------
    year
        Value interpolated into the ``NBA_{year}.html`` part of the URL.

    Returns
    -------
    str
        The full URL of that season's playoffs page.
    """
    playoffs_url = f"https://www.basketball-reference.com/playoffs/NBA_{year}.html"
    return playoffs_url

### Get the links to all Finals series from all years in YEARS

In [5]:
# Collect one "Series Stats" URL per season. Entries are appended in YEARS
# order, and the stats-scraping cell pairs FINALS with YEARS by position, so
# any failure raises instead of skipping a season (skipping would misalign them).
FINALS = []
for year in YEARS:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(make_link(year), timeout=30)
    # Fail loudly on an HTTP error status instead of parsing an error page.
    page.raise_for_status()
    time.sleep(2)  # wait 2 seconds between consecutive requests
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find("table", {"id": "all_playoffs"})
    if table is None:
        raise RuntimeError(f"No 'all_playoffs' table found on the {year} playoffs page")
    # find() returns the first matching anchor in the table.
    # NOTE(review): presumably the Finals series is listed first - confirm.
    anchor = table.find('a', string='Series Stats')
    if anchor is None:
        raise RuntimeError(f"No 'Series Stats' link found on the {year} playoffs page")
    FINALS.append(PRELINK + anchor['href'])

In [75]:
finals_data = []  # not used in this cell; kept in case another cell references it
stats_columns = ['Player', 'Age', 'G', 'MP', 'FG', 'FGA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '3P%', 'FT%', 'MPPerGame', 'PTSPerGame', 'TRBPerGame', 'ASTPerGame', 'STLPerGame', 'BLKPerGame', 'year']


def _table_to_frame(table, year):
    """Turn one parsed stats <table> into a DataFrame with a ``year`` column.

    Parameters
    ----------
    table : bs4 Tag
        The ``<table>`` element to read.
    year : int
        Season label written to the ``year`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per kept ``<tr>``; cell values are the raw cell text.
    """
    headers = [th.text for th in table.find_all('th')]
    # Keep only rows that contain <td> cells; rows made purely of <th> come back empty.
    rows = [[td.text for td in tr.find_all('td')] for tr in table.find_all('tr')]
    rows = [cells for cells in rows if cells]
    # NOTE(review): the first <td> row is skipped, exactly as before. Confirm this
    # row is not a real player line that should be kept.
    body = rows[1:]

    # First try the wider layout: 22 scraped header names plus six explicit
    # per-game names (the explicit names keep the column labels unique).
    wide_headers = headers[5:27] + [
        "MPPerGame", "PTSPerGame", "TRBPerGame", "ASTPerGame", "STLPerGame", "BLKPerGame",
    ]
    try:
        frame = pd.DataFrame(body, columns=wide_headers)
    except ValueError:
        # Column count did not match the row width: fall back to the 18-column slice.
        frame = pd.DataFrame(body, columns=headers[4:22])
    frame["year"] = year
    return frame


# Seasons are attached by position, so the two lists must line up one-to-one.
if len(FINALS) != len(YEARS):
    raise ValueError(
        f"FINALS has {len(FINALS)} entries but YEARS has {len(YEARS)}; "
        "re-run the link-collection cell before scraping"
    )

# Frames are gathered in lists and concatenated once after the loop instead of
# re-concatenating the growing result on every iteration.
stats_frames = []
advanced_frames = []

for year, final in zip(YEARS, FINALS):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    page = requests.get(final, timeout=30)
    # Fail loudly on an HTTP error status instead of parsing an error page.
    page.raise_for_status()
    time.sleep(2)  # wait 2 seconds between consecutive requests
    soup = BeautifulSoup(page.content, "html.parser")
    # The tables of interest sit inside HTML comments, so search comment nodes.
    comments = soup.find_all(string=lambda text: isinstance(text, Comment) and 'sortable stats_table' in text)

    # NOTE(review): the first matching comment is skipped, exactly as before -
    # confirm what that first table contains.
    for comment in comments[1:]:
        # Parse the content of the comment as HTML and extract the table
        comment_soup = BeautifulSoup(comment, 'html.parser')
        table = comment_soup.find('table', {'class': 'sortable stats_table'})
        if table is None:
            raise RuntimeError(f"Comment on {final} did not contain a 'sortable stats_table' table")

        df = _table_to_frame(table, year)

        # A frame as wide as stats_columns is a basic stats table; anything else
        # is treated as advanced stats.
        if df.shape[1] == len(stats_columns):
            print("Got a stats")
            stats_frames.append(df)
        else:
            print("Got advanced stats")
            advanced_frames.append(df)

# Reverse before concatenating so the most recently scraped table comes first,
# matching the row order the previous prepend-style concatenation produced.
stats = (
    pd.concat(stats_frames[::-1], ignore_index=True)
    if stats_frames
    else pd.DataFrame(columns=stats_columns)
)
advanced_stats = (
    pd.concat(advanced_frames[::-1], ignore_index=True)
    if advanced_frames
    else pd.DataFrame()
)
            

Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a stats
Got advanced stats
Got a st

In [76]:
# Write both scraped frames to CSV (without the index column), advanced stats
# first, then print the first rows of the advanced stats frame.
csv_exports = (
    (advanced_stats, "../data/Finals_Advanced_Stats.csv"),
    (stats, "../data/Finals_Stats.csv"),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)
print(advanced_stats.head())

            Player Age  G GS   MP   TS%  eFG% ORB%  DRB% TRB%  AST% STL% BLK%  \
0     Jimmy Butler  33  5  5  205  .511  .451  4.8   9.1  6.7  27.5  1.2  1.3   
1     Gabe Vincent  26  5  5  142  .494  .473  0.7   0.9  0.8  14.4  1.9  0.7   
2       Kyle Lowry  36  5  0  146  .614  .563  2.0  14.2  7.3  26.5  1.8  0.6   
3  Duncan Robinson  28  5  0   94  .641  .650  0.0   7.7  3.4  14.9  0.6  1.8   
4     Caleb Martin  27  5  1  142  .453  .438  4.9  14.7  9.1   7.0  1.9  2.0   

   TOV%  USG% ORtg DRtg  GmSc  year  
0   5.4  25.2  114  118  17.6  2023  
1   6.5  20.1   96  119   6.5  2023  
2  17.0  16.7  115  115   9.6  2023  
3  11.6  16.9  107  119   5.0  2023  
4  12.8  15.2   89  114   4.7  2023  
