In [1]:
import logging
import pandas as pd
import requests
logging.basicConfig(
    level=logging.INFO,
    format="[%(levelname)s] %(asctime)s %(message)s",
    datefmt="%Y-%m-%d %I:%M:%S %p",
    handlers=[logging.FileHandler("logs/example.log"), logging.StreamHandler()],
)
logging.getLogger("requests").setLevel(logging.WARNING) 
import numpy as np
from datetime import datetime

In [2]:
def get_shooting_stats_data():
    """
    Web Scrape function w/ pandas read_html that grabs all raw shooting stats
    Args:
        None
    Returns:
        Pandas DataFrame of raw shooting stats
    """
    try:
        url = "https://www.basketball-reference.com/leagues/NBA_2022_shooting.html"
        df = pd.read_html(url)[0]
        logging.info(
            f"Shooting Stats Web Scrape Function Successful, retrieving {len(df)} rows for Shooting Stats"
        )
        return df
    except BaseException as error:
        logging.error(f"Shooting Stats Web Scrape Function Failed, {error}")
        df = []
        return df

def get_shooting_stats_transformed(df):
    """
    Web Scrape Transformation function for Shooting Stats
    Args:
        df (pandas DataFrame): The Raw Shooting Stats DF
    Returns:
        Pandas DataFrame of Transformed Shooting Stats
    """
    try:
        df.columns = df.columns.to_flat_index()
        df = df.rename(columns = {
            df.columns[1]: 'player',
            df.columns[6]: 'mp',
            df.columns[8]: 'avg_shot_distance',
            df.columns[10]: 'pct_fga_2p',
            df.columns[11]: 'pct_fga_0_3',
            df.columns[12]: 'pct_fga_3_10',
            df.columns[13]: 'pct_fga_10_16',
            df.columns[14]: 'pct_fga_16_3p',
            df.columns[15]: 'pct_fga_3p',
            df.columns[18]: 'fg_pct_0_3',
            df.columns[19]: 'fg_pct_3_10',
            df.columns[20]: 'fg_pct_10_16',
            df.columns[21]: 'fg_pct_16_3p',
            df.columns[24]: 'pct_2pfg_ast',
            df.columns[25]: 'pct_3pfg_ast',
            df.columns[27]: 'dunk_pct_tot_fg',
            df.columns[28]: 'dunks',
            df.columns[30]: 'corner_3_ast_pct',
            df.columns[31]: 'corner_3pm_pct',
            df.columns[33]: 'heaves_att',
            df.columns[34]: 'heaves_makes'
            })[['player', 'mp', 'avg_shot_distance', 'pct_fga_2p', 'pct_fga_0_3', 'pct_fga_3_10', 'pct_fga_10_16', 'pct_fga_16_3p', 'pct_fga_3p', 'fg_pct_0_3', 'fg_pct_3_10', 'fg_pct_10_16', 'fg_pct_16_3p', 'pct_2pfg_ast', 'pct_3pfg_ast', 'dunk_pct_tot_fg', 'dunks', 'corner_3_ast_pct', 'corner_3pm_pct', 'heaves_att', 'heaves_makes']]
        df = df.query('player != "Player"').copy()
        df["mp"] = pd.to_numeric(df["mp"])
        df = df.sort_values(['mp'], ascending = False).groupby('player').first().reset_index().drop('mp', axis = 1)
        df['scrape_date'] = datetime.now().date()
        logging.info(f"Shooting Stats Transformation Function Successful, retrieving {len(df)} rows")
        return df
    except BaseException as e:
        logging.info(f"Shooting Stats Transformation Function Failed, {e}")
        df = []
        return df
# def get_opp_stats_transformed(df):
#     try:
#         df = df[["Team", "FG%", "3P%", "3P", "PTS"]]
#         df = df.rename(
#             columns={
#                 df.columns[0]: "team",
#                 df.columns[1]: "fg_percent_opp",
#                 df.columns[2]: "threep_percent_opp",
#                 df.columns[3]: "threep_made_opp",
#                 df.columns[4]: "ppg_opp",
#             }
#         )
#         df = df.query('team != "League Average"')
#         df = df.reset_index(drop=True)
#         df["scrape_date"] = datetime.now().date()
#         logging.info(
#             f"Opp Stats Transformation Function Successful, retrieving {len(df)} rows for {year}-{month}-{day}"
#         )
#         return df
#     except BaseException as error:
#         logging.error(f"Opp Stats Function Failed, {error}")
#         df = []
#         return df
df = get_shooting_stats_data()
df1 = get_shooting_stats_transformed(df)

[INFO] 2022-02-22 06:02:24 PM Note: NumExpr detected 20 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
[INFO] 2022-02-22 06:02:24 PM NumExpr defaulting to 8 threads.
[INFO] 2022-02-22 06:02:24 PM Shooting Stats Web Scrape Function Successful, retrieving 763 rows for Shooting Stats
[INFO] 2022-02-22 06:02:24 PM Shooting Stats Transformation Function Successful, retrieving 592 rows


In [None]:
df['player'] = df['player'].str.replace(" Jr.", "", regex = True)
df['player'] = df['player'].str.replace(" Sr.", "", regex = True)
df['player'] = df['player'].str.replace(" II", "", regex = True)
df['player'] = df['player'].str.replace(" III", "", regex = True)
df['player'] = df['player'].str.replace(" IV", "", regex = True)

In [3]:
def clean_player_names(df: pd.DataFrame) -> pd.DataFrame:
    try:
        df['player'] = df['player'].str.replace(" Jr.", "", regex = True)
        df['player'] = df['player'].str.replace(" Sr.", "", regex = True)
        df['player'] = df['player'].str.replace(" II", "", regex = True)
        df['player'] = df['player'].str.replace(" III", "", regex = True)
        df['player'] = df['player'].str.replace(" IV", "", regex = True)
        return(df)
    except BaseException as e:
        print(f"Error Occurred with clean_player_names, {e}")

df_final = clean_player_names(df1)


In [5]:
df1.dtypes.to_dict()

{'player': dtype('O'),
 'avg_shot_distance': dtype('O'),
 'pct_fga_2p': dtype('O'),
 'pct_fga_0_3': dtype('O'),
 'pct_fga_3_10': dtype('O'),
 'pct_fga_10_16': dtype('O'),
 'pct_fga_16_3p': dtype('O'),
 'pct_fga_3p': dtype('O'),
 'fg_pct_0_3': dtype('O'),
 'fg_pct_3_10': dtype('O'),
 'fg_pct_10_16': dtype('O'),
 'fg_pct_16_3p': dtype('O'),
 'pct_2pfg_ast': dtype('O'),
 'pct_3pfg_ast': dtype('O'),
 'dunk_pct_tot_fg': dtype('O'),
 'dunks': dtype('O'),
 'corner_3_ast_pct': dtype('O'),
 'corner_3pm_pct': dtype('O'),
 'heaves_att': dtype('O'),
 'heaves_makes': dtype('O'),
 'scrape_date': dtype('O')}

In [86]:
df.to_csv('shooting_stats_raw.csv')

In [6]:
import os
fname = os.path.join(
    os.path.dirname(__file__), "fixture_csvs/shooting_stats_data.csv"
)
shooting_stats = pd.read_csv(fname)
# shooting_stats = get_transactions_transformed(shooting_stats)

NameError: name '__file__' is not defined

In [54]:
df1.columns = df1.columns.droplevel(0)

In [64]:
dict(zip(df1.columns.levels[1], ["heaves", "pct_3pa_corner", "pct_fga", "fg_pct_0_3", "fg_pct_10_16", "fg_pct_16_3p", "pct_2pfg_ast", "fg_pct_3_10", "pct_3pfg_ast", "pct_3p_corner", "age", "heave_attempts", "avg_shot_distance", "fg_pct", "gp", "mp", "player", "pos", "rk", "team", "fg1", "fg2", "fg3", "fg4", "fg5", "fg6"]))

{'#': 'heaves',
 '%3PA': 'pct_3pa_corner',
 '%FGA': 'pct_fga',
 '0-3': 'fg_pct_0_3',
 '10-16': 'fg_pct_10_16',
 '16-3P': 'fg_pct_16_3p',
 '2P': 'pct_2pfg_ast',
 '3-10': 'fg_pct_3_10',
 '3P': 'pct_3pfg_ast',
 '3P%': 'pct_3p_corner',
 'Age': 'age',
 'Att.': 'heave_attempts',
 'Dist.': 'avg_shot_distance',
 'FG%': 'fg_pct',
 'G': 'gp',
 'MP': 'mp',
 'Player': 'player',
 'Pos': 'pos',
 'Rk': 'rk',
 'Tm': 'team',
 'Unnamed: 16_level_1': 'fg1',
 'Unnamed: 23_level_1': 'fg2',
 'Unnamed: 26_level_1': 'fg3',
 'Unnamed: 29_level_1': 'fg4',
 'Unnamed: 32_level_1': 'fg5',
 'Unnamed: 9_level_1': 'fg6'}

In [70]:
# df_new = df1.rename(columns = {
#     df.columns[1]: 'player',
#     df.columns[12]: 'pct_fga_2p',
#     df.columns[13]: 'pct_fga_0_3',
#     df.columns[14]: 'pct_fga_3_10',
#     df.columns[15]: 'pct_fga_10_16',
#     df.columns[16]: 'pct_fga_16_3p',
#     df.columns[17]: 'pct_fga_3p',
#     df.columns[20]: 'fg_pct_0_3',
#     df.columns[21]: 'fg_pct_3_10',
#     df.columns[22]: 'fg_pct_10_16',
#     df.columns[23]: 'fg_pct_16_3p'
#     }, level = 1)
# print(df.columns.levels[1])
# print(df_new.columns.levels[0])
# print(df_new.columns)
# df_new
print(df1.columns.levels[1])
print(df1.columns.levels[1])
df1.columns = df1.columns.to_flat_index()
df1
# d = dict(zip(df1.columns.levels[1], ["heaves", "pct_3pa_corner", "pct_fga", "fg_pct_0_3", "fg_pct_10_16", "fg_pct_16_3p", "pct_2pfg_ast", "fg_pct_3_10", "pct_3pfg_ast", "pct_3p_corner", "age", "heave_attempts", "avg_shot_distance", "fg_pct", "gp", "mp", "player", "pos", "rk", "team", "fg1", "fg2", "fg3", "fg4", "fg5", "fg6"]))
# df = df1.rename(columns=d, level=1)
# df.columns = df.columns.droplevel(0)
# print(df1.columns.levels[1])
# df["mp"] = pd.to_numeric(df["mp"])
# df = df.sort_values(['mp'], ascending = False).groupby('player').first().reset_index()

AttributeError: 'Index' object has no attribute 'levels'

In [71]:
df1

Unnamed: 0,"(Unnamed: 0_level_0, Rk)","(Unnamed: 1_level_0, Player)","(Unnamed: 2_level_0, Pos)","(Unnamed: 3_level_0, Age)","(Unnamed: 4_level_0, Tm)","(Unnamed: 5_level_0, G)","(Unnamed: 6_level_0, MP)","(Unnamed: 7_level_0, FG%)","(Unnamed: 8_level_0, Dist.)","(Unnamed: 9_level_0, Unnamed: 9_level_1)",...,"(% of FG Ast'd, 3P)","(Unnamed: 26_level_0, Unnamed: 26_level_1)","(Dunks, %FGA)","(Dunks, #)","(Unnamed: 29_level_0, Unnamed: 29_level_1)","(Corner 3s, %3PA)","(Corner 3s, 3P%)","(Unnamed: 32_level_0, Unnamed: 32_level_1)","(Heaves, Att.)","(Heaves, #)"
0,1,Precious Achiuwa,C,22,TOR,48,1107,.423,8.3,,...,1.000,,.180,49,,.407,.292,,0,0
1,2,Steven Adams,C,28,MEM,56,1464,.546,2.7,,...,,,.128,31,,,,,0,0
2,3,Bam Adebayo,C,24,MIA,34,1134,.524,7.2,,...,,,.172,71,,.000,,,2,0
3,4,Santi Aldama,PF,21,MEM,27,277,.381,12.6,,...,1.000,,.093,9,,.231,.333,,0,0
4,5,LaMarcus Aldridge,C,36,BRK,39,894,.559,12.3,,...,1.000,,.037,15,,.308,.333,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
758,588,Thaddeus Young,PF,33,TOR,2,33,.500,8.4,,...,,,.000,0,,.500,.000,,0,0
759,589,Trae Young,PG,23,ATL,53,1824,.456,16.7,,...,.228,,.000,0,,.032,.462,,1,0
760,590,Omer Yurtseven,C,23,MIA,42,593,.538,5.6,,...,1.000,,.188,34,,.286,.000,,0,0
761,591,Cody Zeller,C,29,POR,27,355,.567,3.7,,...,,,.211,16,,.250,.000,,0,0


In [None]:
df = df1.rename(columns = {
{'#': 'heaves',
 '%3PA': 'pct_3pa_corner',
 '%FGA': 'pct_fga',
 '0-3': 'fg_pct_0_3',
 '10-16': 'fg_pct_10_16',
 '16-3P': 'fg_pct_16_3p',
 '2P': 'pct_2pfg_ast',
 '3-10': 'fg_pct_3_10',
 '3P': 'pct_3pfg_ast',
 '3P%': 'pct_3p_corner',
 'Age': 'age',
 'Att.': 'heave_attempts',
 'Dist.': 'avg_shot_distance',
 'FG%': 'fg_pct',
 'G': 'gp',
 'MP': 'mp',
 'Player': 'player',
 'Pos': 'pos',
 'Rk': 'rk',
 'Tm': 'team',
 'Unnamed: 16_level_1': 'fg1',
 'Unnamed: 23_level_1': 'fg2',
 'Unnamed: 26_level_1': 'fg3',
 'Unnamed: 29_level_1': 'fg4',
 'Unnamed: 32_level_1': 'fg5',
 'Unnamed: 9_level_1': 'fg6'}
}, levels = 1)

In [15]:
df = df1.droplevel(0, axis = 1)
df = df.sort_values([df.columns[1], df.columns[6]], ascending = False).groupby([df.columns[1]], as_index = False).first().reset_index().drop("index", axis =1)
df2 = df[['Player', '0-3']]

In [80]:
# df = get_shooting_stats_data()
# df = df1.droplevel(df1.index[0])
# df = df.sort_values([df.columns[1], df.columns[6]], ascending = False).groupby([df.columns[1]], as_index = False).first().reset_index().drop("index", axis =1)
# df.columns = df.columns.str.lower()
# print(df.columns)
df = df1.copy()
df = df.rename(columns = {
    df.columns[1]: 'player',
    df.columns[6]: 'mp',
    df.columns[8]: 'avg_shot_distance',
    df.columns[10]: 'pct_fga_2p',
    df.columns[11]: 'pct_fga_0_3',
    df.columns[12]: 'pct_fga_3_10',
    df.columns[13]: 'pct_fga_10_16',
    df.columns[14]: 'pct_fga_16_3p',
    df.columns[15]: 'pct_fga_3p',
    df.columns[18]: 'fg_pct_0_3',
    df.columns[19]: 'fg_pct_3_10',
    df.columns[20]: 'fg_pct_10_16',
    df.columns[21]: 'fg_pct_16_3p',
    df.columns[24]: 'pct_2pfg_ast',
    df.columns[25]: 'pct_3pfg_ast',
    df.columns[27]: 'dunk_pct_tot_fg',
    df.columns[28]: 'dunks',
    df.columns[30]: 'corner_3_ast_pct',
    df.columns[31]: 'corner_3pm_pct',
    df.columns[33]: 'heaves_att',
    df.columns[34]: 'heaves_makes'
    })[['player', 'mp', 'avg_shot_distance', 'pct_fga_2p', 'pct_fga_0_3', 'pct_fga_3_10', 'pct_fga_10_16', 'pct_fga_16_3p', 'pct_fga_3p', 'fg_pct_0_3', 'fg_pct_3_10', 'fg_pct_10_16', 'fg_pct_16_3p', 'pct_2pfg_ast', 'pct_3pfg_ast', 'dunk_pct_tot_fg', 'dunks', 'corner_3_ast_pct', 'corner_3pm_pct', 'heaves_att', 'heaves_makes']]
df = df.query('player != "Player"')
df["mp"] = pd.to_numeric(df["mp"])
df = df.sort_values(['mp'], ascending = False).groupby('player').first().reset_index()
# # df.columns = df.columns.droplevel(0)
# print(df.columns) # [['player', 'pct_fga_2p', 'pct_fga_0_3', 'pct_fga_3_10', 'pct_fga_10_16', 'pct_fga_16_3p', 'pct_fga_3p', 'fg_pct_0_3', 'fg_pct_3_10', 'fg_pct_10_16', 'fg_pct_16_3p']]
# # df.columns = df.columns.droplevel(0)
# # "['pct_fga_0_3', 'pct_fga_10_16', 'pct_fga_3_10']
# print(df.columns[13])
# print(df.columns[17])

In [6]:
df2 = df.copy().reset_index()
print(df2.columns)
df2 = df2[['player', 'pct_fga_2p', 'pct_fga_0_3', 'pct_fga_3_10', 'pct_fga_10_16', 'pct_fga_16_3p', 'pct_fga_3p', 'fg_pct_0_3', 'fg_pct_3_10', 'fg_pct_10_16', 'fg_pct_16_3p']]

Index(['index', 'player', 'rk', 'pos', 'age', 'tm', 'g', 'mp', 'fg%', 'dist.',
       'unnamed: 9_level_1', 'pct_fga_3p', '0-3', 'pct_fga_2p', 'fg_pct_0_3',
       'fg_pct_3_10', 'fg_pct_10_16', 'pct_fga_16_3p', 'pct_fga_3p', '0-3',
       'pct_fga_2p', 'fg_pct_0_3', 'fg_pct_3_10', 'fg_pct_10_16',
       'fg_pct_16_3p', 'pct_fga_3p', 'fg_pct_10_16', 'unnamed: 26_level_1',
       '%fga', '#', 'unnamed: 29_level_1', '%3pa', '3p%',
       'unnamed: 32_level_1', 'att.', '#'],
      dtype='object')


KeyError: "['pct_fga_0_3', 'pct_fga_10_16', 'pct_fga_3_10'] not in index"