In [1]:
import pandas as pd
import os
import re
import sqlite3

## Advanced Stats from 2014-24

### Load and Transform Data

In [2]:
# Build one DataFrame per season file in ./advanced_stats/ and stack them.
dfs = []

# Only these advanced-stat columns are carried forward from each season file.
stat_columns = ['Player', 'Pos', 'Age', 'G', 'MP', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']

# os.scandir yields entries in arbitrary order; sorting by name keeps the row
# order of the combined frame reproducible. The `with` block releases the
# directory handle as soon as the listing has been read.
with os.scandir('./advanced_stats/') as entries:
    stat_files = sorted(
        (entry for entry in entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

# for each file (year) in advanced_stats (each file contains a year of stats)
for filename in stat_files:
    # The first run of digits in the file name is taken as the season's starting
    # year. Only the name is searched, so digits in a parent directory are ignored.
    year_match = re.search(r'\d+', filename.name)
    if year_match is None:
        # files without a year in their name are skipped
        continue
    year = year_match.group()
    end_year = int(year) + 1

    # read the file, keep the needed statistics, and add the season columns;
    # .assign works on a fresh copy, which avoids SettingWithCopyWarning
    df = pd.read_csv(filename.path)[stat_columns].assign(
        # strip the literal '*' characters that appear in some player names
        Player=lambda frame: frame['Player'].str.replace('*', '', regex=False),
        # e.g. '2013' -> '2013-14'; the two-digit suffix is zero-padded and
        # wraps at the century ('2008-09', '1999-00')
        Season=f"{year}-{end_year % 100:02d}",
        # calendar year in which the season ends
        Season_FOR=end_year,
    )
    dfs.append(df)

if not dfs:
    raise FileNotFoundError(
        "no season files with a year in their name were found in './advanced_stats/'"
    )

# build a dataframe from each year; ignore_index gives the combined frame
# unique row labels instead of repeating each file's 0..n-1 index
all_players_2014_2024 = pd.concat(dfs, ignore_index=True)

## Player Info

### Load Data

In [3]:
from nba_api.stats.static import players

# Fetch nba_api's static player list and tabulate it, one row per player.
# Later cells rely on its id, full_name, first_name, last_name and is_active
# columns.
# NOTE(review): the presence of an is_active column suggests get_players()
# returns former players as well as current ones - confirm against nba_api.
playersList = players.get_players()
playersDF = pd.DataFrame(data=playersList)

### Transform Data; Merge with Advanced Stats

In [4]:
# Attach the nba_api player id to each stat line by matching the stats
# 'Player' name against nba_api's 'full_name'. merge defaults to an inner
# join, so stat lines without an exact name match are dropped. The name and
# activity columns from nba_api are then discarded, leaving only 'id'.
players_w_ids = (
    all_players_2014_2024
    .merge(playersDF, left_on='Player', right_on='full_name')
    .drop(columns=['full_name', 'first_name', 'last_name', 'is_active'])
)

## SQL Connection: Merge Advanced Stats with Free Agent Data + Stats

In [5]:
# Load both free-agent tables from the local SQLite database into DataFrames.
con = sqlite3.connect('./nba_data.db')
try:
    # read in tables for FA from 2010-20 and 2020-24
    fa_2010 = pd.read_sql_query("SELECT * FROM FA2010_FULL", con)
    fa_2020 = pd.read_sql_query("SELECT * FROM FA_FULL", con)
finally:
    # close the connection even if a query fails (e.g. a table is missing),
    # so the database file is not left open in the kernel
    con.close()

### Clean FA 2010-20 Dataset

In [6]:
# Attach nba_api player ids to the 2010-20 free-agent table by matching
# 'Name' against nba_api's 'full_name' (inner join: rows without an exact
# name match are dropped), discard nba_api's name/activity columns, and
# rename AVG_SALARY -> Contract and id -> ID.
# NOTE(review): the renames presumably align this table with FA_FULL's
# column names before the two are concatenated - confirm against the schema.
fa_2010_w_ids = (
    fa_2010
    .merge(playersDF, left_on='Name', right_on='full_name')
    .drop(columns=['full_name', 'first_name', 'last_name', 'is_active'])
    .rename(columns={'AVG_SALARY': 'Contract', 'id': 'ID'})
)

### Combine the two FA datasets into a single 2010-24 dataset

In [7]:
# Stack the two free-agent tables into one.
fa_total = pd.concat([fa_2010_w_ids, fa_2020])

# Join each free-agent row to the player's advanced stats for the season that
# ends in the free-agency year (Season_FOR is the season's start year + 1).
fa_cleaned = fa_total.merge(
    players_w_ids,
    left_on=['ID', 'FA_Year'],
    right_on=['id', 'Season_FOR'],
    how='left',
)

# dropna removes every row with a missing value in ANY column. That includes
# free agents with no matching stat line from the left join above, and rows
# missing a value in a column that only one of the two FA tables provides.
fa_cleaned = fa_cleaned.dropna()

# Drop the join keys and stat columns that are not kept in the output;
# 'Age_y' is the stats-side copy of the 'Age' column both inputs carry.
fa_cleaned = fa_cleaned.drop(columns=['Age_y', 'Player', 'G', 'MP', 'Season_FOR', 'id'])

# make sure the output directory exists so a fresh checkout does not fail here
os.makedirs('./cleaned_data', exist_ok=True)
fa_cleaned.to_csv('./cleaned_data/FA_FULL_cleaned.csv', index=False)

con = sqlite3.connect('./nba_data.db')
try:
    # NOTE(review): index=True writes the DataFrame index as an extra column,
    # while the CSV above omits it; after dropna the labels are not
    # contiguous. Kept as-is to preserve the existing table schema - confirm
    # whether the index column is wanted.
    fa_cleaned.to_sql('FA_FULL_cleaned', con=con, index=True, if_exists='replace')
finally:
    # close the connection even if the write fails
    con.close()