# DATA 440 Final Project
# Part II: Data Preprocessing


### 1) Imports


In [None]:
import pandas as pd

---
### 2) Functions

In [5]:
def create_target(team):
    '''
        Add a 'target' column holding the 'won' value of the following row.

        The last row has no following row, so its target is NaN.

        Parameters:
            team: DataFrame with a 'won' column. The caller is expected to
                  pass the rows of one team in playing order.

        Returns:
            A new DataFrame with the extra 'target' column. The frame passed
            in is left untouched.
    '''
    # assign() works on a copy, so the frame handed in (for example a chunk
    # produced by groupby.apply) is never modified in place.
    return team.assign(target=team['won'].shift(-1))
    
def clean_data(data):
    '''
        Clean raw NBA box score data and attach a next-game target.

        Steps:
          1. Sort the rows by season and date.
          2. Move the identifying columns to the front of the frame.
          3. Drop the columns listed in `unwanted`, every column whose name
             contains '_max' (individual max stats) and a stray
             'Unnamed: 0' column when present.
          4. Add 'target': the 'won' value of the same team's next row
             within the same season, as 1/0, or -1 when there is no next
             row (placeholder).

        Parameters:
            data: DataFrame containing at least the columns 'team', 'total',
                  'team_opp', 'total_opp', 'season', 'date', 'home' and
                  'won'.

        Returns:
            A new DataFrame. The input frame is not modified.

        Raises:
            KeyError: if one of the required columns is missing.
    '''
    front_cols = ['team', 'total', 'team_opp', 'total_opp', 'season', 'date', 'home']
    unwanted = {
        'mp', 'mp_opp', 'mp.1', 'mp_opp.1', '+/-', '+/-_opp', 'mp_max', 'mp_max.1',
        'mp_max_opp', 'mp_max_opp.1', 'usg%', 'usg%_opp', 'total', 'total_opp',
        'home_opp', 'index_opp',
    }

    data = data.sort_values(['season', 'date']).reset_index(drop=True)

    # Identifying columns first, every other column in its existing order.
    trailing_cols = [col for col in data.columns if col not in front_cols]
    data = data[front_cols + trailing_cols]

    # One pass decides what survives; columns absent from the data are simply
    # never encountered, so no existence check is needed.
    kept_cols = [
        col for col in data.columns
        if col not in unwanted and '_max' not in col and col != 'Unnamed: 0'
    ]
    data = data[kept_cols]

    # Next game's result per team and season. A column-wise groupby shift is
    # aligned on the row index, so it does not rely on groupby.apply handing
    # the grouping columns to a helper that mutates each chunk.
    next_won = data.groupby(['team', 'season'])['won'].shift(-1)

    # Going through float keeps NaN for "no next game" until it is replaced by
    # the -1 placeholder. The result is assigned explicitly because a chained
    # `data['target'].fillna(..., inplace=True)` does not reliably write back
    # to the frame.
    target = next_won.astype(float).fillna(-1).astype(int)

    return data.assign(target=target)

def create_first_cleaned_data(cleaned_data, path='../data/cleaned_nba_data.csv'):
    '''
        Establish the first cleaned dataframe of NBA data by saving it as csv.

        Any existing file at `path` is overwritten.

        Parameters:
            cleaned_data: DataFrame to store.
            path: csv file to write. Defaults to the project's cleaned
                  data file.

        Returns:
            The same `cleaned_data` object, unchanged, so the call can be
            used inline in an assignment.
    '''
    # index=False: the row index carries no information, and leaving it out
    # avoids an extra unnamed column when the file is read back.
    cleaned_data.to_csv(path, index=False)
    return cleaned_data

def add_new_data(new_data, path='../data/cleaned_nba_data.csv'):
    '''
        Add new box score data to the dataframe stored as csv.

        The stored csv is read, `new_data` is appended below it, a 1-based
        'game_id' column is (re)assigned over the combined rows, and the
        result is written back to the same file.

        Parameters:
            new_data: DataFrame of already-cleaned rows to append. It is
                      not modified.
            path: csv file to read from and write back to. Defaults to the
                  project's cleaned data file.

        Returns:
            The combined DataFrame, including 'game_id'.

        Raises:
            FileNotFoundError: if `path` does not exist yet.

        Note:
            Nothing is de-duplicated. Calling this twice with the same
            `new_data` stores those rows twice.
    '''
    stored = pd.read_csv(path)
    combined = pd.concat([stored, new_data], axis=0, ignore_index=True)

    # Number the rows of the combined frame starting at 1. An existing
    # 'game_id' column read back from the csv is overwritten.
    combined['game_id'] = combined.index + 1

    combined.to_csv(path, index=False)  # Persist the combined data as csv
    return combined

---
### 3) Execute


In [6]:
# Load the raw box score files: 2016-2022 and 2023
nba_data_2016_2022 = pd.read_csv('../data/nba_games.csv')
nba_data_2023 = pd.read_csv('../data/nba_games_2023.csv')

# Order matters below: create_first_cleaned_data (re)writes
# ../data/cleaned_nba_data.csv, and add_new_data then reads that same file,
# appends the 2023 rows, adds 'game_id' and writes it back.
nba_clean_data_2016_2022 = clean_data(nba_data_2016_2022) # Cleaned 2016-2022
nba_clean_data = create_first_cleaned_data(nba_clean_data_2016_2022) # First iteration: stores the 2016-2022 frame as csv
nba_clean_data_23 = clean_data(nba_data_2023) # Cleaned 2023 (cleaned separately from 2016-2022)
nba_clean_data = add_new_data(nba_clean_data_23) # Combine to get cleaned 2016-2023

In [8]:
# Show the combined 2016-2023 frame (a notebook cell renders its last expression)
nba_clean_data

Unnamed: 0,team,team_opp,season,date,home,fg,fga,fg%,3p,3pa,...,trb%_opp,ast%_opp,stl%_opp,blk%_opp,tov%_opp,ortg_opp,drtg_opp,won,target,game_id
0,DET,ATL,2016,2015-10-27,0,37.0,96.0,0.385,12.0,29.0,...,40.4,59.5,9.4,6.0,14.5,98.6,111.2,True,1,1
1,ATL,DET,2016,2015-10-27,1,37.0,82.0,0.451,8.0,27.0,...,59.6,62.2,5.2,5.5,12.3,111.2,98.6,False,1,2
2,NOP,GSW,2016,2015-10-27,0,35.0,83.0,0.422,6.0,18.0,...,62.9,70.7,8.0,10.8,15.9,110.9,94.9,False,0,3
3,GSW,NOP,2016,2015-10-27,1,41.0,96.0,0.427,9.0,30.0,...,37.1,60.0,9.0,4.5,15.9,94.9,110.9,True,1,4
4,CLE,CHI,2016,2015-10-27,0,38.0,94.0,0.404,9.0,29.0,...,48.5,35.1,6.0,15.4,11.8,97.5,95.5,False,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20407,DEN,MIA,2023,2023-06-07,0,41.0,80.0,0.513,5.0,18.0,...,36.3,58.8,7.7,4.8,3.8,103.8,120.4,True,1,20408
20408,DEN,MIA,2023,2023-06-09,0,39.0,79.0,0.494,14.0,28.0,...,52.1,65.7,2.3,5.9,13.9,107.2,121.9,True,1,20409
20409,MIA,DEN,2023,2023-06-09,1,35.0,78.0,0.449,8.0,25.0,...,47.9,66.7,12.4,13.2,6.4,121.9,107.2,False,0,20410
20410,DEN,MIA,2023,2023-06-12,1,38.0,84.0,0.452,5.0,28.0,...,43.6,54.5,9.4,12.5,7.2,92.6,97.8,True,-1,20411
