# DATA 440 Final Project
# Part I: Data Preprocessing

Columns to Drop:
1. mp.1 = Copy of mp, which is the total number of minutes played by each player
2. +/- = Individual stat; all values will be NaN
3. mp_max and mp_max.1
4. usg% = individual statistic; all values will be 100% (1)
5. fg%_max = individual statistic; there is almost always a player on each team who goes 1/1 on the evening, so it doesn't impact the outcome of the game, imo

---


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as plt

In [4]:
def create_target(team):
    '''
        Add a 'target' column to a single team's rows and return the same dataframe.

        team = A dataframe with a 'won' column, one row per game

        Each row's 'target' is the 'won' value of the row directly below it;
        the last row has no following row, so its 'target' is NaN.
        The dataframe passed in is modified in place.
    '''
    upcoming_result = team['won'].shift(periods=-1)  # pull every value up by one row
    team['target'] = upcoming_result
    return team
    
def clean_data(data,
               cols_to_drop,
               output_path='../data/cleaned_nba_data.csv'
               ):
    '''
        This function takes the NBA box score data as the input and 
        returns a cleaned-up dataframe.
        
        data = Dataframe of box scores; it must contain the columns 'team', 'total',
               'team_opp', 'total_opp', 'season', 'date', 'home' and 'won'
        cols_to_drop = A list of strings, representing the columns needed to be dropped from the dataframe
        output_path = Path of the CSV file the cleaned data is exported to. Defaults to
                      '../data/cleaned_nba_data.csv'; pass None to skip the export.
        
        The returned dataframe is sorted by 'date', has a fresh 0..n-1 index, starts with
        the team/score/date columns, and has a new integer 'target' column: the value of
        'won' (1 or 0) from the same team's next row, or -1 on a team's last row.
        The dataframe passed in is not modified.
        
        Note: This function does include playoff games
    '''
    # sort_values returns a new dataframe, so the caller's dataframe is left untouched
    data = data.sort_values('date', ascending=True)

    # Move team name columns to beginning of dataframe
    cols_to_move = ['team','total','team_opp','total_opp','season','date','home']
    other_cols = [col for col in data.columns if col not in cols_to_move]
    data = data[cols_to_move + other_cols]

    # Drop unnecessary columns
    data = data.drop(cols_to_drop, axis=1)

    data = data.reset_index(drop=True) # Reset index of dataframe

    # Add new target: each team's result in its next game.
    # A grouped shift is used instead of groupby.apply so the result does not depend on
    # the grouping column being handed to an applied function.
    next_won = data.groupby('team')['won'].shift(-1)
    # Assign the filled column back instead of calling fillna(inplace=True) on a column
    # selection, which is not guaranteed to update the dataframe itself.
    data['target'] = next_won.fillna(-1).astype(int) # use placeholder of -1 for now

    # The saved-index column only exists when the CSV was written with its index
    data = data.drop('Unnamed: 0', axis=1, errors='ignore')

    if output_path is not None:
        data.to_csv(output_path, index=False) # Export cleaned data
    return data

In [3]:
# Load the raw box score data
nba_data_2016_2022 = pd.read_csv('../data/nba_games.csv')

# Columns removed during cleaning
# NOTE(review): the notes at the top of the notebook also list fg%_max, which is not
# dropped here, while usg%_opp is dropped but not listed there - confirm which is intended
dropped_columns = [
    'mp.1',
    '+/-',
    'mp_max',
    'mp_max.1',
    'usg%',
    'usg%_opp',
]

# Clean the data (this also exports the cleaned CSV) and display the result
nba_clean_data = clean_data(nba_data_2016_2022, dropped_columns)
nba_clean_data

Unnamed: 0,team,total,team_opp,total_opp,season,date,home,mp,fg,fga,...,ast%_max_opp,stl%_max_opp,blk%_max_opp,tov%_max_opp,usg%_max_opp,ortg_max_opp,drtg_max_opp,home_opp,won,target
0,NOP,95,GSW,111,2016,2015-10-27,0,240.0,35.0,83.0,...,50.1,19.1,7.9,69.4,43.7,206.0,104.0,1,False,0
1,CLE,95,CHI,97,2016,2015-10-27,0,240.0,38.0,94.0,...,30.3,2.8,14.0,53.2,34.6,162.0,104.0,1,False,1
2,CHI,97,CLE,95,2016,2015-10-27,1,240.0,37.0,87.0,...,31.2,2.8,18.5,30.4,29.0,138.0,105.0,0,True,1
3,GSW,111,NOP,95,2016,2015-10-27,1,240.0,41.0,96.0,...,43.4,5.3,6.3,37.5,38.9,201.0,120.0,0,True,1
4,ATL,94,DET,106,2016,2015-10-27,1,240.0,37.0,82.0,...,35.6,3.2,4.7,33.3,23.6,132.0,104.0,0,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17767,BOS,97,GSW,107,2022,2022-06-10,1,240.0,34.0,85.0,...,30.2,6.1,10.3,42.9,36.3,133.0,112.0,0,False,0
17768,GSW,104,BOS,94,2022,2022-06-13,1,240.0,41.0,88.0,...,100.0,2.3,12.4,45.0,94.4,300.0,112.0,0,True,1
17769,BOS,94,GSW,104,2022,2022-06-13,0,240.0,31.0,75.0,...,59.5,5.7,7.6,33.3,36.2,222.0,107.0,1,False,0
17770,BOS,90,GSW,103,2022,2022-06-16,1,240.0,34.0,80.0,...,35.7,7.9,6.3,33.3,31.5,186.0,111.0,0,False,-1
