In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

from tqdm.notebook import tqdm
# from sklearn.preprocessing import MinMaxScaler, StandardScaler

import matplotlib.style as style
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever spelling this installation ships.
style.use(next(
    (name for name in ('seaborn-v0_8-colorblind', 'seaborn-colorblind')
     if name in style.available),
    'seaborn-colorblind',  # neither found: keep the original request so the error is unchanged
))

%matplotlib inline

#### Note:
- I prefer to encapsulate everything within a function rather than have variables within just the notebook.
- While I understand that this takes away from some of the perks of the sandbox environment of Jupyter Notebooks, I feel that maintaining the scope of data/variables in this way prevents issues further down the road, such as
    - conflicting variables/variable names,
    - unintended changes,
    - functions confusing which data to use
    - mutability

In [2]:
def raw_data_path() -> str:
    """Return the location of the raw season CSV as a '/'-joined string.

    The directory part is the current working directory with every occurrence
    of the substring 'src' replaced by 'data'; the file name is fixed.
    """
    data_dir = os.getcwd().replace('src', 'data')
    return f'{data_dir}/season-data-raw.csv'

In [3]:
def load_raw() -> pd.DataFrame:
    """Read the CSV located at raw_data_path() into a new DataFrame."""
    source_csv = raw_data_path()
    return pd.read_csv(source_csv)

### Get basic overview

In [4]:
def data_overview():
    """Load the raw data and return the result of calling DataFrame.info() on it."""
    raw_df = load_raw()
    return raw_df.info()

In [5]:
# Show the DataFrame.info() summary of the raw data.
data_overview()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28448 entries, 0 to 28447
Data columns (total 48 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      28448 non-null  object 
 1   Name      28436 non-null  object 
 2   Team      28448 non-null  object 
 3   Opp       28448 non-null  object 
 4   FPTS      28448 non-null  float64
 5   MP        28448 non-null  float64
 6   FPTS/MP   28448 non-null  float64
 7   Home      28448 non-null  int64  
 8   W         28448 non-null  object 
 9   W_PTS     28448 non-null  int64  
 10  L         28448 non-null  object 
 11  L_PTS     28448 non-null  int64  
 12  Injury    28448 non-null  int64  
 13  Starter   28448 non-null  int64  
 14  FG        28448 non-null  float64
 15  FGA       28448 non-null  float64
 16  FG_perc   28448 non-null  float64
 17  3P        28448 non-null  float64
 18  3PA       28448 non-null  float64
 19  3P_perc   28448 non-null  float64
 20  FT        28448 non-null  fl

**As one can see, there are 48 columns and 28448 rows. Since I scraped this data myself, there should not be many missing values, except on the rare occasion when the scraper adds a blank space for a name after injuries and then fills in the stats as if it were a player who DNP (did not play) that game.**
<br>
<br>
Therefore, I know I need to drop these rows from the dataset. I will write a basic function to check if there are missing data or not.

In [6]:
def no_missing(df) -> bool:
    """Return True when `df` holds no missing values, False otherwise."""
    # Build the element-wise NaN mask and check whether any cell is flagged.
    has_gap = df.isna().to_numpy().any()
    return not has_gap

In [7]:
# Check whether the raw data is free of missing values.
no_missing( load_raw() )

False

#### clean_data():
 - load the raw data 
 - do some basic cleaning, including dropping rows with missing values
 - save as a new file 'season-data-clean.csv' to use for the rest of the notebook

In [8]:
def clean_data_path() -> str:
    """Return the path of the cleaned CSV, a sibling of the raw CSV.

    The result is raw_data_path() with 'raw' replaced by 'clean' in the file
    name only. The directory part is left untouched, so a working directory
    that happens to contain the substring 'raw' is not rewritten.
    """
    # raw_data_path() joins with '/', so the last '/' separates directory and file name.
    directory, separator, filename = raw_data_path().rpartition('/')
    return directory + separator + filename.replace('raw', 'clean')

In [9]:
def clean_data() -> None:
    """Drop incomplete rows from the raw data and save the result as CSV.

    Loads the raw data, keeps only the rows that have no missing value in any
    column, and writes them to clean_data_path() without the index column.
    """
    complete_rows: pd.DataFrame = load_raw().dropna()
    complete_rows.to_csv(clean_data_path(), index=False)

In [10]:
def load_clean() -> pd.DataFrame:
    """Read the CSV located at clean_data_path() into a new DataFrame."""
    cleaned_csv = clean_data_path()
    return pd.read_csv(cleaned_csv)

#### Final check that all cleaning tasks came out as desired
- No missing values in any rows
- Columns all lowercase and any special formatting taken care of
- Only specified columns for now

In [11]:
def final_check() -> None:
    """Assert that the saved clean data passes the cleaning checks.

    The only check at present is that no_missing() holds for the clean file;
    assertions for any further cleaning steps belong here as well.

    Raises:
        AssertionError: if the clean data contains a missing value.
    """
    cleaned = load_clean()
    assert no_missing(cleaned)

In [12]:
# Regenerate the clean CSV first: final_check() reads it from disk, and nothing
# earlier in the notebook writes it, so a fresh run would otherwise fail or
# validate a stale file left over from a previous session.
clean_data()
final_check()