### MDSI Advanced Machine Learning Applications

Student: Ivan Cheung

Assignment: Kaggle 1


GitHub repo: https://github.com/ivanutsmdsi/amla2023

In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification, load_breast_cancer
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder   ## for normalising the feature data
from sklearn.model_selection import train_test_split              ## making test/train splits, for producing AUROC score?
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as pyplot
from joblib import dump


In [6]:
import imblearn

In [7]:
## Read the raw training CSV into a dataframe (low_memory=False is passed through to pandas)
df = pd.read_csv(filepath_or_buffer='../data/raw/train.csv', low_memory=False)

In [9]:
## PREPROCESSING FUNCTIONS

## create a function to remove the corrupt columns and return a clean df
def remove_unwanted_cols(df):
    """Return a new dataframe without the columns excluded from modelling.

    Columns removed, and why each one is excluded:
      - ht:   player height data has been corrupted
      - yr:   player year details are descriptive; need vectorising in future modelling
      - num:  player number is not relevant as a feature
      - type: metadata field, not relevant as a feature
      - year: no longitudinal inclusion for feature modelling
      - ftr:  no description given, removed from feature analysis
      - pfr:  no description given, removed from feature analysis

    The input frame is left untouched. With pandas' default ``errors='raise'``
    a ``KeyError`` is raised if any of the listed columns is missing.
    """
    unwanted = ["ht", "yr", "num", "type", "year", "ftr", "pfr"]
    return df.drop(columns=unwanted)

## fill null values with 0
## TO IMPROVE: not all columns should be filled with 0. Will need to review this moving forward...
def fillna(df):
    """Return a new dataframe in which every missing value of ``df`` is replaced by 0."""
    return df.fillna(value=0)


In [41]:
## Create StandardScaler and save to models
def scaler(df, num_cols, model_path='../models/scaler.joblib'):
    """Standardise the numeric columns of ``df`` and persist the fitted scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing (at least) the columns named in ``num_cols``.
    num_cols : list of str
        Names of the columns to scale.
    model_path : str, optional
        Where the fitted ``StandardScaler`` is written with ``joblib.dump``.
        Defaults to the original hard-coded location.

    Returns
    -------
    pandas.DataFrame
        The scaled values, with ``num_cols`` as columns and the same index as
        ``df``.
    """
    # Use a distinct local name so the fitted object does not shadow this function.
    std_scaler = StandardScaler()
    scaled_values = std_scaler.fit_transform(df[num_cols])

    # fit_transform returns a bare array; re-attach df's index so that a later
    # pd.concat(axis=1) lines rows up correctly even when df is not 0..n-1 indexed.
    features_df = pd.DataFrame(scaled_values, columns=num_cols, index=df.index)

    dump(std_scaler, model_path)
    return features_df

## Create OHE and save to models
def OneHotEncode(df, cat_cols, model_path='../models/ohe.joblib'):
    """One-hot encode the categorical columns of ``df`` and persist the fitted encoder.

    The encoder is created with ``drop='first'`` (the first category of each
    column is dropped) and ``sparse_output=False`` (a dense array is produced).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing (at least) the columns named in ``cat_cols``.
    cat_cols : list of str
        Names of the columns to encode.
    model_path : str, optional
        Where the fitted ``OneHotEncoder`` is written with ``joblib.dump``.
        Defaults to the original hard-coded location.

    Returns
    -------
    pandas.DataFrame
        The encoded indicator columns, named by ``get_feature_names_out()``,
        with the same index as ``df``.
    """
    ohe = OneHotEncoder(sparse_output=False, drop='first')
    encoded = ohe.fit_transform(df[cat_cols])

    # Re-attach df's index: the encoder returns a bare array, and a fresh
    # RangeIndex would misalign rows in a later pd.concat(axis=1) whenever df
    # does not carry the default 0..n-1 index.
    features_df = pd.DataFrame(
        encoded,
        columns=ohe.get_feature_names_out(),
        index=df.index,
    )

    dump(ohe, model_path)
    return features_df


In [28]:
## Preprocessing dataframe

# Chain the two cleaning steps on the raw frame:
#   1. drop the columns previously identified as corrupt / unwanted
#   2. replace the remaining nulls with placeholder values
df_clean = df.pipe(remove_unwanted_cols).pipe(fillna)


In [29]:
## Split the label and the player identifier off from the feature frame.
## pop() removes each column from df_clean in place and returns it as a Series.
target = df_clean.pop(item='drafted')
player_ids = df_clean.pop(item='player_id')

In [50]:
## Apply StandardScaler and OHE (for team name and conference name)
## define the numeric cols and the category cols from the df_clean dataset
num_cols = list(df_clean.select_dtypes('number').columns)
# Keep the frame's own column order for the categorical columns. A set
# difference has no guaranteed ordering, so the one-hot feature order (and the
# encoder saved to disk) could differ between kernel restarts.
cat_cols = [col for col in df_clean.columns if col not in num_cols]

# apply Scaler, save to placeholder
scaler_df = scaler(df_clean, num_cols)

# apply ohe, save to placeholder
ohe_df = OneHotEncode(df_clean, cat_cols)

# merge the two placeholder dfs together into one DF
# NOTE: this overwrites df_clean with the transformed features, so this cell
# must not be re-run without re-running the preprocessing cells first.
df_clean = pd.concat([scaler_df, ohe_df], axis = 1)

# clean up placeholder variables from memory
del scaler_df
del ohe_df

In [10]:
# Summarise the raw dataframe: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56091 entries, 0 to 56090
Data columns (total 64 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   team                 56091 non-null  object 
 1   conf                 56091 non-null  object 
 2   GP                   56091 non-null  int64  
 3   Min_per              56091 non-null  float64
 4   Ortg                 56091 non-null  float64
 5   usg                  56091 non-null  float64
 6   eFG                  56091 non-null  float64
 7   TS_per               56091 non-null  float64
 8   ORB_per              56091 non-null  float64
 9   DRB_per              56091 non-null  float64
 10  AST_per              56091 non-null  float64
 11  TO_per               56091 non-null  float64
 12  FTM                  56091 non-null  int64  
 13  FTA                  56091 non-null  int64  
 14  FT_per               56091 non-null  float64
 15  twoPM                56091 non-null 

In [20]:
# Display the numeric column names that are passed to the scaler.
# NOTE(review): the saved output of this cell still lists 'drafted', although
# the target is popped from df_clean before num_cols is built -- the output
# looks stale (out-of-order execution); confirm with Restart Kernel -> Run All.
num_cols

['GP',
 'Min_per',
 'Ortg',
 'usg',
 'eFG',
 'TS_per',
 'ORB_per',
 'DRB_per',
 'AST_per',
 'TO_per',
 'FTM',
 'FTA',
 'FT_per',
 'twoPM',
 'twoPA',
 'twoP_per',
 'TPM',
 'TPA',
 'TP_per',
 'blk_per',
 'stl_per',
 'porpag',
 'adjoe',
 'Rec_Rank',
 'ast_tov',
 'rimmade',
 'rimmade_rimmiss',
 'midmade',
 'midmade_midmiss',
 'rim_ratio',
 'mid_ratio',
 'dunksmade',
 'dunksmiss_dunksmade',
 'dunks_ratio',
 'pick',
 'drtg',
 'adrtg',
 'dporpag',
 'stops',
 'bpm',
 'obpm',
 'dbpm',
 'gbpm',
 'mp',
 'ogbpm',
 'dgbpm',
 'oreb',
 'dreb',
 'treb',
 'ast',
 'stl',
 'blk',
 'pts',
 'drafted']

In [24]:
# Display the non-numeric column names that are passed to the one-hot encoder
cat_cols

['team', 'conf']