# Playground-series-s3e26

Kyle Lacson 


## I. Libraries and Packages

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os 
import warnings
warnings.filterwarnings('ignore')

### I.ii Import dataset

In [None]:
# --- Choose the data directory: the local repository or the Kaggle input folder
cwd = os.getcwd()
base_dir = cwd if 'kylelacson' in cwd else r'/kaggle/input/playground-series-s3e26'

# Step into a competition sub-folder whenever the directory contains one
for entry in os.listdir(base_dir):
    if 'playground-series-s3e26' in entry:
        base_dir = os.path.join(base_dir, entry)

# --- Load the csv files; the first keyword found in a file name decides
#     which variable receives it, and any other file is ignored
for entry in os.listdir(base_dir):
    csv_path = os.path.join(base_dir, entry)
    if 'sample' in entry:
        sample = pd.read_csv(csv_path)
    elif 'train' in entry:
        train = pd.read_csv(csv_path)
    elif 'test' in entry:
        test = pd.read_csv(csv_path)

In [None]:
# --- The ten columns with the fewest distinct values (ascending order)
train.nunique().sort_values().head(10)

### I.iii Transform dataset

In [None]:
# --- Separate the object (categorical) columns, which need encoding,
#     from the numeric columns, which are used as they are
category = train.select_dtypes(include = 'object')
number = train.select_dtypes(include = 'number')

In [None]:
from sklearn.preprocessing import LabelEncoder
def label_encoder(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Function: Label encode categorical columns.

    Every column of ``df`` is replaced by the integer codes produced by its
    own, freshly fitted ``LabelEncoder``.

    Parameters:
        df: DataFrame whose columns should all be label encoded.

    Returns:
        A new DataFrame with the same columns and index holding the encoded
        values. The input DataFrame is left unmodified.
    '''
    # Work on a copy so the caller's frame (often a slice of a bigger frame)
    # is never modified behind its back.
    encoded = df.copy()
    for column in encoded.columns:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

In [None]:
# --- Label encode every categorical column with the label_encoder function
category = label_encoder(category)

In [None]:
# --- Merging the numeric and label-encoded columns side by side,
#     then dropping the 'id' column
data = pd.concat([number,category],axis = 1).drop(columns = 'id')

In [None]:
# --- Preview the first rows of the merged dataset
data.head()

## II. Explore data 

In [None]:
# --- Heatmap
# Compute the correlation matrix once and reuse it for the mask and the plot
corr = data.corr()

# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of columns is shown only once
mask = np.triu(np.ones_like(corr, dtype = bool))

plt.figure(figsize = (12,12))
sns.heatmap(data = corr, annot = True, fmt = '.1f', cmap = 'vlag', mask = mask);

In [None]:
# --- Edema & Ascites: counts per value, split by Status, on a shared y axis
fig, ax = plt.subplots(1, 2, figsize = (8,8), sharey = True)
for axis, column in zip(ax, ('Edema', 'Ascites')):
    sns.countplot(data = data, x = column, hue = 'Status', ax = axis)
    axis.set_title(column)


### II.ii  Prepare dataset

In [None]:
# --- Split dataset
from sklearn.model_selection import train_test_split
# Features are every column except the target column 'Status'
X = data.drop(columns = 'Status')
y = data['Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, shuffle = True
)

# --- Creating a validation set
# The last 10 percent of the rows of the training split are held out as the
# evaluation set; the remaining rows stay in the training set
train_split = int(len(X_train) * 0.10)
split_at = len(X_train) - train_split
X_val, y_val = X_train.iloc[split_at:], y_train.iloc[split_at:]
X_train, y_train = X_train.iloc[:split_at], y_train.iloc[:split_at]

In [None]:
# --- Scaling dataset (not truly needed for tree based classifiers)
from sklearn.preprocessing import StandardScaler

def scale_dataframes(dataframe: pd.DataFrame, scaler = None) -> pd.DataFrame:
    '''
    Function: Standardize the columns of a dataframe.

    Parameters:
        dataframe: DataFrame to scale.
        scaler: Optional, already fitted scaler exposing ``transform``. When
            given, its statistics are applied as they are, which lets the
            validation and test sets be scaled with the statistics learned
            from the training set. When omitted, a new ``StandardScaler`` is
            fitted on ``dataframe`` itself.

    Returns:
        A new DataFrame with the same column names holding the scaled values.
        The index is not carried over (the result has a default index).
    '''
    if scaler is None:
        scaled = StandardScaler().fit_transform(dataframe)
    else:
        scaled = scaler.transform(dataframe)
    return pd.DataFrame(data = scaled, columns = dataframe.columns)

In [None]:
# Fit the scaler on the training split only and apply those same statistics to
# the validation and test splits. Scaling each split with its own mean and
# standard deviation would put the three sets on different scales, so the
# thresholds learned during training would not match the evaluation data.
scaler = StandardScaler().fit(X_train)
X_train_sc = pd.DataFrame(data = scaler.transform(X_train), columns = X_train.columns)
X_val_sc = pd.DataFrame(data = scaler.transform(X_val), columns = X_val.columns)
X_test_sc = pd.DataFrame(data = scaler.transform(X_test), columns = X_test.columns)

## III. Classification Modeling

### III.ii Decision Tree Classifier (SKLEARN)

In [None]:
# --- Sklearn Decision Tree Classifier 
from sklearn.tree import DecisionTreeClassifier

# Fit the tree on the scaled training set (fit returns the estimator itself),
# then predict the scaled test set
dt_sk = DecisionTreeClassifier(random_state = 42).fit(X_train_sc, y_train)
y_dt_pred = dt_sk.predict(X_test_sc)

### III.iii Random Forest Classifier (SKLEARN)

In [None]:
# --- Sklearn Random Forest Tree Classifier
from sklearn.ensemble import RandomForestClassifier

# Fit the forest on the scaled training set (fit returns the estimator itself),
# then predict the scaled test set
rf_sk = RandomForestClassifier(random_state = 42).fit(X_train_sc, y_train)
y_rf_pred = rf_sk.predict(X_test_sc)

### III.iv XGBoost Classifiers (Gradient Boosting and Random Forest)

In [None]:
# --- Check if a GPU is available and set device to built in GPU support (MPS)
import torch
# Pick the first available backend: Apple MPS, then CUDA, then the CPU.
# The result is always a plain string, so every branch has the same type.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# --- XGBoost Classifier and Random Forest Classifier
from xgboost import XGBClassifier, XGBRFClassifier

# XGBoost only accepts its own device names ('cpu', 'cuda', 'gpu'), so any
# other detected device (for example Apple's 'mps') falls back to the CPU.
xgb_device = 'cuda' if str(device).startswith('cuda') else 'cpu'

# Settings shared by both XGBoost models
params2 = {
    'random_state': 42,
    'n_jobs': 4,
    'verbosity': 1,
    'device': xgb_device
    # eval_metric: # change log loss to multiclass entropy loss
}

# Gradient Boosting: same settings plus early stopping
params = {**params2, 'early_stopping_rounds': 10}

# Both models are evaluated on the training set ("validation_0") and on the
# validation set ("validation_1") after every boosting round. Per the XGBoost
# documentation, early stopping watches the last entry of eval_set.
eval_sets = [(X_train_sc, y_train), (X_val_sc, y_val)]

xgb_clf = XGBClassifier(**params)
xgb_clf.fit(X_train_sc, y_train, eval_set = eval_sets)
y_xgbclf_pred = xgb_clf.predict(X_test_sc)

# Random Forest
xgbrf_clf = XGBRFClassifier(**params2)
xgbrf_clf.fit(X_train_sc, y_train, eval_set = eval_sets)
y_xgbrf_pred = xgbrf_clf.predict(X_test_sc)

In [None]:
# --- XGBClassifier learning curves: mlogloss per boosting round
xgb_results = xgb_clf.evals_result_

# Keys follow the order of the eval_set passed to fit
curves = (
    ("validation_0", "Training loss"),
    ("validation_1", "Validation loss"),
)

plt.figure(figsize=(10,7))
for eval_key, curve_label in curves:
    plt.plot(xgb_results[eval_key]["mlogloss"], label=curve_label)
# Vertical line at the best iteration reported by the fitted model
plt.axvline(xgb_clf.best_iteration, color="gray", label="Optimal tree number")
plt.xlabel("Number of trees")
plt.ylabel("Loss")
plt.legend();

### III.v CatBoost Classifier

In [None]:
# --- Catboost Classifier
from catboost import CatBoostClassifier
import ipywidgets # required for plotting with catboosts

# CatBoost settings. The processing unit is selected with 'task_type';
# 'devices' is only meant for listing GPU ids, so it is not used here.
params = {
    'random_seed': 42,
    'verbose': True,
    'thread_count': 4,
    'task_type': 'CPU',
    'early_stopping_rounds': 25
}

cb_clf = CatBoostClassifier(**params)
# Verbosity and early stopping are already set on the model above, so fit only
# needs the training data and the validation set used for evaluation.
cb_clf.fit(X_train_sc, y_train, eval_set = (X_val_sc, y_val))

In [None]:
# --- Evaluation results stored on the fitted CatBoost model (shown as the cell output)
cb_clf.evals_result_

In [None]:
# --- Same evaluation results through the getter method; it has to be called
#     (with parentheses) to return the values instead of the method object
cb_clf.get_evals_result()

## IV. Metrics

In [None]:
class Metric:
    """Holder for one classifier's name, predictions and model object.

    Work in progress: ``metrics_func`` is still an empty placeholder.
    """

    # Class-level list shared by every instance; nothing in this class
    # writes to it yet.
    metric_list = []

    def __init__(self, name_of_classifier, y_pred, classifier):
        """Store the three constructor arguments on the instance."""
        self.name, self.y_pred, self.classifier = (
            name_of_classifier,
            y_pred,
            classifier,
        )

    def metrics_func(self):
        """Placeholder; does nothing and returns ``None``."""
        return None
        
        

In [None]:
# --- Metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, classification_report

def metrics_func(y_pred, classifier, y_true = None):
    '''
    Function: Plot the confusion matrix and print the classification report,
    accuracy and balanced accuracy for one classifier's predictions.

    Parameters:
        y_pred: Predicted labels.
        classifier: Fitted classifier; its ``classes_`` attribute provides the
            labels shown on the confusion matrix.
        y_true: True labels to compare against. Defaults to the notebook-level
            ``y_test``, so existing two-argument calls behave as before.

    Returns:
        None. The results are plotted and printed.
    '''
    if y_true is None:
        y_true = y_test

    acc = accuracy_score(y_true, y_pred)
    bal_acc = balanced_accuracy_score(y_true, y_pred)

    # --- Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels = classifier.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = classifier.classes_)
    disp.plot()
    plt.show()

    print(classification_report(y_true, y_pred))
    print('*'*75,'\n')
    print(f'Accuracy: {acc}\nBalanced Accuracy: {bal_acc}')
    
#     return acc, bal_acc


In [None]:
# XGBoost random forest (XGBRFClassifier): report metrics for its test-set predictions
print('XGBRF Classifier')
metrics_func(y_xgbrf_pred, xgbrf_clf)

In [None]:
# XGBoost gradient boosting (XGBClassifier): report metrics for its test-set predictions
print('XGBoost Classifier')
metrics_func(y_xgbclf_pred, xgb_clf)

In [None]:
# Scikit-learn random forest: report metrics for its test-set predictions
print('Random Forest')
metrics_func(y_rf_pred,rf_sk)

In [None]:
# Scikit-learn decision tree: report metrics for its test-set predictions
print('Decision Tree')
metrics_func(y_dt_pred, dt_sk)

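**Notes:** The class labels are heavily imbalanced: there is a big disparity in how many samples each `Status` class has. This imbalance still needs to be handled before the models are compared.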