In [None]:
%load_ext nb_black
# Autoformat cells

In [2]:
import pandas as pd
from dss_toolkit.data_build.cleaning import (
    qa_values,
    qa_strip_whitespace,
    replace_low_cardinality,
    replace_iqr_outlier,
)
from dss_toolkit.data_analysis.eda import eda_categorical_cols, eda_numeric_cols
from dss_toolkit.helpers.pandas import show

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
show(train.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Identify variables

In [5]:
# Column roles used throughout the notebook.
# "CabinClass" is not in the raw CSV; it is derived from Cabin before first use.
categorical_variables = ["Sex", "CabinClass", "Embarked"]
numeric_variables = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

target_variable = "Survived"
# Model inputs: numeric columns first, then the categorical ones.
feature_variables = [*numeric_variables, *categorical_variables]

In [6]:
# New feature: first character of Cabin, with "U" wherever Cabin is missing.
train["CabinClass"] = train["Cabin"].str[0].fillna("U")
test["CabinClass"] = test["Cabin"].str[0].fillna("U")

In [7]:
display(eda_categorical_cols(train, categorical_cols=categorical_variables))
display(eda_numeric_cols(train, numeric_cols=numeric_variables))

Unnamed: 0,column_name,nunique,n_items,unique_vals,val_count,dtypes
0,Sex,2,891,"[male, female]","[577, 314]",object
1,CabinClass,9,891,"[U, C, B, D, E, A, F, G, T]","[687, 59, 47, 33, 32, 15, 13, 4, 1]",object
2,Embarked,3,889,"[S, C, Q]","[644, 168, 77]",object


Unnamed: 0,column_name,min_value,q05,q25,q50,q75,q95,max_value,mean_value,count,dtypes
0,Pclass,1.0,1.0,2.0,3.0,3.0,3.0,3.0,2.308642,891,int64
1,Age,0.42,4.0,20.125,28.0,38.0,56.0,80.0,29.699118,714,float64
2,SibSp,0.0,0.0,0.0,0.0,1.0,3.0,8.0,0.523008,891,int64
3,Parch,0.0,0.0,0.0,0.0,0.0,2.0,6.0,0.381594,891,int64
4,Fare,0.0,7.225,7.9104,14.4542,31.0,112.07915,512.3292,32.204208,891,float64


## Data Cleaning

In [8]:
# Remove white spaces, replace empty string with nan


def clean_dataframe(df, categorical_variables, numeric_columns=None):
    """Return a cleaned copy of ``df``; the input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the categorical columns, ``CabinClass``, ``Age``,
        ``Fare`` and every column listed in ``numeric_columns``.
    categorical_variables : list of str
        Columns passed to ``qa_strip_whitespace``; any value still missing
        afterwards is set to the literal string ``"NA"``.
    numeric_columns : list of str, optional
        Columns whose remaining missing values are filled with 0. When omitted,
        the notebook-level ``numeric_variables`` list is used, which is what
        this function previously read implicitly.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    if numeric_columns is None:
        # Backward-compatible fallback to the notebook-level list.
        numeric_columns = numeric_variables

    cleaned = df.copy()

    # Categorical columns: strip whitespace, then label missing values "NA".
    for col in categorical_variables:
        qa_strip_whitespace(cleaned, col, replace_nan=True, inplace=True)
        cleaned.loc[cleaned[col].isna(), col] = "NA"

    # Presumably groups rare CabinClass levels into one bucket - confirm against
    # the dss_toolkit documentation.
    replace_low_cardinality(
        cleaned, "CabinClass", cut_off_percentile=0.9, inplace=True
    )

    # Numeric range checks; the replacement value is the column median
    # (presumably applied to values outside the given bounds - TODO confirm).
    qa_values(
        cleaned,
        "Age",
        min_value=0,
        max_value=100,
        mode="replace",
        replacement=cleaned["Age"].quantile(0.5),
        inplace=True,
    )
    qa_values(
        cleaned,
        "Fare",
        min_value=0,
        mode="replace",
        replacement=cleaned["Fare"].quantile(0.5),
        inplace=True,
    )

    replace_iqr_outlier(cleaned, "Fare", inplace=True)

    # NOTE(review): any value still missing is filled with 0, so a passenger
    # with unknown Age ends up with Age == 0. Consider a median fill instead.
    for col in numeric_columns:
        cleaned.loc[cleaned[col].isna(), col] = 0
    return cleaned

In [9]:
train_df = clean_dataframe(train, categorical_variables)
test_df = clean_dataframe(test, categorical_variables)

In [10]:
display(eda_categorical_cols(train_df, categorical_cols=categorical_variables))
display(eda_numeric_cols(train_df, numeric_cols=numeric_variables))

Unnamed: 0,column_name,nunique,n_items,unique_vals,val_count,dtypes
0,Sex,2,891,"[male, female]","[577, 314]",object
1,CabinClass,2,891,"[U, Others]","[687, 204]",object
2,Embarked,4,891,"[S, C, Q, NA]","[644, 168, 77, 2]",object


Unnamed: 0,column_name,min_value,q05,q25,q50,q75,q95,max_value,mean_value,count,dtypes
0,Pclass,1.0,1.0,2.0,3.0,3.0,3.0,3.0,2.308642,891,int64
1,Age,0.0,0.0,6.0,24.0,35.0,54.0,80.0,23.799293,891,float64
2,SibSp,0.0,0.0,0.0,0.0,1.0,3.0,8.0,0.523008,891,int64
3,Parch,0.0,0.0,0.0,0.0,0.0,2.0,6.0,0.381594,891,int64
4,Fare,0.0,7.225,7.9104,14.4542,31.0,112.07915,112.07915,30.093492,891,float64


# Machine Learning Pipeline (Classification)

The following steps assume that the data is already cleaned:
    - Correct data types
    - no missing values
    - no invalid values
    - no outliers

In [11]:
# Split your data
from sklearn.model_selection import train_test_split

# Features and target come from the cleaned training frame.
X = train_df[feature_variables]
y = train_df[target_variable]

# Hold out 20% of the rows, stratified on the target. The fixed seed keeps the
# split, and every metric computed from it, identical across re-runs.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=42
)

##  Use precoded preprocessing functions or develop your own

For a custom preprocessing function, follow this structure:

- Function Name: `preprocess_<name>()`
- Parameters:
    - `X`
    - `y` (may default to `None`)
    - `**kwargs` : to support any parameters used for preprocessing (eg. `n_components` for PCA)
- Returns a tuple as `(preprocessed_X, preprocessed_y), preprocessor_data`, where:
    - `preprocessed_X` : Preprocessed X
    - `preprocessed_y` : Preprocessed y
    - `preprocessor_data` : any variables used to "fit" the preprocessor such as scalers, PCA components etc.
   

In [12]:
from dss_toolkit.modeling.preprocessing import (
    preprocess_scale_onehot,
    preprocess_scale_onehot_drop_correlated,
)

In [15]:
# Check output
(preprocessed_X, preprocessed_y), preprocessor_data = preprocess_scale_onehot(
    train_X,
    train_y,
    numeric_columns=numeric_variables,
    categorical_columns=categorical_variables,
)
display(preprocessed_X.head())
display(preprocessed_y.head())
display(preprocessor_data)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,CabinClass_Others,CabinClass_U,Embarked_C,Embarked_NA,Embarked_Q,Embarked_S
566,3.0,0.730769,0.0,0.0,0.337728,0,1,0,1,0,0,0,1
677,3.0,0.692308,0.0,0.0,0.42096,1,0,0,1,0,0,0,1
75,3.0,0.961538,0.0,0.0,0.327214,0,1,1,0,0,0,0,1
401,3.0,1.0,0.0,0.0,0.344323,0,1,0,1,0,0,0,1
349,3.0,1.615385,0.0,0.0,0.370522,0,1,0,1,0,0,0,1


566    0
677    1
75     0
401    0
349    0
Name: Survived, dtype: int64

{'scaler': RobustScaler(with_centering=False),
 'dummies_cols': Index(['Sex_female', 'Sex_male', 'CabinClass_Others', 'CabinClass_U',
        'Embarked_C', 'Embarked_NA', 'Embarked_Q', 'Embarked_S'],
       dtype='object')}

## Create Model Functions

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Must follow this parameter list
def train_random_forest(train_data, train_labels, val_data, val_labels, **kwargs):
    """Fit a ``RandomForestClassifier`` on the training data and return it.

    Parameters
    ----------
    train_data, train_labels
        Features and labels passed to ``fit``.
    val_data, val_labels
        Not used by this model; kept because model functions must follow this
        parameter list.
    **kwargs
        Optional hyperparameters; any other keys are ignored:

        - ``rf_n_estimators`` : number of trees (default 100).
        - ``rf_max_depth`` : maximum tree depth (default ``None``).
        - ``rf_random_state`` : seed for the forest (default ``None``, i.e.
          unseeded as before). Pass an int for reproducible fits.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    clf = RandomForestClassifier(
        n_estimators=kwargs.get("rf_n_estimators", 100),
        max_depth=kwargs.get("rf_max_depth", None),
        random_state=kwargs.get("rf_random_state", None),
    )
    clf.fit(train_data, train_labels)
    return clf


def predict_random_forest(model, X, y=None):
    """Return ``model.predict(X)``.

    ``y`` is accepted but not used by this function.
    """
    return model.predict(X)

In [None]:
# Test your function
model = train_random_forest(
    preprocessed_X, preprocessed_y, val_data=None, val_labels=None, rf_max_depth=20
)
preds = predict_random_forest(model, preprocessed_X)
preds[:4]

In [None]:
preprocessed_X.shape

# Use Pipeline

In [None]:
from dss_toolkit.modeling.base import train_test_oot_pipeline

In [None]:
# Run the pipeline with the scale + one-hot preprocessor and the random-forest
# train/predict functions defined earlier in this notebook.
# NOTE(review): test_X/test_y are passed twice - presumably once as the test set
# and once as the out-of-time (OOT) set, since no separate OOT sample exists in
# this notebook; confirm against train_test_oot_pipeline's signature.
results, runs = train_test_oot_pipeline(
    train_X,
    train_y,
    test_X,
    test_y,
    test_X,
    test_y,
    data_preprocessor_function=preprocess_scale_onehot,
    numeric_columns=numeric_variables,
    categorical_columns=categorical_variables,
    train_model_function=train_random_forest,
    predict_model_function=predict_random_forest,
    learning="classification",
)

In [None]:
results

In [None]:
runs["train"]["model"]

In [None]:
y_true, y_pred = runs["test"]["results"]

In [None]:
from dss_toolkit.modeling.model_metrics import ks_stat, ks_table, ks_2samp

In [None]:
ks_2samp(y_true, y_pred, alternative="less"), ks_stat(y_true, y_pred)

In [None]:
# NOTE(review): this rebinds `ks_2samp`, which an earlier cell imported from
# dss_toolkit.modeling.model_metrics - confirm which implementation is intended.
from scipy.stats import ks_2samp

# Two small samples of equal size to sanity-check the two-sample KS test.
a = [1, 2, 3, 4, 5, 10, 11, 12, 13, 14]
b = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
ks_2samp(a, b)

In [None]:
len(a), len(b)

In [None]:
y_true.shape

In [None]:
ks_table(y_true, y_pred, bins=150).difference.max()

In [None]:
ks_table(y_true, y_pred, bins=150)

In [None]:
show(runs["test"]["decile_performance"], n_decimals=4)

In [None]:
runs["train"]["preprocessor_data"]

In [None]:
# Same call as the classification run earlier in the notebook, but with
# learning="regression". This rebinds `results` and `runs`, so any later cell
# that reads `runs` gets this run's model and preprocessor data.
# NOTE(review): the train/predict functions still wrap RandomForestClassifier -
# confirm that "regression" mode is meant to be used with a classifier.
results, runs = train_test_oot_pipeline(
    train_X,
    train_y,
    test_X,
    test_y,
    test_X,
    test_y,
    data_preprocessor_function=preprocess_scale_onehot,
    numeric_columns=numeric_variables,
    categorical_columns=categorical_variables,
    train_model_function=train_random_forest,
    predict_model_function=predict_random_forest,
    learning="regression",
)

In [None]:
results

# Use Scoring from model

In [None]:
from dss_toolkit.modeling.base import inference_pipeline

In [None]:
# Reuse the model and the fitted preprocessing artifacts from the latest run.
model = runs["train"]["model"]
preprocessor_data = runs["train"]["preprocessor_data"]

# Rows to score. This was previously built but never used: the call below
# relied on the `X` left over from the split cell.
score_X = train_df[feature_variables]

score_results = inference_pipeline(
    score_X,
    y=None,
    model=model,
    data_preprocessor_function=preprocess_scale_onehot,
    numeric_columns=numeric_variables,
    categorical_columns=categorical_variables,
    preprocessor_data=preprocessor_data,
    train_model_function=train_random_forest,
    predict_model_function=predict_random_forest,
    learning="classification",
)

In [None]:
score_results

In [None]:
score_results["results"]