# Preparation

<b>Load Libraries</b>

In [32]:
# data structures
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
## settings
plt.rcParams['figure.figsize'] = (10, 6)
sns.set_theme('notebook')

# models selection
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score

# metrics
from sklearn.metrics import fbeta_score, make_scorer

# pipeline
from imblearn.pipeline import Pipeline

# compose
from sklearn.compose import ColumnTransformer

# preprocessings
from sklearn.preprocessing import StandardScaler, PowerTransformer, QuantileTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, TargetEncoder
from category_encoders.cat_boost import CatBoostEncoder

# decomposition
from sklearn.decomposition import PCA

# features selection
from mlxtend.feature_selection import SequentialFeatureSelector

# resamplings
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek

# algorithms
## linear_model
from sklearn.linear_model import LogisticRegression
## neighbors
from sklearn.neighbors import KNeighborsClassifier
## svm
from sklearn.svm import SVC
## tree
from sklearn.tree import DecisionTreeClassifier
## ensemble
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# mlflow
import mlflow
from mlflow.pyfunc import PythonModel
from mlflow.models.signature import infer_signature

# others
import re
from sklearn.base import BaseEstimator, TransformerMixin
from warnings import simplefilter
import joblib

In [33]:
# short model names grouped by model family
parametric_models = [
    'LR',
]
non_parametric_models = [
    'KNN',
    'SVM',
    'CART',
]
ensample_models = [
    'ET',
    'RF',
    'GB',
    'LGBM',
    'XGB',
]

<b>Load Dataset</b>

In [34]:
# dataset
df_base = pd.read_csv('../dataset/cleaned/train.csv')

## column names by dtype; the last object-dtype column is left out of
## cat_cols (presumably the target column - confirm against the CSV)
num_cols = list(df_base.select_dtypes(np.number).columns)
cat_cols = list(df_base.select_dtypes('object').columns)[:-1]

## positional indexes of those columns inside df_base
num_idxes = [
    i for i, name in enumerate(df_base.columns)
    if name in num_cols
]
cat_idxes = [
    i for i, name in enumerate(df_base.columns)
    if name in cat_cols
]

In [35]:
# cross validation
## raw values of the frame and the encoder used for the target labels
arr = df_base.values
le = LabelEncoder()

## every column but the last is a feature; the last column is label-encoded
X = arr[:, :-1]
y = le.fit_transform(arr[:, -1])

## stratified hold-out split: 30% of the rows go to the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=5,
)

<b>User-Defined Functions</b>

In [36]:
# parsed names:
def get_parsed_names(old_names: list, new_names: list) -> list:
    """Rebuild readable feature names from generated, index-based names.

    Every entry of ``new_names`` is split on ``'_'`` (empty pieces are
    dropped).  The second piece must end in ``x<index>``; ``<index>`` is the
    position of the original column inside ``old_names``.

    * ``'remainder__x2'`` (first piece is ``remainder``) maps to
      ``old_names[2]`` unchanged.
    * Any other name, e.g. ``'ohe__x1_red'``, maps to
      ``'<old_names[1]>_<last piece>'`` -> ``'<old_names[1]>_red'``.  Only the
      last underscore-separated piece is kept as the suffix.

    Parameters
    ----------
    old_names : list
        Original column names, indexable by the parsed column position.
    new_names : list
        Generated names to translate.

    Returns
    -------
    list
        One parsed name per entry of ``new_names``, in the same order.

    Raises
    ------
    IndexError
        If a name has fewer than two non-empty pieces or the parsed index is
        outside ``old_names``.
    ValueError
        If the second piece does not end in an integer after ``'x'``.
    """
    parsed_names = []

    for name in new_names:
        eles = [ele for ele in re.split('_', name) if ele != '']
        # second piece looks like "x<index>": keep only the integer index
        eles[1] = int(re.split('x', eles[1])[-1])

        # The prefix (first piece) identifies pass-through columns.  Comparing
        # the whole name with 'remainder' could never match here, because a
        # bare 'remainder' has no second piece to parse above.
        if eles[0] != 'remainder':
            parsed_names.append(
                f'{old_names[eles[1]]}_{eles[-1]}'
            )
        else:
            parsed_names.append(f'{old_names[eles[1]]}')

    return parsed_names

In [37]:
# models
def load_base_models() -> list:
    """Create one unfitted estimator per candidate algorithm.

    Returns
    -------
    list
        ``(short_name, estimator)`` tuples in the order LR, KNN, SVM, CART,
        ET, RF, GB, LGBM, XGB.  A new set of estimator objects is created on
        every call.
    """
    return [
        ('LR', LogisticRegression(n_jobs=-1)),
        ('KNN', KNeighborsClassifier(n_jobs=-1)),
        ('SVM', SVC()),
        ('CART', DecisionTreeClassifier()),
        ('ET', ExtraTreesClassifier(n_jobs=-1)),
        ('RF', RandomForestClassifier(n_jobs=-1)),
        ('GB', GradientBoostingClassifier()),
        # verbose=-1 is passed to LightGBM in addition to n_jobs
        ('LGBM', LGBMClassifier(verbose=-1, n_jobs=-1)),
        ('XGB', XGBClassifier(n_jobs=-1)),
    ]

In [38]:
# pipelines
def get_pipelines(
        base_models: list,
        ohe: list=None, other_encoders: list=None,
        scaling: list=None, pca: list=None, resampling: list=None,
) -> list:
    """Wrap every base model in a pipeline that starts with a ColumnTransformer.

    The numeric branch is a ``Pipeline`` made of ``scaling + pca + resampling``
    applied to the module-level ``num_idxes`` columns; it is only added when at
    least one of those steps is given.  The categorical branch is
    ``ohe + other_encoders``.  Columns not covered by any branch are passed
    through unchanged.

    Parameters
    ----------
    base_models : list
        ``(name, estimator)`` tuples.
    ohe, other_encoders : list, optional
        ``(name, transformer, column_indexes)`` triples for the
        ``ColumnTransformer``.  ``None`` means "no entries".
    scaling, pca, resampling : list, optional
        ``(name, estimator)`` steps of the numeric pipeline.  ``None`` means
        "no steps".

    Returns
    -------
    list
        ``(name, Pipeline)`` tuples, one per entry of ``base_models``, where
        each pipeline is ``[('transform', ColumnTransformer), (name, model)]``.
    """
    ## None replaces the former shared mutable `[]` defaults
    ohe = [] if ohe is None else ohe
    other_encoders = [] if other_encoders is None else other_encoders
    scaling = [] if scaling is None else scaling
    pca = [] if pca is None else pca
    resampling = [] if resampling is None else resampling

    ## [<name>, <estimator>]
    steps_num_pro = scaling + pca + resampling

    ## [<name>, <estimator>, <idxes>]
    cat_pro = ohe + other_encoders

    ## the numeric branch is only built when it has something to do
    if steps_num_pro:
        transformers = [('num_pro', Pipeline(steps_num_pro), num_idxes)] + cat_pro
    else:
        transformers = cat_pro

    ## every pipeline gets its own ColumnTransformer instance, so fitting one
    ## pipeline directly cannot overwrite the fitted state used by another
    pipelines = []
    for name, model in base_models:
        ct = ColumnTransformer(transformers, remainder='passthrough')
        pipelines.append((name, Pipeline([('transform', ct), (name, model)])))

    return pipelines

In [39]:
# kfold results
def get_kfold_results(models: list, X: np.ndarray, y: np.ndarray) -> tuple[list, list]:
    """Score each model with repeated stratified k-fold cross validation.

    Uses 10 splits repeated 3 times (``random_state=7``) and an F-beta scorer
    with ``beta=2``.  The mean and standard deviation of the fold scores are
    printed for every model.

    Parameters
    ----------
    models : list
        ``(name, estimator)`` tuples to evaluate.
    X, y : np.ndarray
        Features and target passed to ``cross_val_score``.

    Returns
    -------
    tuple[list, list]
        The model names and, aligned with them, the per-fold score arrays.
    """
    ## shared evaluation setup
    scoring = make_scorer(fbeta_score, beta=2)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=7)

    ## evaluate the models one after another, keeping input order
    names, results = [], []
    for name, model in models:
        cv_results = cross_val_score(model, X, y, scoring=scoring, cv=cv)
        print(f'{name}: {cv_results.mean()} ({cv_results.std()})')
        names.append(name)
        results.append(cv_results)

    return names, results

In [40]:
# selected models
def get_selected_base_models(names_to_choose: list) -> list:
    """Return the base models whose short name is in ``names_to_choose``.

    The models come from ``load_base_models()`` and keep that function's
    order; names that match no base model are ignored.

    Parameters
    ----------
    names_to_choose : list
        Short model names to keep (e.g. ``['LGBM']``).

    Returns
    -------
    list
        ``(name, estimator)`` tuples of the selected models.
    """
    selected_models = []
    for name, model in load_base_models():
        if name not in names_to_choose:
            continue
        selected_models.append((name, model))

    return selected_models

In [41]:
# kfold visualization
def plot_kfold_results(names: list, results: list):
    """Draw one box plot per model, ordered by mean score (best first).

    The tick labels of the best models (at most three) are drawn bold and red.

    Parameters
    ----------
    names : list
        Model names; used as column labels.
    results : list
        Per-model score sequences, aligned with ``names``.  They become the
        columns of a DataFrame.
    """
    ## one column per model
    data_to_plot = pd.DataFrame(data=dict(zip(names, results)))

    ## reorder the columns by descending mean score
    new_idxes = (
        data_to_plot.mean(axis=0)
        .sort_values(ascending=False)
        .index.tolist()
    )
    data_to_plot = data_to_plot.reindex(labels=new_idxes, axis=1)

    ##
    g = sns.boxplot(
        data_to_plot,
        fill=False,
        showmeans=True
    )
    ## slicing (instead of indexing 0..2) keeps this working when fewer than
    ## three models are plotted
    for label in g.get_xticklabels()[:3]:
        label.set_fontweight('bold')
        label.set_color('r')

<b>User-Defined Classes</b>

In [42]:
# class SFS Base
class FS_BaseUserDefinedTransformer(BaseEstimator, TransformerMixin):
    """Base transformer that builds and fits a ``ColumnTransformer`` in ``fit``.

    This class does not decide how columns are processed.  Before calling
    ``fit`` of this class, the following attributes must already be set on the
    instance (``fit`` and ``_get_transformers`` read them):

    * ``num_idxes`` / ``cat_idxes`` - column indexes of each kind,
    * ``num_pro`` / ``cat_pro`` - ``ColumnTransformer`` entries for each kind,
    * ``X_fit_`` - the 2-D array the ``ColumnTransformer`` is fitted on.

    Parameters
    ----------
    ohe, other_encoders
        Encoders for categorical columns; stored as given.
    scaling, factor_analysis : list
        ``(name, estimator)`` steps for numeric columns; stored as given.
    """

    ##
    def __init__(self,
                 ohe=None, other_encoders=None,
                 scaling: list=[], factor_analysis: list=[]) -> None:
        ### The arguments are stored untouched.  The list defaults are kept
        ### as lists because SFS_OSP.fit concatenates them with `+`; this
        ### class never mutates them.
        self.scaling, self.factor_analysis = scaling, factor_analysis
        ###
        self.ohe, self.other_encoders = ohe, other_encoders

    ##
    def _check_ndim(self, X: np.ndarray) -> tuple[np.ndarray, int]:
        """Return ``X`` as a 2-D array together with its number of columns.

        A 2-D input is returned as is; any other input is reshaped to a
        single column.
        """
        if X.ndim == 2:
            X_ = X
            num_iters = X.shape[1]
        else:
            X_ = X.reshape(-1, 1)
            num_iters = 1

        return X_, num_iters

    ##
    def _category_detection(self, X: np.ndarray) -> tuple[list, list, np.ndarray]:
        """Split the column indexes of ``X`` into numeric and categorical ones.

        A column counts as numeric when its first row can be converted to
        ``float``; only that first row is inspected.

        Returns
        -------
        tuple[list, list, np.ndarray]
            Numeric indexes, categorical indexes and the 2-D view of ``X``.
        """
        num_idxes, cat_idxes = [], []

        ### check dimension
        X_, num_iters = self._check_ndim(X=X)

        ###
        for i in range(num_iters):
            try:
                X_[:1, i].astype(float)
            except (TypeError, ValueError):
                ### not convertible to float -> categorical.  Only conversion
                ### failures are caught, so KeyboardInterrupt and other
                ### unrelated errors are no longer swallowed.
                cat_idxes.append(i)
            else:
                num_idxes.append(i)

        return num_idxes, cat_idxes, X_

    ##
    def _get_transformers(self, cat_idxes: list=[], num_idxes: list=[]) -> list:
        """Pick the ``ColumnTransformer`` entries matching the column kinds present.

        Returns ``self.cat_pro`` when there are no numeric columns,
        ``self.num_pro`` when there are no categorical columns, and both
        (numeric first) otherwise.
        """
        if not num_idxes:
            transformers = self.cat_pro
        elif not cat_idxes:
            transformers = self.num_pro
        else:
            transformers = self.num_pro + self.cat_pro

        return transformers

    ##
    def fit(self, X: np.ndarray, y=None):
        """Build ``self.ct`` and fit it on ``self.X_fit_``.

        ``X`` and ``y`` are not read here; the data comes from the
        ``X_fit_`` attribute set beforehand.  Returns ``self``.
        """
        transformers = self._get_transformers(self.cat_idxes, self.num_idxes)
        self.ct = ColumnTransformer(transformers, remainder='passthrough')
        self.ct.fit(self.X_fit_)

        return self

    ##
    def transform(self, X: np.ndarray, y=None):
        """Transform ``X`` (made 2-D first) with the fitted ``self.ct``."""
        X_, _ = self._check_ndim(X=X)

        return self.ct.transform(X=X_)

In [43]:
# Onehot_Scaling_Pca
class SFS_OSP(FS_BaseUserDefinedTransformer):
    """One-hot / scaling / factor-analysis variant of the base transformer.

    Numeric columns go through a pipeline made of the ``scaling`` steps
    followed by the ``factor_analysis`` steps; categorical columns go through
    ``ohe``.
    """

    def fit(self, X: np.ndarray, y=None):
        """Detect the column kinds of ``X``, set up both branches and fit.

        Returns ``self``.
        """
        ### column kinds and the 2-D data used for fitting
        detected = self._category_detection(X)
        self.num_idxes, self.cat_idxes, self.X_fit_ = detected

        ### categorical branch: the one-hot encoder on the categorical columns
        self.cat_pro = [('cat_pro', self.ohe, self.cat_idxes)]

        ### numeric branch: scaling steps first, then factor analysis steps
        numeric_steps = self.scaling + self.factor_analysis
        self.num_pro = [('num_pro', Pipeline(numeric_steps), self.num_idxes)]

        ### the parent builds and fits the ColumnTransformer
        super().fit(X=X)
        return self

In [1]:
from draft import testing_mlflow_model

# 1. Testing

In [45]:
## warnings: 'ignore' is installed without a category filter, so every
## warning raised from here on is suppressed
simplefilter(action='ignore')

## transformers: one-hot encoding (first level dropped, dense output) for the
## categorical columns and standard scaling for the numeric ones
transformers = SFS_OSP(
    ohe=OneHotEncoder(drop='first', sparse_output=False), 
    scaling=[('scaling', StandardScaler())]
)
## base model: a list holding the single ('LGBM', estimator) tuple
base_model = get_selected_base_models(
    names_to_choose=['LGBM']
)

## pipeline: the transformer step followed by the selected model
pipeline = Pipeline(
    steps=[('transformers', transformers)] + base_model
)

## sfs: forward selection scored with F-beta (beta=2) over 10 splits x 3
## repeats; k_features='best' keeps the best-scoring subset size
sfs = SequentialFeatureSelector(
    estimator=pipeline, 
    k_features='best', forward=True, 
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=7),
    scoring=make_scorer(fbeta_score, beta=2), 
    verbose=2
)
sfs.fit(X_train, y_train)


[2024-07-24 13:59:46] Features: 1/19 -- score: 0.2381489118845433
[2024-07-24 14:00:26] Features: 2/19 -- score: 0.42408191254368577
[2024-07-24 14:01:10] Features: 3/19 -- score: 0.5754211684925191
[2024-07-24 14:01:44] Features: 4/19 -- score: 0.6225729598342019
[2024-07-24 14:02:34] Features: 5/19 -- score: 0.6584611009627147
[2024-07-24 14:03:17] Features: 6/19 -- score: 0.66052986385995
[2024-07-24 14:03:54] Features: 7/19 -- score: 0.6617499533205715
[2024-07-24 14:04:28] Features: 8/19 -- score: 0.6601361994499585
[2024-07-24 14:04:54] Features: 9/19 -- score: 0.7311445443588926
[2024-07-24 14:05:17] Features: 10/19 -- score: 0.7990904116019475
[2024-07-24 14:05:52] Features: 11/19 -- score: 0.8020770922467613
[2024-07-24 14:06:17] Features: 12/19 -- score: 0.8020770922467613
[2024-07-24 14:06:39] Features: 13/19 -- score: 0.8005928428259913
[2024-07-24 14:06:58] Features: 14/19 -- score: 0.8008545466925517
[2024-07-24 14:07:15] Features: 15/19 -- score: 0.8014141696172937
[202

In [46]:
# point MLflow at the tracking server on this machine, port 5000
# (presumably started separately - confirm it is running before this cell)
mlflow.set_tracking_uri(uri='http://127.0.0.1:5000/')

In [47]:
experiment_name = 'draft'

# Create the experiment (with its artifact location) only when the tracking
# server does not know the name yet.  An explicit lookup replaces the former
# bare `except:`, which reported every failure - including connection
# errors - as "already existed".
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(
        name=experiment_name,
        artifact_location='mlflow/_artifacts_store/'
    )
else:
    print(f'\"{experiment_name}\" has already existed')

# in both cases make it the active experiment
mlflow.set_experiment(
    experiment_name=experiment_name
)

In [48]:
# to log
## data: signature of the selected features, best CV score and the names of
## the selected columns
signature = infer_signature(model_input=sfs.transform(X_train))
avg_fbeta = sfs.k_score_
selected_features_name = df_base.columns[list(sfs.k_feature_idx_)].tolist()

## FeatureSelector: persist the fitted selector and remember where it went
artifacts_path = {'feature_selector': '_artifacts/ohe_std.joblib'}
joblib.dump(sfs, artifacts_path['feature_selector'])

['_artifacts_store/ohe_std.joblib']

In [49]:
# to log: model
# testing_class is defined in the local `draft` module, which is not shown
# here; it is called with the same arguments as SFS_OSP above
# (presumably an importable copy of that class - TODO confirm)
from draft import testing_class as testing_transformer

## transformer
transformers = testing_transformer(
    ohe=OneHotEncoder(drop='first', sparse_output=False), 
    scaling=[('scaling', StandardScaler())]
)

## base model: a list holding the single ('LGBM', estimator) tuple
base_model = get_selected_base_models(
    names_to_choose=['LGBM']
)

## pipeline: fitted only on the columns kept by the feature selector
pipeline = Pipeline(
    steps=[('transformers', transformers)] + base_model
)
pipeline.fit(sfs.transform(X_train), y_train)

## save model: the path is added to the artifacts dict next to the selector
artifacts_path['model'] = '_artifacts/ohe_std_lgbm.joblib'
joblib.dump(
    value=pipeline, 
    filename=artifacts_path['model']
)


['_artifacts_store/ohe_std_lgbm.joblib']

In [50]:
# The signature is inferred from "X_train", not from the "sfs"-transformed
# data: the logged model receives the raw features.
with mlflow.start_run():
    model_info = mlflow.pyfunc.log_model(
        artifact_path='testing',
        python_model=testing_mlflow_model(),
        signature=infer_signature(X_train),
        artifacts=artifacts_path,
        # 'scikit-learn' is the installable distribution name; the former
        # 'sklearn' entry is a deprecated PyPI placeholder that fails to install.
        # NOTE(review): the dumped pipeline is an imblearn Pipeline around an
        # LGBMClassifier - 'imbalanced-learn' and 'lightgbm' are probably
        # needed here as well; confirm against the serving environment.
        pip_requirements=['joblib', 'mlxtend', 'scikit-learn']
    )

In [51]:
# reload the logged model through its URI and predict on the raw training
# features as a round-trip check
x = mlflow.pyfunc.load_model(model_info.model_uri)
x.predict(X_train)

array([0, 0, 0, ..., 0, 1, 0])