## Develop Classification Models

In [1]:
import pandas as pd
from scipy.stats import distributions
import mlflow
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline


import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", None)
%config Completer.use_jedi = False

Helper Functions

In [2]:
class LogUniformInt:
    
    def __init__(self, min_val, max_val):
        self.min_val = min_val
        self.max_val = max_val 
        
    def rvs(self, random_state=None):
        """
        rvs method that is needed by RandomSearchCV
        """
        
        # call the loguniform distribution that is built into scipy
        lu = distributions.loguniform(self.min_val, self.max_val)
        
        # convert outut to integer
        rand_int = int(lu.rvs(random_state=random_state))
        
        return rand_int

Create our MLflow Database to Log Results

`experiment_id` is `1`

In [3]:
mlflow.set_tracking_uri("../mlflow/mlruns/")
#experiment_id = mlflow.create_experiment(name="credit_score_classification")
mlflow.set_experiment(experiment_id="1")
mlflow.sklearn.autolog(log_models=False, max_tuning_runs=100)

#### Read in and standardize data

In [4]:
# get training data
X_train_df = pd.read_csv("../data/train_cleaned_imputed.csv")
y_train_df = X_train_df.pop("Credit_Score")

y_train = y_train_df.values
X_train = X_train_df.values

# get dev data
X_dev_df = pd.read_csv("../data/dev_cleaned_imputed.csv")
y_dev_df = X_dev_df.pop("Credit_Score")

y_dev = y_dev_df.values
X_dev = X_dev_df.values

Train Model

According to the `RandomSearchCV` [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html), our param_distributions are lists that will be sampled uniformly or distributions with a `rvs` method for sampling (such as those from scipy.stats.distributions).


[Example project](https://jamesrledoux.com/code/randomized_parameter_search) using `RandomSearchCV`.

For values that can span mulitiple orders of magnitude, 
we will want to sample using a [loguniform distribution](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.loguniform.html#scipy.stats.loguniform).
[This blog post](https://towardsdatascience.com/why-is-the-log-uniform-distribution-useful-for-hyperparameter-tuning-63c8d331698) does a good job of explaining why to use the log normal.


Also here is [an interesting blog post](https://towardsdatascience.com/comprehensive-guide-on-multiclass-classification-metrics-af94cfb83fbd) on multiclass-classification.

In [None]:
rf_model = RandomForestClassifier(n_estimators=500,
                                  criterion="entropy",
                                  max_leaf_nodes=None)


# ----- Parameter Distributions -----
param_dists = {"max_depth": distributions.randint(2, 30),
               "min_samples_split": LogUniformInt(min_val=2, max_val=50),
               # normally distributed max_features, with mean .25 stddev 0.1, bounded between 0 and 1
               "max_features":  distributions.truncnorm(a=0, b=1, loc=0.25, scale=0.1)}


f1_scorer = make_scorer(f1_score , average='weighted')
n_iter=100

clf = RandomizedSearchCV(rf_model,    
                         param_dists,
                         n_jobs=1,
                         n_iter=n_iter,
                         cv=5, # for classification defaults to StratefiedKFold
                         random_state=99,
                         scoring=f1_scorer,
                         verbose=10)


cv_model = clf.fit(X_train, y_train)


# get the best params
best_params = cv_model.best_estimator_.get_params()

2022/09/10 19:11:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'bb7592752abd49a4981b4a39ea28ca20', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5; 1/100] START max_depth=3, max_features=0.31639383705357266, min_samples_split=29
[CV 1/5; 1/100] END max_depth=3, max_features=0.31639383705357266, min_samples_split=29;, score=0.652 total time=  47.0s
[CV 2/5; 1/100] START max_depth=3, max_features=0.31639383705357266, min_samples_split=29
[CV 2/5; 1/100] END max_depth=3, max_features=0.31639383705357266, min_samples_split=29;, score=0.657 total time=  39.1s
[CV 3/5; 1/100] START max_depth=3, max_features=0.31639383705357266, min_samples_split=29
[CV 3/5; 1/100] END max_depth=3, max_features=0.31639383705357266, min_samples_split=29;, score=0.652 total time=  44.9s
[CV 4/5; 1/100] START max_depth=3, max_features=0.31639383705357266, min_samples_split=29
[CV 4/5; 1/100] END max_depth=3, max_features=0.31639383705357266, min_samples_split=29;, score=0.642 total time=  48.0s
[CV 5/5; 1/100] START max_depth=3, max_features=0.31639383705357266, min_samples_split=29
[CV

[CV 5/5; 8/100] END max_depth=17, max_features=0.2580183485058877, min_samples_split=27;, score=0.742 total time= 2.2min
[CV 1/5; 9/100] START max_depth=6, max_features=0.2857806489039908, min_samples_split=21
[CV 1/5; 9/100] END max_depth=6, max_features=0.2857806489039908, min_samples_split=21;, score=0.701 total time= 1.1min
[CV 2/5; 9/100] START max_depth=6, max_features=0.2857806489039908, min_samples_split=21
[CV 2/5; 9/100] END max_depth=6, max_features=0.2857806489039908, min_samples_split=21;, score=0.699 total time= 1.2min
[CV 3/5; 9/100] START max_depth=6, max_features=0.2857806489039908, min_samples_split=21
[CV 3/5; 9/100] END max_depth=6, max_features=0.2857806489039908, min_samples_split=21;, score=0.698 total time=47.8min
[CV 4/5; 9/100] START max_depth=6, max_features=0.2857806489039908, min_samples_split=21
[CV 4/5; 9/100] END max_depth=6, max_features=0.2857806489039908, min_samples_split=21;, score=0.690 total time=50.1min
[CV 5/5; 9/100] START max_depth=6, max_feat

#### View Feature Sensativities

TODO: update to read in historical data using MLFlow

In [None]:
results_df = pd.DataFrame(cv_model.cv_results_["params"])
results_df["mean_test_score"] = cv_model.cv_results_["mean_test_score"]

In [None]:
results_df.sort_values(["mean_test_score"], ascending=False).head(20)

__Generate predictions__: on our dev set, so that we can compare to other models

In [None]:
best_rfc_model = RandomForestClassifier(**best_params)


with mlflow.start_run(experiment_id=1) as run:
    best_rfc_model.fit(X_train, y_train)
    mlflow.sklearn.eval_and_log_metrics(model=best_rfc_model,
                                        X=X_dev_clean,
                                        y_true=y_dev,
                                        prefix="dev_")
    y_pred = best_rfc_model.predict(X_dev)
    
    
    mlflow.sklearn.save_model(best_rfc_model, path="../artifacts/rf_best2/")