## Develop Classification Models

In [1]:
import pandas as pd
from scipy.stats import distributions
import pickle
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline


import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", None)
%config Completer.use_jedi = False

Helper Functions

In [2]:
class LogUniformInt:
    
    def __init__(self, min_val, max_val):
        self.min_val = min_val
        self.max_val = max_val 
        
    def rvs(self, random_state=None):
        """
        rvs method that is needed by RandomSearchCV
        """
        
        # call the loguniform distribution that is built into scipy
        lu = distributions.loguniform(self.min_val, self.max_val)
        
        # convert outut to integer
        rand_int = int(lu.rvs(random_state=random_state))
        
        return rand_int

#### Read in and standardize data

In [3]:
input_path = "../input/credit-score-prediction-project/"

# get training data
X_train_df = pd.read_csv(input_path + "train_cleaned_imputed.csv")
y_train_df = X_train_df.pop("Credit_Score")

y_train = y_train_df.values
X_train = X_train_df.values

# get dev data
X_dev_df = pd.read_csv(input_path + "dev_cleaned_imputed.csv")
y_dev_df = X_dev_df.pop("Credit_Score")

y_dev = y_dev_df.values
X_dev = X_dev_df.values

Train Model

According to the `RandomSearchCV` [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html), our param_distributions are lists that will be sampled uniformly or distributions with a `rvs` method for sampling (such as those from scipy.stats.distributions).


[Example project](https://jamesrledoux.com/code/randomized_parameter_search) using `RandomSearchCV`.

For values that can span mulitiple orders of magnitude, 
we will want to sample using a [loguniform distribution](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.loguniform.html#scipy.stats.loguniform).
[This blog post](https://towardsdatascience.com/why-is-the-log-uniform-distribution-useful-for-hyperparameter-tuning-63c8d331698) does a good job of explaining why to use the log normal.


Also here is [an interesting blog post](https://towardsdatascience.com/comprehensive-guide-on-multiclass-classification-metrics-af94cfb83fbd) on multiclass-classification.

In [4]:
start = time.time()
rf_model = RandomForestClassifier(n_estimators=500,
                                  criterion="entropy",
                                  max_leaf_nodes=None)


# ----- Parameter Distributions -----
param_dists = {"max_depth": distributions.randint(2, 30),
               "min_samples_split": LogUniformInt(min_val=2, max_val=50),
               # normally distributed max_features, with mean .25 stddev 0.1, bounded between 0 and 1
               "max_features":  distributions.truncnorm(a=0, b=1, loc=0.25, scale=0.1)}


f1_scorer = make_scorer(f1_score , average='weighted')
n_iter=50

clf = RandomizedSearchCV(rf_model,    
                         param_dists,
                         n_jobs=-1,
                         n_iter=n_iter,
                         cv=5, # for classification defaults to StratefiedKFold
                         random_state=99,
                         scoring=f1_scorer,
                         verbose=10)


cv_model = clf.fit(X_train, y_train)


# get the best params
best_params = cv_model.best_estimator_.get_params()


total_time_minutes = (time.time() - start)/60
print("total_time_minutes: {}".format(total_time_minutes))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 3/5; 1/50] START max_depth=3, max_features=0.3163938370535726, min_samples_split=29
[CV 3/5; 1/50] END max_depth=3, max_features=0.3163938370535726, min_samples_split=29;, score=0.653 total time= 1.3min
[CV 5/5; 1/50] START max_depth=3, max_features=0.3163938370535726, min_samples_split=29
[CV 5/5; 1/50] END max_depth=3, max_features=0.3163938370535726, min_samples_split=29;, score=0.648 total time= 1.3min
[CV 4/5; 2/50] START max_depth=10, max_features=0.25269095435762046, min_samples_split=26
[CV 4/5; 2/50] END max_depth=10, max_features=0.25269095435762046, min_samples_split=26;, score=0.707 total time= 2.9min
[CV 3/5; 3/50] START max_depth=3, max_features=0.3468540008034445, min_samples_split=12
[CV 3/5; 3/50] END max_depth=3, max_features=0.3468540008034445, min_samples_split=12;, score=0.652 total time= 1.5min
[CV 1/5; 4/50] START max_depth=19, max_features=0.34868644709620894, min_samples_split=2
[CV 1/5; 4/50] EN



[CV 1/5; 1/50] START max_depth=3, max_features=0.3163938370535726, min_samples_split=29
[CV 1/5; 1/50] END max_depth=3, max_features=0.3163938370535726, min_samples_split=29;, score=0.653 total time= 1.4min
[CV 2/5; 2/50] START max_depth=10, max_features=0.25269095435762046, min_samples_split=26
[CV 2/5; 2/50] END max_depth=10, max_features=0.25269095435762046, min_samples_split=26;, score=0.716 total time= 2.9min
[CV 2/5; 3/50] START max_depth=3, max_features=0.3468540008034445, min_samples_split=12
[CV 2/5; 3/50] END max_depth=3, max_features=0.3468540008034445, min_samples_split=12;, score=0.656 total time= 1.5min
[CV 4/5; 3/50] START max_depth=3, max_features=0.3468540008034445, min_samples_split=12
[CV 4/5; 3/50] END max_depth=3, max_features=0.3468540008034445, min_samples_split=12;, score=0.643 total time= 1.5min
[CV 3/5; 4/50] START max_depth=19, max_features=0.34868644709620894, min_samples_split=2
[CV 3/5; 4/50] END max_depth=19, max_features=0.34868644709620894, min_samples_

#### Save model

In [5]:
model_file_name="rf_cv_model.pickle"
with open(model_file_name, "wb") as f:
    pickle.dump(cv_model, f)

#### Print best models

In [6]:
results_df = pd.DataFrame(cv_model.cv_results_["params"])
results_df["mean_test_score"] = cv_model.cv_results_["mean_test_score"]
results_df.sort_values(["mean_test_score"], ascending=False).head(20)

Unnamed: 0,max_depth,max_features,min_samples_split,mean_test_score
25,29,0.307486,3,0.782412
11,29,0.285985,2,0.781109
19,29,0.291014,4,0.780655
6,27,0.281352,5,0.779277
45,25,0.256463,4,0.778246
31,28,0.291367,11,0.774764
49,21,0.312913,4,0.772523
34,24,0.29219,11,0.771835
12,23,0.256885,11,0.770145
33,20,0.270268,2,0.768541


#### Compute Score of best model on dev set

In [7]:
full_data_rf_model = RandomForestClassifier(**clf.best_estimator_.get_params())
full_data_rf_model.fit(X_train, y_train)
y_pred = full_data_rf_model.predict(X_dev)

# computing score on dev set will help figure out if we 
# are overfitting
dev_score = f1_score(y_dev, y_pred, average="weighted")
print(dev_score)


# save the full model
model_file_name="rf_full_model.pickle"
with open(model_file_name, "wb") as f:
    pickle.dump(full_data_rf_model, f)

0.7643016396752815
