### **Road to Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.calibration import calibration_curve
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot


1. Data Preprocessing (Label Encoding)

In [None]:
# Label-encode the non-numeric columns: every column whose dtype is not
# int64/float64 is overwritten in `df` with the integer codes produced by
# LabelEncoder.
# Note: this is label encoding, not standardization -- nothing here rescales
# a column to zero mean / unit variance.
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
categorical_col = df.select_dtypes(exclude=['int64', 'float64'])
for column in categorical_col.columns:
    # The same encoder object is re-fitted on each column, so once the loop
    # finishes it only remembers the classes of the last column processed.
    df[column] = label.fit_transform(df[column])

2. Defining Algorithm

In [None]:
def _summarize_predictions(split_name, y_true, pred):
    """Print accuracy, classification report and confusion matrix for one split, then plot the matrix."""
    clf_report = pd.DataFrame(classification_report(y_true, pred, output_dict=True))
    # Compute the matrix once and reuse it for both the text output and the plot.
    cm = confusion_matrix(y_true, pred)
    is_binary = cm.shape == (2, 2)

    print(f"{split_name} Result:\n================================================")
    print(f"Accuracy Score: {accuracy_score(y_true, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{clf_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {cm}\n")
    if is_binary:
        # For a 2x2 matrix, ravel() flattens row by row: tn, fp, fn, tp.
        tn, fp, fn, tp = cm.ravel()
        print(f"tn, fp, fn, tp: {tn}, {fp}, {fn}, {tp}\n")

    # confusion_matrix orders classes by sorted label value, so the first
    # row/column is the smaller label. Naming it 'negative' assumes the usual
    # 0 = negative / 1 = positive encoding -- TODO confirm against the target.
    display_labels = ('negative', 'positive') if is_binary else None
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
    disp.plot(include_values=True, cmap='Blues', ax=None, xticks_rotation='horizontal')


def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    """Report classification quality of a fitted classifier on one data split.

    Prints the accuracy, the classification report (as a DataFrame) and the
    confusion matrix, and draws the confusion matrix as a heat map. For a
    binary problem the flattened ``tn, fp, fn, tp`` counts are printed too.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    train : bool, default True
        If truthy, evaluate on the training split; otherwise on the test split.

    Returns
    -------
    None
    """
    if train:
        _summarize_predictions("Train", y_train, clf.predict(X_train))
    else:
        _summarize_predictions("Test", y_test, clf.predict(X_test))

3. Hyperparameter Tuning

In [None]:

# Candidate values for the random-forest grid search.
n_estimators = [100, 500, 1000, 1500]
# 'auto' was dropped from this list: for RandomForestClassifier it was an
# alias of 'sqrt' (so it only duplicated every grid point) and it is rejected
# by scikit-learn >= 1.3. Add 'log2' or None here to search other settings.
max_features = ['sqrt']
# None lets each tree grow until the other stopping rules apply.
max_depth = [2, 3, 5, None]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4, 10]
bootstrap = [True, False]

params_grid = {'n_estimators': n_estimators, 'max_features': max_features,
               'max_depth': max_depth, 'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf, 'bootstrap': bootstrap}

rf_clf = RandomForestClassifier(random_state=42)

# Exhaustive search over params_grid, 3-fold CV, ranked by F1, using all cores.
rf_cv = GridSearchCV(rf_clf, params_grid, scoring="f1", cv=3, verbose=2, n_jobs=-1)
rf_cv.fit(X_train, y_train)

best_params = rf_cv.best_params_
print(f"Best parameters: {best_params}")

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True). Reusing that estimator keeps random_state=42 -- so the
# evaluated model is the one that was cross-validated -- and avoids training
# the forest a second time.
rf_clf = rf_cv.best_estimator_

print_score(rf_clf, X_train, y_train, X_test, y_test, train=True)
print_score(rf_clf, X_train, y_train, X_test, y_test, train=False)

4. Isotonic Calibration

In [None]:
def uncalibrated(X_train, X_test, y_train, n_estimators=100, random_state=42):
    """Fit a plain random forest and return its raw probabilities on X_test.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test : features to score.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed for the forest so repeated runs give the same probabilities;
        pass None for an unseeded forest.

    Returns
    -------
    Column 1 of ``predict_proba(X_test)``, i.e. the probability of the second
    class in ``model.classes_`` (the positive class for 0/1 labels).
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    # predict probabilities
    return model.predict_proba(X_test)[:, 1]
 
def calibrated(X_train, X_test, y_train, n_estimators=100, random_state=42):
    """Fit an isotonic-calibrated random forest and return its probabilities on X_test.

    The forest is wrapped in ``CalibratedClassifierCV(method='isotonic')``
    with that class's default cross-validation setting.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test : features to score.
    n_estimators : int, default 100
        Number of trees in the underlying forest.
    random_state : int or None, default 42
        Seed for the underlying forest so repeated runs give the same
        probabilities; pass None for an unseeded forest.

    Returns
    -------
    Column 1 of ``predict_proba(X_test)``, i.e. the calibrated probability of
    the second class (the positive class for 0/1 labels).
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    # Named calibrated_model so the local does not shadow this function's name.
    calibrated_model = CalibratedClassifierCV(model, method='isotonic')
    calibrated_model.fit(X_train, y_train)
    # predict probabilities
    return calibrated_model.predict_proba(X_test)[:, 1]

In [None]:
# Positive-class probabilities from the raw and the isotonic-calibrated forests.
yhat_uncalibrated = uncalibrated(X_train, X_test, y_train)
yhat_calibrated = calibrated(X_train, X_test, y_train)

# Reliability diagrams: per bin, calibration_curve returns the fraction of
# positives (fop) and the mean predicted probability (mpv), in that order.
fop_uncalibrated, mpv_uncalibrated = calibration_curve(y_test, yhat_uncalibrated, n_bins=10)
fop_calibrated, mpv_calibrated = calibration_curve(y_test, yhat_calibrated, n_bins=10)

# Diagonal = perfectly calibrated model.
pyplot.plot([0, 1], [0, 1], linestyle='--', color='black')

# Model reliabilities: x is the mean predicted probability, y is the observed
# fraction of positives, so the axis labels below follow that order.
pyplot.plot(mpv_uncalibrated, fop_uncalibrated, marker='.', label='Random Forest')
pyplot.plot(mpv_calibrated, fop_calibrated, marker='.', label='Isotonic Calibration')
pyplot.xlabel('Predicted probability of positive class')
pyplot.ylabel('Fraction belonging to positive class')
pyplot.legend()
pyplot.show()