In [None]:
#import
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

#partitioning & grid
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, KFold
from scipy.stats import uniform
from sklearn.metrics import make_scorer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, mutual_info_regression, f_regression, r_regression

#models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#evaluation
from scikitplot.metrics import (
    plot_roc, 
    plot_precision_recall, 
    plot_cumulative_gain, 
    plot_lift_curve,
)


In [None]:
# Load the training split: space-separated values, no header row
# (columns get integer labels), whitespace after each delimiter is skipped.
df_train = pd.read_csv(
    'monks_3_train',
    sep=' ',
    header=None,
    skipinitialspace=True,
)
df_train.head(10)

In [None]:
# Remove column 7, which is not used as a feature or target below
# (presumably a row identifier -- TODO confirm against the data file).
# Rebinding instead of `inplace=True`, together with errors="ignore",
# keeps this cell safe to re-run: a second execution is a no-op instead
# of raising KeyError.
df_train = df_train.drop(columns=7, errors="ignore")
df_train.head()

In [None]:
# Load the test split with the same parsing options as the training
# split and discard column 7 in the same expression.
df_test = pd.read_csv(
    'monks_3_test',
    sep=' ',
    header=None,
    skipinitialspace=True,
).drop(columns=7)
df_test.head(10)

In [None]:
# Inspect the column dtypes of the training frame before encoding.
df_train.dtypes

In [None]:
# One-hot encode every column except the first one; column 0 is left
# untouched and is read back later as the target.
# NOTE: this cell rebinds df_train, so it must be run exactly once after
# the load/drop cells.
df_train = pd.get_dummies(
    df_train,
    columns=df_train.columns[1:],
)
df_train.head(10)

In [None]:
# Feature matrix: positional columns 1 through 17 of the encoded
# training frame, converted to a NumPy array.
X_train = df_train.iloc[:, 1:18].to_numpy()
X_train

In [None]:
# Sanity check: (number of training rows, number of encoded features).
X_train.shape

In [None]:
# Target vector: the first (un-encoded) column of the training frame.
y_train = df_train.iloc[:, 0].to_numpy()
y_train

In [None]:
# One-hot encode the six attribute columns of the test frame.
df_test = pd.get_dummies(data=df_test, columns=range(1,7))
# The feature matrices are sliced by position, so the test frame must
# expose exactly the same columns, in the same order, as the encoded
# training frame. Align explicitly: a category missing from the test
# split becomes an all-zero column, and a category never seen in training
# is dropped. When both splits already agree this is a no-op.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)
df_test.head()

In [None]:
# Feature matrix for the test split: same positional slice as X_train.
X_test = df_test.iloc[:, 1:18].to_numpy()
X_test

In [None]:
# Target vector for the test split: first column of the test frame.
y_test = df_test.iloc[:, 0].to_numpy()
y_test

# Grid search

## Decision Tree

In [None]:
# Base decision tree used for the pruning path and the grid search.
# scikit-learn permutes the features at every split, so ties between
# equally good splits are broken randomly; a fixed seed (same value as
# the CV splitter) makes the fitted trees and scores reproducible.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
def get_alphas(y_train, vals=50, X=None, estimator=None):
    """Build an evenly spaced grid of ``ccp_alpha`` candidates.

    Computes the cost-complexity pruning path on the given data and returns
    ``vals`` values linearly spaced between the smallest and the largest
    alpha found on that path.

    Parameters
    ----------
    y_train : array-like
        Target values passed to ``cost_complexity_pruning_path``.
    vals : int, default=50
        Number of alpha values to generate.
    X : array-like, optional
        Feature matrix. When omitted, the notebook-level ``X_train`` is used
        (the original behaviour).
    estimator : object, optional
        Any object exposing ``cost_complexity_pruning_path(X, y)``. When
        omitted, the notebook-level ``dt`` is used (the original behaviour).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vals``.
    """
    if X is None:
        X = X_train
    if estimator is None:
        estimator = dt

    path = estimator.cost_complexity_pruning_path(X, y_train)
    alphas = np.asarray(path.ccp_alphas)

    # ccp_alpha must be non-negative for the tree estimators; clamp the lower
    # bound defensively in case round-off on the path produces a tiny
    # negative value.
    lowest = max(float(alphas.min()), 0.0)
    highest = float(alphas.max())
    return np.linspace(lowest, highest, vals)

# Candidate pruning strengths for the grid search (50 evenly spaced values).
ccp_alphas = get_alphas(y_train, vals=50)
ccp_alphas

In [None]:
# Search space for the decision tree: split criterion, pruning strength
# (from the pruning path computed above) and maximum depth 4..9.
param_grid = {
    "criterion": ['gini', 'entropy'],
    "ccp_alpha": ccp_alphas,
    "max_depth": list(range(4, 10)),
}

In [None]:
# Exhaustive search over `param_grid` with 5-fold stratified, shuffled CV
# (fixed seed so the folds are the same on every run).
grid = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
)

# Fit every candidate, then report the winning tree and its mean CV score.
grid.fit(X_train, y_train)
print(grid.best_estimator_, grid.best_score_)

In [None]:
# Keep a handle on the winning tree: `grid` is rebound further down by the
# random-forest search, so the result has to be saved under its own name.
best_dt = grid.best_estimator_

In [None]:
# Winning hyper-parameters and the corresponding mean cross-validated score.
best_params = grid.best_params_
best_score = grid.best_score_
best_params, best_score

In [None]:
# Evaluate the selected tree on the held-out test split
# (a classifier's `.score` reports mean accuracy).
best_dt.score(X_test, y_test)

## Random Forest

In [None]:
# Base random forest for the grid search. Bootstrapping and per-split
# feature sub-sampling are random, so fix the seed (same value as the CV
# splitter) to make the search results reproducible across runs.
rf = RandomForestClassifier(random_state=42)

In [None]:
# Search space for the random forest.
# 'auto' is intentionally not listed for max_features: for classifiers it
# was only an alias of 'sqrt' (so it duplicated work), and recent
# scikit-learn releases reject it, which makes every fit using it fail.
param_list = {'n_estimators': [15,18,20],            # number of trees in the forest
              'max_depth': np.arange(4, 10, 1),           # max depth of the tree
              'max_features': ['sqrt', 'log2'],                   # number of features to consider at each split
              'min_samples_split': np.arange(6, 10, 1),          # samples required to split a node
              'min_samples_leaf': np.arange(2, 8, 2),           # samples required at leaf nodes
              'bootstrap': [True, False]                                  # whether each tree is trained on a bootstrap sample
             }

In [None]:
# Exhaustive search over `param_list` for the random forest, using the
# same 5-fold stratified, shuffled CV as the decision-tree search.
# This rebinds `grid`, replacing the decision-tree search object.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_list,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    refit=True,
)

In [None]:
# Run the random-forest search: one fit per parameter combination and CV
# fold, followed by a refit of the best combination on the full training set.
grid.fit(X_train, y_train)

In [None]:
# Best forest found by the search (already refit, since refit=True).
best_rf = grid.best_estimator_

In [None]:
# Winning random-forest hyper-parameters and their mean cross-validated score.
best_params_rf = grid.best_params_
best_score_rf = grid.best_score_
best_params_rf, best_score_rf

In [None]:
# Evaluate the selected forest on the held-out test split
# (a classifier's `.score` reports mean accuracy).
best_rf.score(X_test, y_test)