# Nice Confusion Matrix Plot

In [None]:
# Hard-coded 2x2 confusion matrix; laid out row-major as
# [[TN, FP], [FN, TP]] to match the annotation names below.
conf_matrix = np.array([[24489, 7681], [3260, 10735]])

# Each heatmap cell is annotated with three stacked lines:
# the cell name, the absolute count, and the share of all samples.
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
group_counts = [format(value, "0.0f") for value in conf_matrix.flatten()]
group_percentages = [
    format(value, ".2%")
    for value in conf_matrix.flatten() / conf_matrix.sum()
]
labels = np.asarray(
    ["\n".join(parts)
     for parts in zip(group_names, group_counts, group_percentages)]
).reshape(2, 2)

# Global seaborn font scaling, then draw the annotated heatmap.
# fmt='' is required because the annotations are pre-formatted strings.
sns.set(font_scale=1.6)
fig, ax = plt.subplots(figsize=(8, 7))
sns.heatmap(
    conf_matrix,
    annot=labels,
    fmt='',
    cmap='Blues',
    annot_kws={"size": 16},
)

# Title and axis labels all use font size 16.
plt.title('Confusion Matrix: Random Forest (optimized)', fontsize=16)
plt.xlabel('Predicted', fontsize=16)
plt.ylabel('Actual', fontsize=16);
# Uncomment to write the figure to disk:
#plt.savefig('plots/confusion_matrix_RF_optimized.png',dpi=300)

# Gridsearch Pipeline

In [214]:
def pipeline_optimization_gridsearch(X, y, balance=None):
    """Grid-search a random-forest pipeline and collect train/test metrics.

    The data is split into stratified train and test sets, the training set
    is optionally rebalanced, and a preprocessing + classifier pipeline is
    tuned with ``GridSearchCV`` (3-fold CV, ``scoring='f1'``). The best
    estimator found is then used to predict on both splits, and the
    predictions are handed to ``train_predict`` to compute the metrics.

    Relies on two names defined elsewhere in the file: the module-level
    ``random_state`` seed and the ``train_predict`` helper.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns with dtype ``object`` are treated as
        categorical, every other column as numeric.
    y : array-like
        Target labels; also used to stratify the train/test split.
    balance : {'over', 'under', None}, optional
        ``'over'`` randomly oversamples the minority class of the training
        set, ``'under'`` randomly undersamples the majority class. Any other
        value leaves the training set unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per entry in the internal ``models`` dict, built from the
        value returned by ``train_predict``.
    """
    # Divide features by dtype
    categoric_features = list(X.columns[X.dtypes == object])
    numeric_features = list(X.columns[X.dtypes != object])

    # Split train and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, stratify=y)

    # Optional rebalancing of the training set only (test set stays as is).
    # The samplers are seeded so that repeated runs are reproducible, in
    # line with the seeded split and classifier.
    # NOTE(review): resampling happens before cross-validation, so with
    # oversampling duplicated rows can end up in both the fit and the
    # validation folds; best_score_ may therefore be optimistic.
    if balance == 'over':
        print('Oversampling')
        oversample = RandomOverSampler(sampling_strategy='minority',
                                       random_state=random_state)
        X_train, y_train = oversample.fit_resample(X_train, y_train)
    elif balance == 'under':
        print('Undersampling')
        undersample = RandomUnderSampler(sampling_strategy='majority',
                                         random_state=random_state)
        X_train, y_train = undersample.fit_resample(X_train, y_train)

    # Hyperparameter grid. Keys are prefixed with the pipeline step name
    # 'randomforest', so this grid only fits the random-forest model.
    param_randomforest = {
        'randomforest__criterion': ['gini', 'entropy'],
        'randomforest__n_estimators': [360, 380, 400, 420, 440],
        'randomforest__max_depth': [32, 35, 37],
        # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
        # for classifiers it meant 'sqrt', which works on every version.
        'randomforest__max_features': ['sqrt'],
        'randomforest__bootstrap': [True],
        'randomforest__min_samples_leaf': [1, 2],
        'randomforest__min_samples_split': [3, 4, 5, 6, 7],
    }

    models = {
        #'KNN' : KNeighborsClassifier(n_neighbors=5, metric='euclidean',n_jobs=-1),
        #'SVC' : LinearSVC(),
        #'logreg': LogisticRegression(random_state=random_state,n_jobs=-1),
        #'decisiontree': DecisionTreeClassifier(random_state=random_state,max_depth=10),
        'randomforest': RandomForestClassifier(
            random_state=random_state, n_jobs=-1, n_estimators=300),
        #'XGBoost' : XGBClassifier(random_state=random_state, n_jobs=-1),
        #'adaBoost' : AdaBoostClassifier(random_state=random_state)
    }

    # Create preprocessors: median-impute + min-max scale numeric columns,
    # constant-impute + one-hot encode categorical columns.
    numeric_transformer = Pipeline(steps=[
        ('imputer_num', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler()),
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categoric_features),
        ])

    model_results = pd.DataFrame(columns=[
        'model', 'acc_train', 'acc_test', 'f1_train', 'f1_test',
        'recall_train', 'recall_test', 'precision_train', 'precision_test'])

    # Run the full pipeline + grid search for every configured model
    for name, estimator in models.items():
        print(name)
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               (name, estimator)])

        grid_randomforest = GridSearchCV(pipe, param_grid=param_randomforest,
                                         cv=3, scoring='f1', n_jobs=-1,
                                         verbose=5)
        grid_randomforest.fit(X_train, y_train)

        # Show best parameters
        print('Best score:\n{:.2f}'.format(grid_randomforest.best_score_))
        print("Best parameters:\n{}".format(grid_randomforest.best_params_))

        # With the default refit=True the best pipeline has already been
        # refitted on the whole training set, so it can predict directly.
        best_model = grid_randomforest.best_estimator_
        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)

        results = train_predict(name, y_train, y_test,
                                y_train_pred, y_test_pred)

        model_results = pd.concat(
            [model_results, pd.DataFrame(results, index=[0])])

        print("\nConfusion matrix on test")
        print(confusion_matrix(y_test, y_test_pred))
        print("\n")
    return model_results