In [None]:
def _rebalance_training_set(X_train, y_train, balance):
    """Resample the training split according to ``balance``.

    ``'over'`` randomly oversamples the minority class, ``'under'`` randomly
    undersamples the majority class, and ``None`` returns the inputs unchanged.
    Both samplers are seeded with the module-level ``random_state`` so repeated
    runs draw the same rows.
    """
    if balance == 'over':
        print('Oversampling')
        sampler = RandomOverSampler(sampling_strategy='minority',
                                    random_state=random_state)
    elif balance == 'under':
        print('Undersampling')
        sampler = RandomUnderSampler(sampling_strategy='majority',
                                     random_state=random_state)
    else:
        return X_train, y_train
    return sampler.fit_resample(X_train, y_train)


def _build_preprocessor(numeric_features, categoric_features):
    """Build the ColumnTransformer that imputes/scales numeric columns and
    imputes/one-hot-encodes categorical columns."""
    numeric_transformer = Pipeline(steps=[
        ('imputer_num', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler()),
    ])
    categorical_transformer = Pipeline(steps=[
        ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
        # Categories first seen at predict time are encoded as all zeros
        # instead of raising.
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])
    return ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categoric_features),
    ])


def pipeline_optimization(X, y, balance=None):
    """Tune a random-forest pipeline with a randomized search and score it.

    The data is split into a stratified train/test set, the training split is
    optionally rebalanced, and a preprocessing + classifier pipeline is tuned
    with ``RandomizedSearchCV`` (3-fold CV, 100 candidates, scored on recall).
    The search's best score/parameters and the test confusion matrix are
    printed along the way.

    Reads the module-level ``random_state`` and calls the ``train_predict``
    helper, both defined outside this function.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns with ``object`` dtype are treated as
        categorical, every other column as numeric.
    y : array-like
        Target labels; used for stratifying the split.
    balance : {None, 'over', 'under'}, default None
        Resampling applied to the training split only.

    Returns
    -------
    pandas.DataFrame
        One row per tuned model, built from what ``train_predict`` returns
        (presumably a dict of scalar metrics -- confirm against that helper).
        The standard metric columns come first; any extra keys follow.

    Raises
    ------
    ValueError
        If ``balance`` is not one of ``None``, ``'over'`` or ``'under'``.
    """
    # Fail fast on typos such as 'Under'; previously these silently ran the
    # search without any resampling.
    if balance not in (None, 'over', 'under'):
        raise ValueError(
            "balance must be None, 'over' or 'under', got {!r}".format(balance)
        )

    # Divide features by dtype
    categoric_features = list(X.columns[X.dtypes == object])
    numeric_features = list(X.columns[X.dtypes != object])

    # Split train and test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, stratify=y
    )

    # NOTE(review): resampling happens before cross-validation, so with
    # oversampling duplicated rows can land in both the fit and validation
    # folds and inflate best_score_. Moving the sampler into an
    # imblearn Pipeline would avoid this -- confirm whether that is wanted.
    X_train, y_train = _rebalance_training_set(X_train, y_train, balance)

    # Hyperparameter grid. Keys are prefixed with the 'randomforest' step
    # name, so any additional model added to `models` needs its own grid.
    param_randomforest = {
        'randomforest__n_estimators': [100, 200, 300, 400, 500, 1000, 2000],
        'randomforest__max_depth': [None] + list(range(5, 100, 5)),
        # 'auto' was an alias of 'sqrt' for classifiers and is rejected by
        # recent scikit-learn releases, so it is no longer sampled.
        'randomforest__max_features': ['sqrt', None],
        'randomforest__bootstrap': [True, False],
        'randomforest__min_samples_leaf': [1, 2, 4],
        'randomforest__min_samples_split': [2, 5, 10],
    }

    models = {
        'randomforest': RandomForestClassifier(
            random_state=random_state, n_jobs=-1, n_estimators=300
        ),
    }

    preprocessor = _build_preprocessor(numeric_features, categoric_features)

    result_columns = ['model', 'acc_train', 'acc_test', 'f1_train', 'f1_test',
                      'recall_train', 'recall_test',
                      'precision_train', 'precision_test']
    result_frames = []

    # Process pipeline for every model
    for name, estimator in models.items():
        print(name)
        pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               (name, estimator)])

        search = RandomizedSearchCV(
            pipe, param_randomforest, cv=3, scoring='recall',
            verbose=5, n_jobs=-1, n_iter=100,
            # Seeded so the same candidates are drawn on every run.
            random_state=random_state,
        )
        search.fit(X_train, y_train)

        # Show best parameters
        print('Best score:\n{:.2f}'.format(search.best_score_))
        print("Best parameters:\n{}".format(search.best_params_))

        # predict() delegates to the best pipeline, which the search refits
        # on the whole training split by default.
        y_train_pred = search.predict(X_train)
        y_test_pred = search.predict(X_test)

        results = train_predict(name, y_train, y_test, y_train_pred, y_test_pred)
        result_frames.append(pd.DataFrame(results, index=[0]))

        print("\nConfusion matrix on test")
        print(confusion_matrix(y_test, y_test_pred))
        print("\n")

    if not result_frames:
        return pd.DataFrame(columns=result_columns)

    # Concatenate once with a fresh index instead of repeatedly appending to
    # an empty frame (which produced duplicate index labels).
    model_results = pd.concat(result_frames, ignore_index=True)
    extra_columns = [c for c in model_results.columns if c not in result_columns]
    return model_results.reindex(columns=result_columns + extra_columns)

In [None]:
# Run the randomized hyperparameter search with the majority class
# undersampled in the training split, and keep the resulting metrics table.
RandomizedSearch_randomforst = pipeline_optimization(
    X,
    y,
    balance='under',
)

In [None]:
# Bare expression: in a notebook cell this renders the metrics table
# returned by pipeline_optimization.
RandomizedSearch_randomforst