# PREPROCESSING PIPELINE

In [2]:
# Standard library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Sklearn models
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Sklearn preprocessing and pipeline
from sklearn.preprocessing import (OneHotEncoder, QuantileTransformer, FunctionTransformer)
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Sklearn utilities
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn import set_config
set_config(transform_output='pandas')

In [3]:
# Load data
train_data=pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

X_train = train_data.drop('target', axis=1)
y_train = train_data['target']

In [4]:
# Create a function to treat the date and id variables and turn it into an sklearn transformer

def date_and_id(data):
    
    # ensure 'Connect_Date' is of the correct type and correct for Y2K
    date_df = pd.DataFrame()
    date_df['Connect_Date'] = pd.to_datetime(data['Connect_Date'], format='%d/%m/%y')
    date_df['Connect_Date'] = date_df['Connect_Date'].apply(lambda x: x - pd.DateOffset(years=100) if x.year > 2022 else x)
    
    # Create features out of the "Connect_Date" variable
    reference_date = pd.to_datetime('2023-01-01')
    date_df['Account_Age_days'] = (reference_date - date_df['Connect_Date']).dt.days
    date_df['Connect_Month'] = date_df['Connect_Date'].dt.month
    date_df['Connect_Quarter'] = date_df['Connect_Date'].dt.quarter
    date_df['Connect_Year'] = date_df['Connect_Date'].dt.year

    # Put the new features back to the dataset and remove the 'id' column
    date_df=date_df.drop('Connect_Date', axis=1)
    res = pd.concat([data.drop('Connect_Date', axis=1), date_df], axis=1)
    res = res.drop('id', axis=1)
    return res

date_id_preprocessor = FunctionTransformer(date_and_id)

In [5]:
# Transformer to impute missing values in certain variables

numeric_missing = ['Dropped_calls_ratio', 'call_cost_per_min']
categorical_missing = ['Usage_Band']

imputer = ColumnTransformer(transformers=[
    ('num_na', SimpleImputer(strategy='median'), numeric_missing),
    ('cat_na', SimpleImputer(strategy='most_frequent'), categorical_missing),
], remainder='passthrough', verbose_feature_names_out=False)

In [6]:
# Transformer to one-hot encode categorical data and quantile-transform numeric data

numeric_columns = list(train_data.drop('target', axis=1)._get_numeric_data().columns)
categorical_columns = list(set(train_data.columns) - set(numeric_columns))
categorical_columns = [i for i in categorical_columns if i not in ['Connect_Date','id', 'target']]

transformer = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'), categorical_columns),
    ('quant', QuantileTransformer(output_distribution = 'normal', random_state = 0), numeric_columns)
], remainder='passthrough', verbose_feature_names_out=False)

In [7]:
# Custom transformer to remove correlated features

class RemoveCorrelatedFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, threshold=0.95):
        self.threshold = threshold
        self.correlated_features = None

    def fit(self, X, y=None):
        # Calculate the correlation matrix
        corr_matrix = X.corr().abs()
        # Select upper triangle of correlation matrix
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        # Find index of feature columns with correlation greater than the threshold
        self.correlated_features = [column for column in upper.columns if any(upper[column] > self.threshold)]
        return self

    def transform(self, X):
        # Drop highly correlated features
        X = X.drop(columns=self.correlated_features, errors='ignore')
        return X

In [8]:
# The preprocessing-classifier pipeline

p = Pipeline(steps=[
    ('dtid', date_id_preprocessor),
    ('impu', imputer),
    ('trfm', transformer),
    ('rffs', SelectFromModel(RandomForestClassifier(random_state=0), threshold=0.01)),
    ('corr', RemoveCorrelatedFeatures(threshold=0.7)),
    ('classifier', LogisticRegression())
])

# MODEL FITTER

In [9]:
# Define custom metric (profit at top 20 prediction)

def profit_top_20(y_true, y_pred):
    dft = y_true.to_frame()
    dft['y_pred'] = y_pred
    dft = dft.sort_values('y_pred', ascending=False).head(20)
    dft = dft[dft['target'] == 1]
    profit = train_data['average cost min'][dft.index].sum()
    return profit

In [101]:
# GridSearchCV does exhaustive search on all model/param combination, expect it to take some time

# Object for managing multiple searches

class Search():
    
    def __init__(self):
        self.runs = dict()
    
    def run(self, run_name, param_grid, X_train, y_train, estimator=p, cv=5):

        # Use GridSearchCV to search for the best classifier and classifier parameters
        grid_search = GridSearchCV(estimator, param_grid, cv, scoring=make_scorer(profit_top_20, response_method="predict_proba", greater_is_better=True), n_jobs=-1, verbose=2)
        grid_search.fit(X_train, y_train)
        self.runs[run_name] = grid_search
        print ("see results dataframe by accessing the object's .result(run_name)")

    def result(self, run_name):
        return pd.DataFrame(self.runs[run_name].cv_results_)
    
    def predict(self, run_name, pred_data, export=False):
        res = self.runs[run_name].predict_proba(test_data)
        res = pd.concat([pred_data['id'], pd.Series([i[1] for i in res])], axis=1)
        if export is True:
            res.to_csv(run_name + '.csv', header=False, index=False)
        elif isinstance(export, str):
            res.to_csv(export + '.csv', header=False, index=False)
        return res

In [102]:
fitter = Search()

## SAMPLE RUNS

In [19]:
# Define the parameter grid for all the classifiers we want to try

param_grid = [
    {'classifier': [LogisticRegression(max_iter=1000)],
     'classifier__C': [3, 10, 30, 100],
     'classifier__solver': ['liblinear', 'saga'],  # 'liblinear' and 'saga' work well with small datasets and L1 penalty
     'classifier__penalty': ['l1', 'l2']},
    
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [50, 75, 100],
     'classifier__max_features': ['sqrt', 'log2'],
     'classifier__max_depth': [10, 20, 30]},
    
    {'classifier': [GradientBoostingClassifier()],
     'classifier__n_estimators': [50, 100, 150],
     'classifier__learning_rate': [0.1, 0.15, 0.2],
     'classifier__max_depth': [3, 5, 7]}
]

fitter.fit('first', param_grid, X_train, y_train)

Fitting 5 folds for each of 61 candidates, totalling 305 fits
see results dataframe by accessing the object's .result(run_name)


In [25]:
fitter.result('first').sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_classifier__penalty,param_classifier__solver,param_classifier__max_depth,param_classifier__max_features,...,param_classifier__learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.886199,0.126983,0.116400,0.011977,LogisticRegression(max_iter=1000),3,l1,liblinear,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
1,3.481198,0.222346,0.128200,0.015867,LogisticRegression(max_iter=1000),3,l1,saga,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
4,3.017399,0.158965,0.135601,0.027302,LogisticRegression(max_iter=1000),10,l1,liblinear,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
5,4.029799,0.488782,0.121801,0.010265,LogisticRegression(max_iter=1000),10,l1,saga,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
6,3.607801,0.748012,0.136399,0.037329,LogisticRegression(max_iter=1000),10,l2,liblinear,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,6.139000,0.828675,0.116400,0.005004,GradientBoostingClassifier(),,,,7,,...,0.2,"{'classifier': GradientBoostingClassifier(), '...",2.784254,2.950014,1.906301,3.469542,2.044264,2.630875,0.582728,57
46,5.147145,0.971601,0.115999,0.020465,GradientBoostingClassifier(),,,,5,,...,0.15,"{'classifier': GradientBoostingClassifier(), '...",2.441914,2.939267,2.235129,2.988435,2.429802,2.606909,0.300943,58
50,9.357200,1.160856,0.122600,0.007659,GradientBoostingClassifier(),,,,7,,...,0.15,"{'classifier': GradientBoostingClassifier(), '...",2.511185,2.689453,2.092552,3.306216,2.398705,2.599622,0.403151,59
31,3.785234,0.145016,0.135600,0.010929,RandomForestClassifier(),,,,30,log2,...,,"{'classifier': RandomForestClassifier(), 'clas...",2.221817,2.495887,2.616284,2.971683,2.546203,2.570375,0.241156,60


In [27]:
fitter.predict('first', test_data)

Unnamed: 0,id,0
0,K751808,0.013287
1,K837351,0.017356
2,K548114,0.016381
3,K736156,0.012953
4,K508080,0.036996
...,...,...
1677,K588314,0.015316
1678,K826807,0.066678
1679,K982731,0.045070
1680,K623037,0.073918


In [30]:
# Trying Lili's method (fit on top spenders)

high_spenders_threshold = np.quantile(train_data['average cost min'], 0.92)
high_spenders_data = train_data[train_data['average cost min'] > high_spenders_threshold]
high_X_train = train_data.drop('target', axis=1)
high_y_train = train_data['target']

# Everything under the sun because why not

param_grid = [
    {'classifier': [LogisticRegression(max_iter=1000)],
     'classifier__C': [3, 10, 30, 100],
     'classifier__solver': ['liblinear', 'saga'],
     'classifier__penalty': ['l1', 'l2']},
    
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [50, 75, 100],
     'classifier__max_features': ['sqrt', 'log2'],
     'classifier__max_depth': [10, 20, 30]},
    
    {'classifier': [GradientBoostingClassifier()],
     'classifier__n_estimators': [50, 100, 150],
     'classifier__learning_rate': [0.1, 0.15, 0.2],
     'classifier__max_depth': [3, 5, 7]},

    {'classifier': [KNeighborsClassifier()],
     'classifier__n_neighbors': [3, 5, 7],
     'classifier__weights': ['uniform', 'distance'],
     'classifier__algorithm': ['ball_tree', 'kd_tree', 'brute']},

    {'classifier': [DecisionTreeClassifier()],
     'classifier__max_depth': [None, 10, 20],
     'classifier__min_samples_split': [2, 10],
     'classifier__min_samples_leaf': [1, 5]},

    {'classifier': [GaussianNB()],
     'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]},

    {'classifier': [AdaBoostClassifier()],
     'classifier__n_estimators': [50, 100],
     'classifier__learning_rate': [0.01, 0.1, 1],
     'classifier__estimator': [DecisionTreeClassifier(max_depth=1)]},

    {'classifier': [ExtraTreesClassifier()],
     'classifier__n_estimators': [50, 100],
     'classifier__max_features': ['sqrt', 'log2'],
     'classifier__max_depth': [None, 10, 20],
     'classifier__min_samples_split': [2, 10],
     'classifier__min_samples_leaf': [1, 5]},
]

fitter.run('a lot', param_grid, high_X_train, high_y_train)

Fitting 5 folds for each of 148 candidates, totalling 740 fits
see results dataframe by accessing the object's .result(run_name)


In [33]:
fitter.result('a lot').sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_classifier__penalty,param_classifier__solver,param_classifier__max_depth,param_classifier__max_features,...,param_classifier__estimator,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,3.140798,0.207011,0.115000,0.007642,LogisticRegression(max_iter=1000),3,l1,liblinear,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
1,4.840998,1.600826,0.156401,0.057386,LogisticRegression(max_iter=1000),3,l1,saga,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
15,3.341001,0.192444,0.124000,0.012378,LogisticRegression(max_iter=1000),100,l2,saga,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
4,3.093603,0.294704,0.125197,0.010906,LogisticRegression(max_iter=1000),10,l1,liblinear,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
5,3.594799,0.243836,0.122400,0.017095,LogisticRegression(max_iter=1000),10,l1,saga,,,...,,{'classifier': LogisticRegression(max_iter=100...,3.563949,3.445613,4.024599,5.189658,5.334287,4.311621,0.801019,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,3.065939,0.111803,0.265056,0.011079,KNeighborsClassifier(),,,,,,...,,"{'classifier': KNeighborsClassifier(), 'classi...",2.171491,1.963469,1.376532,1.411713,2.027311,1.790103,0.330454,143
61,3.177905,0.404880,0.313599,0.045077,KNeighborsClassifier(),,,,,,...,,"{'classifier': KNeighborsClassifier(), 'classi...",2.171491,1.963469,1.376532,1.411713,2.027311,1.790103,0.330454,143
68,2.883122,0.163767,0.243801,0.032121,KNeighborsClassifier(),,,,,,...,,"{'classifier': KNeighborsClassifier(), 'classi...",1.565956,1.571470,1.508361,1.887930,1.622676,1.631279,0.133339,146
62,3.095008,0.433348,0.287406,0.014696,KNeighborsClassifier(),,,,,,...,,"{'classifier': KNeighborsClassifier(), 'classi...",1.565956,1.571470,1.508361,1.887930,1.622676,1.631279,0.133339,146


In [34]:
fitter.predict('a lot', test_data, export=True)

Unnamed: 0,id,0
0,K751808,0.013289
1,K837351,0.017357
2,K548114,0.016381
3,K736156,0.012955
4,K508080,0.036997
...,...,...
1677,K588314,0.015317
1678,K826807,0.066690
1679,K982731,0.045067
1680,K623037,0.073934


In [36]:
train_data['target'].value_counts()

0    4299
1     745
Name: target, dtype: int64

In [61]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, SMOTENC
from imblearn.pipeline import Pipeline as imbpipe

In [46]:
tes = date_and_id(X_train)
tes = imputer.fit_transform(tes)
oversampler = SMOTENC(categorical_features=categorical_columns).fit_resample(tes, y_train)

ValueError: Pandas output does not support sparse data. Set sparse_output=False to output pandas dataframes or disable Pandas output via` ohe.set_output(transform="default").

In [78]:
oversampler = SMOTENC(categorical_features=categorical_columns, categorical_encoder=OneHotEncoder(sparse_output=False))

In [114]:
# The preprocessing-classifier pipeline

p3 = Pipeline(steps=[
    ('trfm', transformer),
    ('rffs', SelectFromModel(RandomForestClassifier(random_state=0), threshold=0.01)),
    ('corr', RemoveCorrelatedFeatures(threshold=0.7)),
    ('classifier', LogisticRegression())
])

In [88]:
p2 = imbpipe(steps=[
    ('dtid', date_id_preprocessor),
    ('impu', imputer),
    ('over', oversampler)
])

In [94]:
smote_X_train, smote_y_train = p2.fit_resample(X_train, y_train)

In [128]:
# Everything under the sun because why not

param_grid = [
    {'classifier': [LogisticRegression(max_iter=1000)],
     'classifier__C': [3, 10, 30, 100],
     'classifier__solver': ['liblinear', 'saga'],
     'classifier__penalty': ['l1', 'l2']},
    
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [50, 75, 100],
     'classifier__max_features': ['sqrt', 'log2'],
     'classifier__max_depth': [10, 20, 30]},
    
    {'classifier': [GradientBoostingClassifier()],
     'classifier__n_estimators': [50, 100, 150],
     'classifier__learning_rate': [0.1, 0.15, 0.2],
     'classifier__max_depth': [3, 5, 7]},

    {'classifier': [KNeighborsClassifier()],
     'classifier__n_neighbors': [3, 5, 7],
     'classifier__weights': ['uniform', 'distance'],
     'classifier__algorithm': ['ball_tree', 'kd_tree', 'brute']},

    {'classifier': [DecisionTreeClassifier()],
     'classifier__max_depth': [None, 10, 20],
     'classifier__min_samples_split': [2, 10],
     'classifier__min_samples_leaf': [1, 5]},

    {'classifier': [GaussianNB()],
     'classifier__var_smoothing': [1e-9, 1e-8, 1e-7]},

    {'classifier': [AdaBoostClassifier()],
     'classifier__n_estimators': [50, 100],
     'classifier__learning_rate': [0.01, 0.1, 1],
     'classifier__estimator': [DecisionTreeClassifier(max_depth=1)]},

    {'classifier': [ExtraTreesClassifier()],
     'classifier__n_estimators': [50, 100],
     'classifier__max_features': ['sqrt', 'log2'],
     'classifier__max_depth': [None, 10, 20],
     'classifier__min_samples_split': [2, 10],
     'classifier__min_samples_leaf': [1, 5]},
]

# Define the parameter grid for all the classifiers we want to try

# Define custom metric (profit at top 20 prediction)

def profit_top_20(y_true, y_pred):
    dft = y_true.to_frame()
    dft['y_pred'] = y_pred
    dft = dft.sort_values('y_pred', ascending=False).head(20)
    dft = dft[dft['target'] == 1]
    profit = smote_X_train['average cost min'][dft.index].sum()
    return profit

param_grid = [
    {'classifier': [LogisticRegression(max_iter=1000)],
     'classifier__C': [1, 3, 10, 30],
     'classifier__solver': ['liblinear', 'saga'],  # 'liblinear' and 'saga' work well with small datasets and L1 penalty
     'classifier__penalty': ['l1', 'l2']},
    
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [50, 75, 100],
     'classifier__max_features': ['sqrt', 'log2'],
     'classifier__max_depth': [10, 20, 30]},
    
    {'classifier': [GradientBoostingClassifier()],
     'classifier__n_estimators': [50, 100, 150],
     'classifier__learning_rate': [0.1, 0.3, 1, 3],
     'classifier__max_depth': [3, 5, 7]}
]

grid_search = GridSearchCV(p3, param_grid, cv=5, scoring=make_scorer(profit_top_20, response_method="predict_proba", greater_is_better=True), n_jobs=-1, verbose=2)
grid_search.fit(smote_X_train, smote_y_train)

Fitting 5 folds for each of 61 candidates, totalling 305 fits


In [143]:
res = grid_search.predict_proba(pt.fit_transform(test_data))
res = pd.concat([test_data['id'], pd.Series([i[1] for i in res])], axis=1)

In [140]:
pt = imbpipe(steps=[
    ('dtid', date_id_preprocessor),
    ('impu', imputer),
    # ('over', oversampler)
])

In [145]:
res.to_csv('smotenc.csv', header=False, index=False)

In [166]:
oversampler2 = RandomOverSampler()

p4 = imbpipe(steps=[
    ('dtid', date_id_preprocessor),
    ('impu', imputer),
    ('over', oversampler2)
])

In [167]:
ros_X_train, ros_y_train = p4.fit_resample(X_train, y_train)

In [160]:
ros_X_train['target'] = ros_y_train

In [161]:
high_spenders_threshold = np.quantile(ros_X_train['average cost min'], 0.92)
high_spenders_data = ros_X_train[ros_X_train['average cost min'] > high_spenders_threshold]
ros_y_train = ros_X_train['target']
ros_X_train = ros_X_train.drop('target', axis=1)

In [164]:
param_grid = [
    {'classifier': [LogisticRegression(max_iter=1000)],
     'classifier__C': [1, 3, 10, 30],
     'classifier__solver': ['liblinear', 'saga'],  # 'liblinear' and 'saga' work well with small datasets and L1 penalty
     'classifier__penalty': ['l1', 'l2']},
    
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [50, 100],
     'classifier__max_features': ['sqrt', 'log2'],
     'classifier__max_depth': [10, 20],
     'classifier__ccp_alpha': [0.001, 0.01, 0.1]},
    
    {'classifier': [GradientBoostingClassifier()],
     'classifier__n_estimators': [50, 100, 150],
     'classifier__learning_rate': [0.1, 0.3, 1, 3],
     'classifier__max_depth': [2, 3, 4, 5]}
]

grid_search3 = GridSearchCV(p3, param_grid, cv=5, scoring=make_scorer(profit_top_20, response_method="predict_proba", greater_is_better=True), n_jobs=-1, verbose=2)
grid_search3.fit(ros_X_train, ros_y_train)

Fitting 5 folds for each of 88 candidates, totalling 440 fits


In [165]:
pd.DataFrame(grid_search3.cv_results_).sort_values('mean_test_score', ascending=False).head(30)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_classifier__penalty,param_classifier__solver,param_classifier__ccp_alpha,param_classifier__max_depth,...,param_classifier__learning_rate,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
43,5.673215,0.860467,0.108399,0.015266,GradientBoostingClassifier(),,,,,3.0,...,0.1,"{'classifier': GradientBoostingClassifier(), '...",5.176099,3.268242,3.663619,3.680448,4.43549,4.04478,0.680231,1
2,4.111799,0.611319,0.115802,0.019863,LogisticRegression(max_iter=1000),1.0,l2,liblinear,,,...,,{'classifier': LogisticRegression(max_iter=100...,5.193495,3.123746,4.069964,3.474262,3.959773,3.964248,0.702677,2
52,4.801806,0.529737,0.094599,0.005851,GradientBoostingClassifier(),,,,,2.0,...,0.3,"{'classifier': GradientBoostingClassifier(), '...",5.052874,3.282544,3.239807,3.536049,4.498721,3.921999,0.725911,3
64,5.175601,1.04706,0.111801,0.034055,GradientBoostingClassifier(),,,,,2.0,...,1.0,"{'classifier': GradientBoostingClassifier(), '...",4.778453,3.238441,4.155755,3.50525,3.717314,3.879043,0.540695,4
7,4.109799,0.142681,0.096601,0.010249,LogisticRegression(max_iter=1000),3.0,l2,saga,,,...,,{'classifier': LogisticRegression(max_iter=100...,4.517294,3.185326,4.101594,3.41457,4.074699,3.858697,0.48787,5
23,5.530398,0.170529,0.116399,0.008016,RandomForestClassifier(),,,,0.001,20.0,...,,"{'classifier': RandomForestClassifier(), 'clas...",3.418798,4.013157,4.691968,3.362089,3.768925,3.850988,0.482927,6
1,4.850599,0.599887,0.1062,0.011736,LogisticRegression(max_iter=1000),1.0,l1,saga,,,...,,{'classifier': LogisticRegression(max_iter=100...,4.517294,3.185326,4.101594,3.41457,3.960405,3.835838,0.479825,7
12,3.814401,0.165497,0.096001,0.005177,LogisticRegression(max_iter=1000),30.0,l1,liblinear,,,...,,{'classifier': LogisticRegression(max_iter=100...,4.517294,3.185634,4.101594,3.447474,3.902329,3.830865,0.471783,8
13,4.8956,0.653005,0.112,0.020445,LogisticRegression(max_iter=1000),30.0,l1,saga,,,...,,{'classifier': LogisticRegression(max_iter=100...,4.517294,3.185634,4.101594,3.447474,3.902329,3.830865,0.471783,8
14,4.366399,0.673443,0.117601,0.025382,LogisticRegression(max_iter=1000),30.0,l2,liblinear,,,...,,{'classifier': LogisticRegression(max_iter=100...,4.517294,3.185634,4.101594,3.44141,3.902329,3.829652,0.472774,10


In [168]:
param_grid = [
    {'classifier': [LogisticRegression(max_iter=1000)],
     'classifier__C': [1, 3, 10, 30],
     'classifier__solver': ['liblinear', 'saga'],  # 'liblinear' and 'saga' work well with small datasets and L1 penalty
     'classifier__penalty': ['l1', 'l2']},
    
    {'classifier': [RandomForestClassifier()],
     'classifier__n_estimators': [50, 100],
     'classifier__max_features': ['sqrt', 'log2'],
     'classifier__max_depth': [10, 20],
     'classifier__ccp_alpha': [0.001, 0.01, 0.1]},
    
    {'classifier': [GradientBoostingClassifier()],
     'classifier__n_estimators': [50, 100, 150],
     'classifier__learning_rate': [0.1, 0.3, 1, 3],
     'classifier__max_depth': [2, 3, 4, 5]}
]

grid_search4 = GridSearchCV(p3, param_grid, cv=5, scoring=make_scorer(profit_top_20, response_method="predict_proba", greater_is_better=True), n_jobs=-1, verbose=2)
grid_search4.fit(ros_X_train, ros_y_train)

Fitting 5 folds for each of 88 candidates, totalling 440 fits


In [1]:
pd.DataFrame(grid_search4.cv_results_).sort_values('mean_test_score', ascending=False).head(30)

NameError: name 'pd' is not defined