In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

cur_dir = os.getcwd()
train_set = pd.read_csv(cur_dir + '/../data/train_set_artificial.csv', low_memory=False)
test_set = pd.read_csv(cur_dir + '/../data/test_set.csv', low_memory=False)
val_set = pd.read_csv(cur_dir + '/../data/validation_set.csv', low_memory=False)

#Add validation set to test set, since we wound up using cross validation
test_set = pd.concat([test_set, val_set], axis=0)

df = pd.concat([train_set, test_set, val_set], axis=0)

X_train = train_set.drop(['delay_class'],axis=1)
y_train = train_set['delay_class']

X_test = test_set.drop(['delay_class'],axis=1)
y_test = test_set['delay_class']

In [2]:
df

Unnamed: 0,carrier,origin,dest,air_time,distance,Maximum,Minimum,Average,Departure,Precipitation,New Snow,Snow Depth,days_in_365,sched_time_in_min,Precipitation Binary,New Snow Binary,Snow Depth Binary,delay_class
0,DL,JFK,MCO,0.170370,0.176219,0.358025,0.346667,0.350649,0.418301,0.000000,0.0,0.0,0.917582,0.316067,yes,no,no,no
1,EV,EWR,MCI,0.229630,0.206404,0.407407,0.333333,0.370130,0.254902,0.000000,0.0,0.0,0.840659,0.374012,yes,no,no,no
2,B6,JFK,MSY,0.232593,0.224760,0.308642,0.293333,0.298701,0.241830,0.000000,0.0,0.0,0.250000,0.087796,no,no,no,no
3,B6,EWR,FLL,0.195556,0.200897,0.506173,0.493333,0.500000,0.459695,0.000000,0.0,0.0,0.274725,0.579456,yes,no,no,yes
4,WN,LGA,BNA,0.130370,0.139506,0.691358,0.773333,0.733766,0.250545,0.004988,0.0,0.0,0.530220,0.267779,yes,no,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65464,UA,EWR,SJU,0.266667,0.311646,0.740741,0.640000,0.694805,0.246187,0.000000,0.0,0.0,0.623626,0.210711,no,no,no,yes
65465,EV,EWR,ATL,0.130370,0.135835,0.259259,0.266667,0.259740,0.344227,0.000000,0.0,0.0,0.164835,0.611062,yes,yes,no,no
65466,UA,EWR,SFO,0.441481,0.506833,0.814815,0.826667,0.824675,0.457516,0.000000,0.0,0.0,0.494505,0.759438,yes,no,no,yes
65467,DL,LGA,PBI,0.198519,0.194779,0.407407,0.306667,0.357143,0.267974,0.000000,0.0,0.0,0.870879,0.373134,no,no,no,no


In [3]:
from sklearn.preprocessing import OneHotEncoder

# Transform categorical features into binary features
categorical_columns = list(X_train.select_dtypes(include=['object']).columns)
encoder = OneHotEncoder()

# Use df to fit the encoder to prevent scenarios that the binary features in train and test sets are different, 
# For example, test set has dest_LAX, but train set does not have dest_LAX
encoder.fit(df[categorical_columns])

X_train_category = encoder.transform(X_train[categorical_columns])
X_test_category = encoder.transform(X_test[categorical_columns])

# Get numerical features
numerical_columns = list(X_train.select_dtypes(include=['float64']).columns)

# Combine the numerical and categorical features
X_train_df_category = pd.DataFrame(X_train_category.toarray())
X_train_df_category.columns = encoder.get_feature_names_out()
X_train_df_numerical = pd.DataFrame(X_train[numerical_columns]).reset_index(drop=True)
X_train_encoded = pd.concat([X_train_df_numerical, X_train_df_category], axis=1)


X_test_df_category = pd.DataFrame(X_test_category.toarray())
X_test_df_category.columns = encoder.get_feature_names_out()
X_test_df_numerical = pd.DataFrame(X_test[numerical_columns]).reset_index(drop=True)
X_test_encoded = pd.concat([X_test_df_numerical, X_test_df_category], axis=1)

# Drop encoded columns that have very few positive values, in order to reduce our dimensionality.
# These columns were identified in "DropOneHotEncodingColumn.ipynb"
columns_to_drop = ['dest_CHO', 'dest_BUF', 'dest_CAE', 'dest_CHS', 'dest_BUR', 'dest_BQN', 'dest_BGR', 'dest_SNA', 'dest_IND', 'dest_ABQ', 'dest_BWI', 'dest_OAK', 'dest_SAN', 'dest_SDF', 'dest_HNL', 'dest_GSP', 'dest_TUL', 'dest_TYS', 'dest_RSW', 'New Snow Binary_no','dest_SBN', 'dest_OMA', 'dest_GRR', 'dest_BDL', 'dest_MKE', 'dest_PWM', 'dest_DSM', 'dest_JAX', 'dest_HDN','dest_STL','Precipitation Binary_no','dest_EYW','dest_EGE','dest_PVD','dest_PSE','dest_PDX','dest_MTJ','dest_MEM','dest_JAC','dest_SAV','dest_SLC','dest_CAK','dest_RIC','dest_MVY','dest_LEX','dest_DAY','dest_PIT','dest_CRW','dest_BZN','dest_ORF','dest_BTV','dest_XNA','dest_IAD','dest_GSO','dest_MHT','dest_SEA','dest_SRQ','dest_PSP','dest_ANC','dest_CVG','dest_STT','dest_MYR','dest_SJC','dest_MDW','dest_AUS','dest_ACK','dest_CMH','dest_PHL','dest_MSN','dest_SMF','dest_CLE','dest_PHX','dest_AVL','dest_ALB','dest_ILM','dest_ROC','Snow Depth Binary_no','dest_MCI','dest_HOU','dest_LAS','dest_OKC','dest_SAT','dest_TVC','dest_SYR','dest_BHM','dest_LGB','dest_MSY']

X_train_encoded.drop(columns_to_drop, axis=1, inplace=True)
X_test_encoded.drop(columns_to_drop, axis=1, inplace=True)

In [4]:
X_train_encoded

Unnamed: 0,air_time,distance,Maximum,Minimum,Average,Departure,Precipitation,New Snow,Snow Depth,days_in_365,...,dest_MSP,dest_ORD,dest_PBI,dest_RDU,dest_SFO,dest_SJU,dest_TPA,Precipitation Binary_yes,New Snow Binary_yes,Snow Depth Binary_yes
0,0.170370,0.176219,0.358025,0.346667,0.350649,0.418301,0.000000,0.0,0.0,0.917582,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.229630,0.206404,0.407407,0.333333,0.370130,0.254902,0.000000,0.0,0.0,0.840659,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.232593,0.224760,0.308642,0.293333,0.298701,0.241830,0.000000,0.0,0.0,0.250000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.195556,0.200897,0.506173,0.493333,0.500000,0.459695,0.000000,0.0,0.0,0.274725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.130370,0.139506,0.691358,0.773333,0.733766,0.250545,0.004988,0.0,0.0,0.530220,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293895,0.432344,0.483009,0.716049,0.666667,0.694805,0.405229,0.000000,0.0,0.0,0.445055,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
293896,0.119368,0.122401,0.740741,0.680000,0.714286,0.601307,0.000000,0.0,0.0,0.758242,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
293897,0.182222,0.207220,0.595520,0.589003,0.593587,0.350269,0.002057,0.0,0.0,0.372975,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
293898,0.048651,0.045074,0.738151,0.736785,0.740578,0.395707,0.018992,0.0,0.0,0.480462,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
X_test_encoded

Unnamed: 0,air_time,distance,Maximum,Minimum,Average,Departure,Precipitation,New Snow,Snow Depth,days_in_365,...,dest_MSP,dest_ORD,dest_PBI,dest_RDU,dest_SFO,dest_SJU,dest_TPA,Precipitation Binary_yes,New Snow Binary_yes,Snow Depth Binary_yes
0,0.305185,0.267387,0.444444,0.253333,0.350649,0.627451,0.112219,0.0,0.0,0.082418,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.407407,0.488476,0.827160,0.840000,0.837662,0.501089,0.000000,0.0,0.0,0.516484,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.405926,0.437895,0.296296,0.253333,0.272727,0.294118,0.000000,0.0,0.0,0.197802,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.183704,0.139099,0.790123,0.813333,0.805195,0.385621,0.000000,0.0,0.0,0.607143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.171852,0.133184,0.345679,0.266667,0.305195,0.405229,0.000000,0.0,0.1,0.967033,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130934,0.266667,0.311646,0.740741,0.640000,0.694805,0.246187,0.000000,0.0,0.0,0.623626,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
130935,0.130370,0.135835,0.259259,0.266667,0.259740,0.344227,0.000000,0.0,0.0,0.164835,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
130936,0.441481,0.506833,0.814815,0.826667,0.824675,0.457516,0.000000,0.0,0.0,0.494505,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
130937,0.198519,0.194779,0.407407,0.306667,0.357143,0.267974,0.000000,0.0,0.0,0.870879,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

classifier = LogisticRegression()

param_grid = {
    'penalty': ['l1','l2','elasticnet',None],
    'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'C': [1, 10, 15]
}

print("Performing grid search...")
grid_search = GridSearchCV(classifier, param_grid=param_grid, cv=4, verbose=2)
grid_search.fit(X_train_encoded, y_train)
print("Grid Search complete!")
print("Best Hyperparameters:", grid_search.best_params_)

accuracy = grid_search.score(X_test_encoded, y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))

classifier = grid_search

Performing grid search...
Fitting 4 folds for each of 60 candidates, totalling 240 fits
[CV] END ......................C=1, penalty=l1, solver=lbfgs; total time=   0.2s
[CV] END ......................C=1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ......................C=1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ......................C=1, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ..................C=1, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ..................C=1, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ..................C=1, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ..................C=1, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ............C=1, penalty=l1, solver=newton-cholesky; total time=   0.0s
[CV] END ............C=1, penalty=l1, solver=newton-cholesky; total time=   0.0s
[CV] END ............C=1, penalty=l1, solver=newton-cholesky; total time=   0.0s
[CV] END ............

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   2.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   2.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   2.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ......................C=1, penalty=l2, solver=lbfgs; total time=   2.7s
[CV] END ..................C=1, penalty=l2, solver=newton-cg; total time=   9.0s
[CV] END ..................C=1, penalty=l2, solver=newton-cg; total time=   7.0s
[CV] END ..................C=1, penalty=l2, solver=newton-cg; total time=   8.7s
[CV] END ..................C=1, penalty=l2, solver=newton-cg; total time=   9.9s
[CV] END ............C=1, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ............C=1, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ............C=1, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ............C=1, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ........................C=1, penalty=l2, solver=sag; total time=   7.1s
[CV] END ........................C=1, penalty=l2, solver=sag; total time=   5.3s
[CV] END ........................C=1, penalty=l2, solver=sag; total time=   6.0s
[CV] END ...................

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, penalty=None, solver=lbfgs; total time=   2.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, penalty=None, solver=lbfgs; total time=   2.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, penalty=None, solver=lbfgs; total time=   2.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ....................C=1, penalty=None, solver=lbfgs; total time=   2.8s
[CV] END ................C=1, penalty=None, solver=newton-cg; total time=   7.2s
[CV] END ................C=1, penalty=None, solver=newton-cg; total time=   6.2s
[CV] END ................C=1, penalty=None, solver=newton-cg; total time=   6.4s
[CV] END ................C=1, penalty=None, solver=newton-cg; total time=   6.3s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.67452e-17): result may not be accurate.


[CV] END ..........C=1, penalty=None, solver=newton-cholesky; total time=   1.7s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.21252e-17): result may not be accurate.


[CV] END ..........C=1, penalty=None, solver=newton-cholesky; total time=   1.6s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=4.16947e-18): result may not be accurate.


[CV] END ..........C=1, penalty=None, solver=newton-cholesky; total time=   1.5s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=6.07913e-18): result may not be accurate.


[CV] END ..........C=1, penalty=None, solver=newton-cholesky; total time=   1.6s
[CV] END ......................C=1, penalty=None, solver=sag; total time=  10.2s
[CV] END ......................C=1, penalty=None, solver=sag; total time=   7.6s
[CV] END ......................C=1, penalty=None, solver=sag; total time=   8.0s
[CV] END ......................C=1, penalty=None, solver=sag; total time=   8.7s
[CV] END .....................C=1, penalty=None, solver=saga; total time=   5.3s
[CV] END .....................C=1, penalty=None, solver=saga; total time=   4.2s
[CV] END .....................C=1, penalty=None, solver=saga; total time=   4.6s
[CV] END .....................C=1, penalty=None, solver=saga; total time=   4.7s
[CV] END .....................C=10, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END .....................C=10, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END .....................C=10, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END ...................

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time=   2.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time=   2.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time=   2.4s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....................C=10, penalty=l2, solver=lbfgs; total time=   2.5s
[CV] END .................C=10, penalty=l2, solver=newton-cg; total time=   8.2s




[CV] END .................C=10, penalty=l2, solver=newton-cg; total time=   8.2s
[CV] END .................C=10, penalty=l2, solver=newton-cg; total time=   6.0s
[CV] END .................C=10, penalty=l2, solver=newton-cg; total time=   7.4s
[CV] END ...........C=10, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ...........C=10, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ...........C=10, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ...........C=10, penalty=l2, solver=newton-cholesky; total time=   0.8s




[CV] END .......................C=10, penalty=l2, solver=sag; total time=  18.8s




[CV] END .......................C=10, penalty=l2, solver=sag; total time=  18.7s




[CV] END .......................C=10, penalty=l2, solver=sag; total time=  16.9s




[CV] END .......................C=10, penalty=l2, solver=sag; total time=  18.7s




[CV] END ......................C=10, penalty=l2, solver=saga; total time=  20.2s




[CV] END ......................C=10, penalty=l2, solver=saga; total time=  28.8s




[CV] END ......................C=10, penalty=l2, solver=saga; total time=  28.9s




[CV] END ......................C=10, penalty=l2, solver=saga; total time=  20.0s
[CV] END .............C=10, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .............C=10, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .............C=10, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .............C=10, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .........C=10, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END .........C=10, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END .........C=10, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END .........C=10, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ...C=10, penalty=elasticnet, solver=newton-cholesky; total time=   0.0s
[CV] END ...C=10, penalty=elasticnet, solver=newton-cholesky; total time=   0.0s
[CV] END ...C=10, penalty=elasticnet, solver=newton-cholesky; total time=   0.0s
[CV] END ...C=10, penalty=el

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, penalty=None, solver=lbfgs; total time=   2.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, penalty=None, solver=lbfgs; total time=   2.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, penalty=None, solver=lbfgs; total time=   2.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=10, penalty=None, solver=lbfgs; total time=   2.6s
[CV] END ...............C=10, penalty=None, solver=newton-cg; total time=   7.2s




[CV] END ...............C=10, penalty=None, solver=newton-cg; total time=   6.1s




[CV] END ...............C=10, penalty=None, solver=newton-cg; total time=   6.7s




[CV] END ...............C=10, penalty=None, solver=newton-cg; total time=   6.1s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.67452e-17): result may not be accurate.


[CV] END .........C=10, penalty=None, solver=newton-cholesky; total time=   1.7s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.21252e-17): result may not be accurate.


[CV] END .........C=10, penalty=None, solver=newton-cholesky; total time=   1.6s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=4.16947e-18): result may not be accurate.


[CV] END .........C=10, penalty=None, solver=newton-cholesky; total time=   1.6s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=6.07913e-18): result may not be accurate.


[CV] END .........C=10, penalty=None, solver=newton-cholesky; total time=   1.6s




[CV] END .....................C=10, penalty=None, solver=sag; total time=   9.3s




[CV] END .....................C=10, penalty=None, solver=sag; total time=   9.4s




[CV] END .....................C=10, penalty=None, solver=sag; total time=   8.9s




[CV] END .....................C=10, penalty=None, solver=sag; total time=   8.9s




[CV] END ....................C=10, penalty=None, solver=saga; total time=   9.0s




[CV] END ....................C=10, penalty=None, solver=saga; total time=   5.1s




[CV] END ....................C=10, penalty=None, solver=saga; total time=   4.6s




[CV] END ....................C=10, penalty=None, solver=saga; total time=   4.2s
[CV] END .....................C=15, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END .....................C=15, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END .....................C=15, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END .....................C=15, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END .................C=15, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END .................C=15, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END .................C=15, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END .................C=15, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END ...........C=15, penalty=l1, solver=newton-cholesky; total time=   0.0s
[CV] END ...........C=15, penalty=l1, solver=newton-cholesky; total time=   0.0s
[CV] END ...........C=15, penalty=l1, solver=newton-cholesky; total time=   0.0s
[CV] END ...........C=15, pe

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....................C=15, penalty=l2, solver=lbfgs; total time=   2.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....................C=15, penalty=l2, solver=lbfgs; total time=   2.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....................C=15, penalty=l2, solver=lbfgs; total time=   2.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END .....................C=15, penalty=l2, solver=lbfgs; total time=   2.6s
[CV] END .................C=15, penalty=l2, solver=newton-cg; total time=   6.7s
[CV] END .................C=15, penalty=l2, solver=newton-cg; total time=   9.6s
[CV] END .................C=15, penalty=l2, solver=newton-cg; total time=   7.0s
[CV] END .................C=15, penalty=l2, solver=newton-cg; total time=  10.6s
[CV] END ...........C=15, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ...........C=15, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ...........C=15, penalty=l2, solver=newton-cholesky; total time=   0.8s
[CV] END ...........C=15, penalty=l2, solver=newton-cholesky; total time=   0.8s




[CV] END .......................C=15, penalty=l2, solver=sag; total time=  17.5s




[CV] END .......................C=15, penalty=l2, solver=sag; total time=  16.9s




[CV] END .......................C=15, penalty=l2, solver=sag; total time=  15.0s




[CV] END .......................C=15, penalty=l2, solver=sag; total time=  15.0s




[CV] END ......................C=15, penalty=l2, solver=saga; total time=  20.7s




[CV] END ......................C=15, penalty=l2, solver=saga; total time=  21.2s




[CV] END ......................C=15, penalty=l2, solver=saga; total time=  18.4s




[CV] END ......................C=15, penalty=l2, solver=saga; total time=  17.5s
[CV] END .............C=15, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .............C=15, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .............C=15, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .............C=15, penalty=elasticnet, solver=lbfgs; total time=   0.0s
[CV] END .........C=15, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END .........C=15, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END .........C=15, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END .........C=15, penalty=elasticnet, solver=newton-cg; total time=   0.0s
[CV] END ...C=15, penalty=elasticnet, solver=newton-cholesky; total time=   0.0s
[CV] END ...C=15, penalty=elasticnet, solver=newton-cholesky; total time=   0.0s
[CV] END ...C=15, penalty=elasticnet, solver=newton-cholesky; total time=   0.0s
[CV] END ...C=15, penalty=el

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=15, penalty=None, solver=lbfgs; total time=   2.9s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=15, penalty=None, solver=lbfgs; total time=   2.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=15, penalty=None, solver=lbfgs; total time=   2.7s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ...................C=15, penalty=None, solver=lbfgs; total time=   2.8s
[CV] END ...............C=15, penalty=None, solver=newton-cg; total time=   7.6s




[CV] END ...............C=15, penalty=None, solver=newton-cg; total time=   6.4s




[CV] END ...............C=15, penalty=None, solver=newton-cg; total time=   6.4s




[CV] END ...............C=15, penalty=None, solver=newton-cg; total time=   6.4s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.67452e-17): result may not be accurate.


[CV] END .........C=15, penalty=None, solver=newton-cholesky; total time=   1.7s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.21252e-17): result may not be accurate.


[CV] END .........C=15, penalty=None, solver=newton-cholesky; total time=   1.6s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=4.16947e-18): result may not be accurate.


[CV] END .........C=15, penalty=None, solver=newton-cholesky; total time=   1.5s


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=6.07913e-18): result may not be accurate.


[CV] END .........C=15, penalty=None, solver=newton-cholesky; total time=   1.5s




[CV] END .....................C=15, penalty=None, solver=sag; total time=   8.0s




[CV] END .....................C=15, penalty=None, solver=sag; total time=   8.5s




[CV] END .....................C=15, penalty=None, solver=sag; total time=   8.1s




[CV] END .....................C=15, penalty=None, solver=sag; total time=   9.2s




[CV] END ....................C=15, penalty=None, solver=saga; total time=   6.8s




[CV] END ....................C=15, penalty=None, solver=saga; total time=   4.2s




[CV] END ....................C=15, penalty=None, solver=saga; total time=   4.4s




[CV] END ....................C=15, penalty=None, solver=saga; total time=   5.4s


108 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

------------------------------------

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=3.45564e-18): result may not be accurate.


Grid Search complete!
Best Hyperparameters: {'C': 1, 'penalty': None, 'solver': 'newton-cholesky'}
Accuracy: 65.52%


In [7]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

y_pred = classifier.predict(X_test_encoded)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

print(classification_report(y_test, y_pred))

[[64388 33707]
 [11446 21398]]
              precision    recall  f1-score   support

          no       0.85      0.66      0.74     98095
         yes       0.39      0.65      0.49     32844

    accuracy                           0.66    130939
   macro avg       0.62      0.65      0.61    130939
weighted avg       0.73      0.66      0.68    130939

