In [1]:
import pandas as pd

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [2]:
df = pd.read_csv('preprocessedData.csv')

X = df.copy()
X.drop(columns=['good_indicator', 'bad_indicator'], inplace=True)

y = df[['bad_indicator']]


In [8]:
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l1','l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=27)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y.values.ravel())
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best: 0.888062 using {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 100, 'penalty': 'l1', 'solver': 'lbfgs'}
0.888036 (0.000071) with: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.888036 (0.000071) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.888049 (0.000063) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.888039 (0.000075) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 10, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 10, 'penalty': 'l1', 'solver': 'lbfgs'}
0.888036 (0.000071) with: {'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}
0.888037 (0.000071) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.888047 (0.000059) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.888038 (0.000071) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.000000 (0.000000) with:

In [11]:
model = RidgeClassifier()
alpha = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# define grid search
grid = dict(alpha=alpha)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y.values.ravel())
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.888052 using {'alpha': 0.3}
0.888051 (0.000042) with: {'alpha': 0.1}
0.888051 (0.000042) with: {'alpha': 0.2}
0.888052 (0.000042) with: {'alpha': 0.3}
0.888052 (0.000042) with: {'alpha': 0.4}
0.888052 (0.000042) with: {'alpha': 0.5}
0.888052 (0.000042) with: {'alpha': 0.6}
0.888052 (0.000042) with: {'alpha': 0.7}
0.888052 (0.000042) with: {'alpha': 0.8}
0.888052 (0.000042) with: {'alpha': 0.9}
0.888052 (0.000042) with: {'alpha': 1.0}


In [16]:
model = KNeighborsClassifier()

n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0, refit= 'accuracy')
grid_result = grid_search.fit(X, y.values.ravel())

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
prec = grid_result.cv_results_['mean_test_precision_macro']
rec = grid_result.cv_results_['mean_test_recall_macro']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for prec, rec, mean, stdev, param in zip( prec, rec, means, stds, params):
    print("prec %f, recall %f, mean %f (%f) with: %r" % (prec, rec, means, stdev, param))

In [3]:
model = SVC()

kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']

# define grid search
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y.values.ravel())

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

KeyboardInterrupt: 

In [5]:
model = SVC()

kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']

# define grid search
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0, refit= 'accuracy')
grid_result = grid_search.fit(X, y.values.ravel())

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
prec = grid_result.cv_results_['mean_test_precision_macro']
rec = grid_result.cv_results_['mean_test_recall_macro']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for prec, rec, mean, stdev, param in zip( prec, rec, means, stds, params):
    print("prec %f, recall %f, mean %f (%f) with: %r" % (prec, rec, means, stdev, param))

KeyboardInterrupt: 

In [None]:
from sklearn.ensemble import BaggingClassifier

model = BaggingClassifier()
n_estimators = [10, 100, 1000]
# define grid search
grid = dict(n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0, refit= 'accuracy')
grid_result = grid_search.fit(X, y.values.ravel())

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
prec = grid_result.cv_results_['mean_test_precision_macro']
rec = grid_result.cv_results_['mean_test_recall_macro']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for prec, rec, mean, stdev, param in zip( prec, rec, means, stds, params):
    print("prec %f, recall %f, mean %f (%f) with: %r" % (prec, rec, means, stdev, param))

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0, refit= 'accuracy')
grid_result = grid_search.fit(X, y.values.ravel())

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
prec = grid_result.cv_results_['mean_test_precision_macro']
rec = grid_result.cv_results_['mean_test_recall_macro']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for prec, rec, mean, stdev, param in zip( prec, rec, means, stds, params):
    print("prec %f, recall %f, mean %f (%f) with: %r" % (prec, rec, means, stdev, param))