In [7]:
#Basic imports for all datasets
import numpy as np 
import pandas as pd   # for data reading 
import matplotlib.pyplot as plt
import sklearn
import sklearn.linear_model
import sklearn.tree
import sklearn.ensemble
import sklearn.naive_bayes
import sklearn.neural_network
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.metrics           # For accuracy_score
import sklearn.model_selection   # For GridSearchCV and RandomizedSearchCV
import scipy
import scipy.stats               # For reciprocal distribution
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)  # Ignore sklearn deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)       # Ignore sklearn deprecation warnings
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, RationalQuadratic, ExpSineSquared

In [8]:
#Dataset 2 : Default of credit card clients

#Loading dataset
data = np.loadtxt('australian.dat',dtype = 'str', delimiter=' ')

df = pd.DataFrame({'A1':data[:, 0], 'A2':data[:, 1], 'A3':data[:, 2], 'A4':data[:, 4], 'A5':data[:, 5], 
                   'A6':data[:, 5], 'A7':data[:, 6], 'A8':data[:, 7], 'A9':data[:, 8], 'A10':data[:, 9], 
                   'A11':data[:, 10], 'A12':data[:, 11], 'A13':data[:, 12], 'A14':data[:, 13], 'Class attribute':data[:, 14] })

df.sample(5)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,Class attribute
668,0,32.25,1.5,8,4,4,0.25,0,0,0,1,2,372,123,0
462,1,20.42,1.085,11,4,4,1.5,0,0,0,0,2,108,8,0
608,0,27.67,2.04,9,4,4,0.25,0,0,0,1,2,180,51,0
119,1,48.75,8.5,8,8,8,12.5,1,1,9,0,2,181,1656,1
334,0,42.25,1.75,8,4,4,0.0,0,0,0,1,2,150,2,0


In [9]:
#Splitting training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    data[:,:14], 
    df['Class attribute'] , 
    test_size=0.2, 
    random_state=0)
X_train = X_train.astype(np.float)
X_test = X_test.astype(np.float)

In [10]:
#k-Nearest neighbours classification
knn_model = sklearn.neighbors.KNeighborsClassifier(n_jobs=-1)
param_grid = {'n_neighbors':(np.arange(2,52, 5))}

mdls = sklearn.model_selection.GridSearchCV(knn_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=32, p=2,
           weights='uniform')


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    1.1s finished


0.6811594202898551

In [11]:
#Logistic regression (for classification)
#Fit_intercept is set to True because we don't have bias
# logistic_model = sklearn.linear_model.LogisticRegression(fit_intercept=True)
logistic_model = sklearn.linear_model.LogisticRegression(n_jobs=-1)
param_grid = { "fit_intercept":[True], "solver":['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
             "max_iter":np.arange(100,400, 100)}


mdls = sklearn.model_selection.GridSearchCV(logistic_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits


  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    3.4s finished
  " = {}.".format(effective_n_jobs(self.n_jobs)))


0.8840579710144928

In [12]:
#Decision tree classification
DTC_model = sklearn.tree.DecisionTreeClassifier(random_state=0)
Max_features = ['auto', 'sqrt', 'log2']
Max_depths = np.arange(1,34,2)
Min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
Min_samples_leafs = np.linspace(0.01, 0.05, 5, endpoint=True)
param_grid = {'max_features': Max_features, 'max_depth': Max_depths,  'min_samples_split': Min_samples_splits, 'min_samples_leaf': Min_samples_leafs}

mdls = sklearn.model_selection.GridSearchCV(DTC_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 2550 candidates, totalling 12750 fits
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=0.01, min_samples_split=0.1,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')


[Parallel(n_jobs=1)]: Done 12750 out of 12750 | elapsed:   34.8s finished


0.8115942028985508

In [13]:
#Random forest classification
RFC_model = sklearn.ensemble.RandomForestClassifier(random_state=0)
Estimators = np.arange(100,105,1)
Max_features = ['auto', 'sqrt', 'log2']
param_grid = {'n_estimators': Estimators,'max_features': Max_features, }

mdls = sklearn.model_selection.GridSearchCV(RFC_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 15 candidates, totalling 75 fits
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=104, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    8.6s finished


0.8913043478260869

In [14]:
#AdaBoost classification
ABC_model = sklearn.ensemble.AdaBoostClassifier(random_state=0)
Estimators = np.arange(50,100,10)
Learning_rates = [0.01,0.05,0.1,0.3,1]
algorithm = ['SAMME', 'SAMME.R']
param_grid = {'n_estimators': Estimators, 'learning_rate': Learning_rates, 'algorithm': algorithm}

mdls = sklearn.model_selection.GridSearchCV(ABC_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 50 candidates, totalling 250 fits
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.05, n_estimators=90, random_state=0)


[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:   31.7s finished


0.8913043478260869

In [15]:
#Gaussian naive Bayes classification

zero_prob = y_train[y_train == 0].shape[0]/y_train.shape[0]
one_prob = 1 - zero_prob
prob = np.array([zero_prob,one_prob])
GNB_model = sklearn.naive_bayes.GaussianNB(priors = prob)
GNB_model.fit(X_train, y_train)
# mdls = sklearn.model_selection.GridSearchCV(GNB_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
# print(mdls.best_estimator_)

y_pred = GNB_model.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

  jointi = np.log(self.class_prior_[i])


0.47101449275362317

In [16]:
#Neural network classification
NNC_model = sklearn.neural_network.MLPClassifier()
# batch_size = [10, 20, 40, 60, 80, 100]
batch_size = [100]
# epochs = [10, 50, 100]
epochs = [10]
learn_rate = [0.001, 0.01, 0.1]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8]
neurons = [1, 5, 10, 15, 20, 25, 30] 
activation = ['identity', 'logistic', 'tanh', 'relu']
alpha = [0.0001,0.002]
param_grid = {'batch_size':batch_size,  'momentum':momentum, 
              'activation' : activation, }

mdls = sklearn.model_selection.GridSearchCV(NNC_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   29.7s finished


MLPClassifier(activation='logistic', alpha=0.0001, batch_size=100, beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.6,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


0.7753623188405797

In [17]:
#SVM classifier
svm_model = sklearn.svm.SVC()
Kernels = ['linear', 'poly', 'rbf', 'sigmoid']
Epsilons = [0.1,0.2,0.5,0.3]
Cs = [0.001, 0.01, 0.1, 1, 10]
Gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : Gammas}

mdls = sklearn.model_selection.GridSearchCV(svm_model, param_grid, verbose=1,cv=3).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    1.1s finished


0.717391304347826