In [1]:
#Basic imports for all datasets
import numpy as np 
import pandas as pd   # for data reading 
import matplotlib.pyplot as plt
import sklearn
import sklearn.linear_model
import sklearn.tree
import sklearn.ensemble
import sklearn.naive_bayes
import sklearn.neural_network
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import sklearn.metrics           # For accuracy_score
import sklearn.model_selection   # For GridSearchCV and RandomizedSearchCV
import scipy
import scipy.stats               # For reciprocal distribution
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)  # Ignore sklearn deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)       # Ignore sklearn deprecation warnings
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, RationalQuadratic, ExpSineSquared

In [3]:

import pandas as pd

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self 
    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = preprocessing.LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = preprocessing.LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [5]:
data = np.loadtxt('seismic-bumps.arff',dtype = 'str',delimiter=',',comments=('@','%'))
columns = np.arange(19)
df = pd.DataFrame(data,columns=columns)
df = MultiColumnLabelEncoder(columns = [0,1,2,7]).fit_transform(df)
data = df.values
data = data.astype(np.float)
X_train, X_test, y_train, y_test = train_test_split(
    data[:,:18],data[:,18], test_size=0.2, random_state=42)

In [6]:
#k-Nearest neighbours classification
# knn_model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=35)
# knn_model.fit(X_train, y_train)
# y_pred = knn_model.predict(X_test)
# sklearn.metrics.accuracy_score(y_test, y_pred)



knn_model = sklearn.neighbors.KNeighborsClassifier(n_jobs=-1)
param_grid = {'n_neighbors':(np.arange(2,30, 3))}

mdls = sklearn.model_selection.GridSearchCV(knn_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)
y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                     weights='uniform')


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    6.6s finished


0.9284332688588007

In [21]:
#Logistic regression
# logistic_model = sklearn.linear_model.LogisticRegression(C = 35,fit_intercept=False, penalty='l2', solver='lbfgs',max_iter = 1000)
# logistic_model.fit(X_train, y_train);
# y_pred = logistic_model.predict(X_test)
# sklearn.metrics.accuracy_score(y_test, y_pred)


param_grid = { "fit_intercept":[True], "solver":['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 
             "max_iter":np.arange(100,400, 100)}


mdls = sklearn.model_selection.GridSearchCV(logistic_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.












LogisticRegression(C=35, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)


[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   29.7s finished


0.9284332688588007

In [22]:
#SVM classifier
svm_model = sklearn.svm.SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

0.9284332688588007

In [28]:
#Decision tree classification
# DTC_model = sklearn.tree.DecisionTreeClassifier()
# DTC_model.fit(X_train, y_train)
# y_pred = DTC_model.predict(X_test)
# sklearn.metrics.accuracy_score(y_test, y_pred)


DTC_model = sklearn.tree.DecisionTreeClassifier(random_state=0)
Max_features = ['auto', 'sqrt', 'log2']
Max_depths = np.arange(1,34,2)
Min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
Min_samples_leafs = np.linspace(0.01, 0.05, 5, endpoint=True)
param_grid = {'max_features': Max_features, 'max_depth': Max_depths,  'min_samples_split': Min_samples_splits, 'min_samples_leaf': Min_samples_leafs}

mdls = sklearn.model_selection.GridSearchCV(DTC_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 2550 candidates, totalling 12750 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=0.01, min_samples_split=0.1,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=0, splitter='best')


[Parallel(n_jobs=1)]: Done 12750 out of 12750 | elapsed:   40.2s finished


0.9284332688588007

In [29]:
#Random forest classification
# RFC_model = sklearn.ensemble.RandomForestClassifier()
# RFC_model.fit(X_train, y_train)
# y_pred = RFC_model.predict(X_test)
# sklearn.metrics.accuracy_score(y_test, y_pred)



RFC_model = sklearn.ensemble.RandomForestClassifier(random_state=0)
Estimators = np.arange(100,105,1)
Max_features = ['auto', 'sqrt', 'log2']
param_grid = {'n_estimators': Estimators,'max_features': Max_features, }

mdls = sklearn.model_selection.GridSearchCV(RFC_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:   38.2s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=104,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)


0.9284332688588007

In [30]:
#AdaBoost classification
# ABC_model = sklearn.ensemble.AdaBoostClassifier()
# ABC_model.fit(X_train, y_train)
# y_pred = ABC_model.predict(X_test)
# sklearn.metrics.accuracy_score(y_test, y_pred)



ABC_model = sklearn.ensemble.AdaBoostClassifier(random_state=0)
Estimators = np.arange(50,100,10)
Learning_rates = [0.01,0.05,0.1,0.3,1]
algorithm = ['SAMME', 'SAMME.R']
param_grid = {'n_estimators': Estimators, 'learning_rate': Learning_rates, 'algorithm': algorithm}

mdls = sklearn.model_selection.GridSearchCV(ABC_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  1.5min finished


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.3,
                   n_estimators=50, random_state=0)


0.9284332688588007

In [26]:

#Gaussian naive Bayes classification
GNB_model = sklearn.naive_bayes.GaussianNB()
GNB_model.fit(X_train, y_train)
y_pred = GNB_model.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

0.9187620889748549

In [7]:
#Neural network classification
# NNC_model = sklearn.neural_network.MLPClassifier()
# NNC_model.fit(X_train, y_train)
# y_pred = NNC_model.predict(X_test)
# sklearn.metrics.accuracy_score(y_test, y_pred)


NNC_model = sklearn.neural_network.MLPClassifier()
batch_size = [50, 100]
epochs = [10, 50, 100]
learn_rate = [0.001, 0.01, 0.1]
momentum = [ 0.4, 0.8]
neurons = [1, 5, 10, 15, 20, 25, 30] 
activation = ['identity', 'logistic', 'tanh', 'relu']
alpha = [0.0001,0.002]
param_grid = {'batch_size':batch_size,  'momentum':momentum, 
              'activation' : activation, 'alpha':alpha }

mdls = sklearn.model_selection.GridSearchCV(NNC_model, param_grid, verbose=1,cv=5).fit(X_train, y_train)
print(mdls.best_estimator_)

y_pred = mdls.best_estimator_.predict(X_test)
sklearn.metrics.accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:  9.1min finished


MLPClassifier(activation='logistic', alpha=0.0001, batch_size=50, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.4,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)


0.9284332688588007