<a href="https://colab.research.google.com/github/irenezi/solid-guide/blob/master/coursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np
from pandas import read_csv, set_option
from numpy import set_printoptions
from matplotlib import pyplot
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from keras.layers import Dense, Input 
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l1

In [4]:
def filter_constant_columns(df_data_set_x, variance=0):
  # This function filters out all the constant columns (variance = 0)
  
  constant_filter = VarianceThreshold(threshold=variance)
  constant_filter.fit(df_data_set_x)
  non_constant = constant_filter.get_support()

  columns_to_keep = (df_data_set_x.columns[constant_filter.get_support()])

  return df_data_set_x[columns_to_keep]

In [5]:
def normalize(x_train_input, x_test_input):
  # Normalize train and test data
  # Normalize the training set and save the normalization parameters
  # Normalize the test set using the training normalization parameters

  scaler=Normalizer().fit(x_train_input)
  x_train_norm=scaler.transform(x_train_input)
  x_test_norm=scaler.transform(x_test_input)
  
  return x_train_norm, x_test_norm

In [6]:
def rescale(x_train_input, x_test_input):
  # Rescale train and test data
  # Rescale the training set and save the normalization parameters
  # Rescale the test set using the training normalization parameters

  scaler=MinMaxScaler(feature_range=(0,1))
  scaler = scaler.fit(x_train_input)
  x_train_resc = scaler.transform(x_train_input)
  x_test_resc = scaler.transform(x_test_input)
  return x_train_resc, x_test_resc

In [9]:
def compare_algorithms(x_train_input, y_train_output, x_test_input, y_test_output):

  # prints the validation accuracy (using K-Folf method on the train data set)
  # and the test accuracy by apllying a variety of models to the test data set  
  
  models=[]
  models.append(('LR', LogisticRegression(solver='liblinear'))) 
  models.append(('LDA', LinearDiscriminantAnalysis()))
  models.append(('KNN', KNeighborsClassifier()))
  #models.append(('CART', DecisionTreeClassifier()))
  #models.append(('NB', GaussianNB()))
  models.append(('SVC', SVC()))
  models.append(('LnSVC', LinearSVC()))

  print("Validation accuracy:")
  for name, model in models:
    kfold = KFold(n_splits=5, random_state=3, shuffle=True)
    cv_results = cross_val_score(model,x_train_input,y_train_output,cv=kfold,scoring="accuracy")
    print("%s: %.3f (%.3f)" % (name,cv_results.mean(),cv_results.std()))
  
  print("Test accuracy:")
  for name, model in models:
    model.fit(x_train_input, y_train_output)
    test_prediction = model.predict(x_test_input)
    test_acc = accuracy_score(y_test_output, test_prediction)
    print("%s: %.3f" % (name,test_acc))


In [37]:
def kbest(x_train_input, y_train_output, x_test_input):
  # feature selection using K-best
  
  # apply SelectKBest class to extract top 10 best features
  bestfeatures = SelectKBest(score_func=chi2, k=10)
  fit = bestfeatures.fit(x_train_input, y_train_output)
  dfscores = pd.DataFrame(fit.scores_)
  df = pd.DataFrame(x_train_input)
  dfcolumns = pd.DataFrame(df.columns)

  # concat two dataframes for better visualization 
  featureScores = pd.concat([dfcolumns,dfscores],axis=1)
  # naming the dataframe columns
  featureScores.columns = ['Specs','Score']  

  # print 10 best features
  print(featureScores.nlargest(10,'Score'))  

  # find the columns of the 10 best features
  featureScores=featureScores.nlargest(10,'Score').values
  col=list(featureScores[:,0])

  # returned train data set
  features = fit.transform(x_train_input)

  # returned test data set
  df_test_features = pd.DataFrame(x_test_input)
  test_features = df_test_features[col]

  return features, test_features.values

In [119]:
def rfe_elimination(model, x_train_input, y_train_output, x_test_input):
  # apply Recursive Feature Elimination(RFE) for feature selection

  rfe=RFE(model,10)
  fit=rfe.fit(x_train_input, y_train_output)
  #print("Num Features: %d" % fit.n_features_)
  #print("Selected Features: %s" % fit.support_)
  #print("Feature Ranking: %s" % fit.ranking_)

  dfscores = pd.DataFrame(fit.ranking_)
  df = pd.DataFrame(x_train_input)
  dfcolumns = pd.DataFrame(df.columns)

  # concat two dataframes for better visualization 
  featureScores = pd.concat([dfcolumns,dfscores],axis=1)
  # naming the dataframe columns
  featureScores.columns = ['Specs','Score']  
  print(featureScores.nsmallest(10,'Score')) 

  # find the columns of the 10 best features
  featureScores=(featureScores.nsmallest(10,'Score')).values
  col=list(featureScores[:,0])
  print(col)

  # returned train data set
  features=fit.transform(x_train_input)

  # returned test data set
  df_test_features = pd.DataFrame(x_test_input)
  test_features = df_test_features[col]

  return features, test_features.values


In [126]:
def extra_trees_classifier(x_train_input, y_train_output, x_test_input):
  # feature selection with extra tree classifier - Feature Importance

  model = ExtraTreesClassifier()
  tree=model.fit(x_train_input,y_train_output)
  #print(model.feature_importances_)

  dfscores = pd.DataFrame(tree.feature_importances_)
  df = pd.DataFrame(x_train_input)
  dfcolumns = pd.DataFrame(df.columns)
  
  # concat two dataframes for better visualization 
  featureScores = pd.concat([dfcolumns,dfscores],axis=1)
  # naming the dataframe columns
  featureScores.columns = ['Specs','Score']  
  featureScores=featureScores.nlargest(10,'Score')
  print(featureScores)

  # find the columns of the 10 best features
  featureScores=featureScores.values
  col=list(featureScores[:,0])

  # returned train data set
  features=df[col]

  # returned test data set
  df_test_features = pd.DataFrame(x_test_input)
  test_features = df_test_features[col]

  return features.values, test_features.values


In [18]:
#feature union(kbest-RFE logistic reg-extra tree)
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

def features_union(x_data, y_data):
  #create feature union
  
  # ('EXTRE', ExtraTreesClassifier())
  
  features=[
            ('KBEST', SelectKBest(k=10)),
            ('RFE', RFE(LogisticRegression(solver='liblinear'),10 ))
            
  ]

  # features = []
  # features.append(('pca', PCA(n_components=3)))
  # features.append(('select_best', SelectKBest(k=6)))

  union = FeatureUnion(features)
  union.fit(x_data, y_data)
  model = union.transform(x_data)
  
  return model


In [None]:
f_union = features_union(x_train, y_train)
print(f_union.shape)


(97044, 20)


In [None]:
#feature union(kbest-RFE logistic reg-extra tree)
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

#create feature union
features=[]
#features.append(('PCA', PCA(n_components=3)))
features.append(('KBEST', SelectKBest(k=10)))
features.append(('RFE', RFE(LogisticRegression(solver='liblinear'),10 )))
#features.append(('EXTRE', ExtraTreesClassifier()))
feature_union=FeatureUnion(features)

#create pipeline
estimators=[]
estimators.append(('feat_union', feature_union))
estimators.append(('logistic', LogisticRegression()))
model=Pipeline(estimators)

#evaluate pipeline
kfold=KFold(n_splits=5, random_state=3, shuffle=True)
results=cross_val_score(model, x_train_resc,y_train, cv=kfold)
print(results.mean())


In [None]:
#no need to reshape!!!!
#reshape
#x_train_norm = x_train_norm.reshape((len(x_train_norm), np.prod(x_train_norm.shape[1:]))) 
#x_test_norm = x_test_norm.reshape((len(x_test_norm), np.prod(x_test_norm.shape[1:])))
#print(x_train_norm.shape)

(97044, 152)


In [27]:
def autoencoder(x_train_input, x_test_input):
  # returns the encoder model of a simple autoencoder with 2 hidden layers

  input = Input(shape=(82,))
  hidden_1 = Dense(500, activation='relu')(input)
  hidden_2 = Dense(300, activation='relu')(hidden_1)
  code = Dense(20, activation='relu')(hidden_2)
  hidden_3 = Dense(300, activation='relu')(code)
  hidden_4 = Dense(500, activation='relu')(hidden_3) 
  output = Dense(82, activation='sigmoid')(hidden_4)

  # define autoencoder model
  autoencoder = Model(inputs=input, outputs=output)
  # compile model
  autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
  # fit model to reconstruct the input
  autoencoder.fit(x_train_input, x_train_input, batch_size=40, epochs=3, validation_data=(x_test_input, x_test_input))

  # define encoder model
  encoder1 = Model(inputs=input, outputs=code)
#  # save the encoder model to file
#  encoder.save("encoder1.h5")

  return encoder1

In [26]:
def denoising_autoencoder(x_train_input, x_test_input):
  # returns the encoder model of a denoising autoencoder with 2 hidden layers

  # add noise
  noise_factor = 0.4
  x_train_noisy = x_train_input + noise_factor * np.random.normal(size=x_train_input.shape)
  x_test_noisy = x_test_input + noise_factor * np.random.normal(size=x_test_input.shape)
  x_train_noisy = np.clip(x_train_noisy, 0.0, 1.0)
  x_test_noisy = np.clip(x_test_noisy, 0.0, 1.0)

  input = Input(shape=(82,))
  hidden_1 = Dense(500, activation='relu')(input)
  hidden_2 = Dense(300, activation='relu')(hidden_1)
  code = Dense(20, activation='relu')(hidden_2)
  hidden_3 = Dense(300, activation='relu')(code)
  hidden_4 = Dense(500, activation='relu')(hidden_3) 
  output = Dense(82, activation='sigmoid')(hidden_4)

  #define noisy aytoencoder model
  autoencoder_noisy = Model(inputs=input, outputs=output)
  # compile the model
  autoencoder_noisy.compile(optimizer='adam', loss='binary_crossentropy')
  # fit model to reconstruct the input
  autoencoder_noisy.fit(x_train_noisy, x_train_noisy, batch_size=40, epochs=3, validation_data=(x_test_noisy, x_test_noisy))

  # define the encoder model
  encoder2 = Model(inputs=input, outputs=code)
  #  # save the encoder model to file
  #  encoder.save("encoder2.h5")

  return encoder2

In [132]:
def sparse_autoencoder(x_train_input, x_test_input):
  #returns the encoder model of a sparse autoencoder 

  input_size = 82
  code_size = 20

  input = Input(shape=(input_size,))
  code = Dense(code_size,activity_regularizer=l1(10e-6), activation='relu')(input) 
  output = Dense(input_size, activation='sigmoid')(code)

  # define autoencoder model
  autoencoder_regularized = Model(inputs=input, outputs=output) 
  # compile the autoencoder model
  autoencoder_regularized.compile(optimizer='adam', loss='binary_crossentropy') 
  # fit autoencoder model to reconstruct the input
  autoencoder_regularized.fit(x_train_input, x_train_input, epochs=3, validation_data=(x_test_input, x_test_input))
  
  # define the encoder model
  encoder3 = Model(input, code)
  #  # save the encoder model to file
  #  encoder.save("encoder3.h5")

  return encoder3

In [133]:
def encoder_transformation(encoder, x_train_input, x_test_input):
  # use the encoder models to transform the data

  # train_data transformation
  x_train_encoded = encoder.predict(x_train_input)
  # test data transformation
  x_test_encoded = encoder.predict(x_test_input)

  return x_train_encoded, x_test_encoded

In [100]:
def create_nn_model(init="glorot_uniform", optimizer="rmsprop"):
  #returns a neural network nodel

  model=Sequential()
  model.add(Dense(150, input_dim=82, activation="relu", kernel_initializer=init))
  model.add(Dense(50, activation="relu", kernel_initializer=init))
  model.add(Dense(1, activation="softmax", kernel_initializer=init))

  model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

  return model

In [114]:
# Grid search for NN

def grid_search(input, output):
  #numpy.set_printoptions(precision=3)

  # create model
  nn_model=KerasClassifier(build_fn=create_nn_model, verbose=0)

  #grid search to find epochs, batch_size, optimizer
  epochs_list=[5,10,30,50]
  batches_list=[5,10,15]
  optimizer_list=["adam", "rmsprop"]
  inits_list=["glorot_uniform","normal", "uniform"]

  param_grid_dictionary=dict(optimizer=optimizer_list, epochs=epochs_list, batch_size=batches_list, init=inits_list)
  type(param_grid_dictionary)
  grid=GridSearchCV(estimator=nn_model,param_grid=param_grid_dictionary)
  grid_result=grid.fit(input,output)

  # summarize results
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
  means = grid_result.cv_results_['mean_test_score']
  stds = grid_result.cv_results_['std_test_score']
  params = grid_result.cv_results_['params']
  for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


In [96]:
def reduced_data(df_data, new_size=600):
  # creates a reduced dataset in order to minimize time needed for grid search

  number_of_rows = df_data.shape[0]
  random_indices = np.random.choice(number_of_rows, size=new_size, replace=False)
  random_rows = df_data.values[random_indices, :]

  X=random_rows[:,0:82]
  Y=random_rows[:,82]

  return X,Y

In [105]:
# use a reduced (train) data set for grid search 
reduced_x, reduced_y = reduced_data(df_train_data_filtered)

In [None]:
grid_search(X,Y)
"""Best: 0.471667 using {'batch_size': 5, 'epochs': 5, 'init': 'glorot_uniform', 'optimizer': 'adam'}"""

MAY NOT BE NEEDED

In [None]:
#correlation matrix plot
correlations=df_train_data_filtered.corr()
fig=pyplot.figure()
ax=fig.add_subplot(1,1,1)
cax=ax.matshow(correlations,vmin=-1, vmax=1)
fig.colorbar(cax)
pyplot.show()

In [None]:
df_train_data_filtered.corr(method="pearson")

In [None]:
#scatter plot matrix
scatter_matrix(train_data, figsize=[20,20])
pyplot.show()

In [None]:
set_option("display.width",100)
set_option("precision",2)
train_data.describe()

set_printoptions(precision=3)

In [None]:
#check if balanced
train_data.groupby("155").size()
test_data.groupby("155").size()

In [None]:
#check for skew
pd.set_option('max_rows', 99999)
pd.set_option('max_colwidth', 400)
pd.describe_option('max_colwidth')

results_skew = train_data.skew()
print(results_skew)

In [None]:
print(np.max(x_test_norm))
np.min(x_test_norm)

In [None]:
from sklearn.svm import SVR, SVC, LinearSVC
rfe_elimination(SVR(kernel="linear"), x_train, y_train)
rfe_elimination(SVC(), x_train, y_train)
rfe_elimination(LinearSVC(), x_train, y_train)

In [None]:
#without regularization (not sparsed)

#input_size = 152
##code_size = 30

#input_img = Input(shape=(input_size,))
#code = Dense(code_size, activation='relu')(input_img) 
#output_img = Dense(input_size, activation='sigmoid')(code)

#autoencoder_standard = Model(input_img, output_img) 
#autoencoder_standard.compile(optimizer='adam', loss='binary_crossentropy') 
#history_standard = autoencoder_standard.fit(x_train_norm, x_train_norm, epochs=3)
#encoded_standard = Model(input, code)

#predictions_3 = autoencoder_standard.predict(x_test_norm)

In [None]:
#GRID SEARCH FOR CNN

from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.models import Sequential
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

#numpy.set_printoptions(precision=3)


# create a function to build a model
#required for KerasClassifier
def create_model_CNN(init="glorot_uniform", optimizer="rmsprop"):
  model=Sequential()
  model.add(Conv2D(32,kernel_size=(5,5), strides=(1,1), padding="same", activation="relu", input_shape=(1,152,600)))
  model.add(MaxPooling2D(pool_size=(2,2),strides=(2,2)))
  model.add(Conv2D(64,(2,2),activation="relu",padding="same"))
  model.add(MaxPooling2D(pool_size=(2,2)))
  model.add(Dropout(0,25))
  model.add(Flatten())
  model.add(Dense(1000, activation="relu"))
  model.add(Dropout(0,5))
  model.add(Dense(2, activation="softmax"))

  model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

  return model

# create model
model_CNN=KerasClassifier(build_fn=create_model_CNN, verbose=0)

#grid search to find epochs, batch_size, optimizer
epochs_list=[100,150]
batches_list=[5,10]
optimizer_list=["adam", "rmsprop"]
inits_list=["glorot_uniform","normal", "uniform"]

param_grid_dictionary=dict(optimizer=optimizer_list, epochs=epochs_list, batch_size=batches_list, init=inits_list)
type(param_grid_dictionary)
grid=GridSearchCV(estimator=model_CNN,param_grid=param_grid_dictionary)
grid_result=grid.fit(X,Y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
  print("%f (%f) with: %r" % (mean, stdev, param))
  

RUN

DEALING WITH THE DATA

In [10]:
#load the data
filename1 ="train_imperson_without4n7_balanced_data.csv"
filename2 ="test_imperson_without4n7_balanced_data.csv"

df_train_data = read_csv(filename1)
df_test_data = read_csv(filename2)


# apply constant column filter

# not matched constant columns in the train and test data set

# merge them to one data set
df_full_data = df_train_data.append(df_test_data)

#apply the filter
df_full_data_filtered = filter_constant_columns(df_full_data)

# separate the data sets
df_train_data_filtered = df_full_data_filtered[0:97044]
df_test_data_filtered = df_full_data_filtered[97044:]


#separate the filtered x,y
x_train = df_train_data_filtered[df_train_data_filtered.columns[:-1]].values
y_train = df_train_data_filtered[df_train_data_filtered.columns[-1]].values
x_test = df_test_data_filtered[df_test_data_filtered.columns[:-1]].values
y_test = df_test_data_filtered[df_test_data_filtered.columns[-1]].values

In [11]:
df_train_data.shape

(97044, 153)

In [12]:
df_test_data.shape

(40158, 153)

In [13]:
x_train_norm, x_test_norm = normalize(x_train, x_test)

In [14]:
x_train_resc, x_test_resc = rescale(x_train, x_test)

FITTING MODELS TO COMPARE, USING: Raw data / Normalized data / Rescaled data

In [15]:
# fitting models to raw data
compare_algorithms(x_train, y_train, x_test, y_test)

Validation accuracy:
LR: 0.992 (0.001)
LDA: 0.985 (0.002)
KNN: 0.999 (0.000)
SVC: 0.995 (0.001)
LnSVC: 0.995 (0.001)
Test accuracy:
LR: 0.868
LDA: 0.931
KNN: 0.534
SVC: 0.530
LnSVC: 0.652


In [None]:
# fitting models to normalized data
compare_algorithms(x_train_norm, y_train, x_test_norm, y_test)

Validation accuracy:
LR: 0.988 (0.001)
LDA: 0.985 (0.002)
KNN: 0.999 (0.000)
CART: 1.000 (0.000)
NB: 0.986 (0.001)
SVC: 0.995 (0.001)
LnSVC: 0.992 (0.001)
Test accuracy:
LR: 0.965
LDA: 0.614
KNN: 0.532
CART: 0.541
NB: 0.500
SVC: 0.531
LnSVC: 0.525


In [36]:
# fitting models to rescaled data
compare_algorithms(x_train_resc, y_train, x_test_resc, y_test)

Validation accuracy:
LR: 0.995 (0.001)
LDA: 0.985 (0.002)
KNN: 1.000 (0.000)
SVC: 0.997 (0.001)
LnSVC: 0.997 (0.001)
Test accuracy:
LR: 0.506
LDA: 0.931
KNN: 0.536
SVC: 0.499
LnSVC: 0.509


FEATURE SELECTION (TOP 10 FEATURES) USING: K-BEST / RFE / EXTRA TREES

In [115]:
# K-Best method for raw data
kbest_train_features, kbest_test_features = kbest(x_train, y_train, x_test)

    Specs         Score
26     26  32124.682195
16     16  28936.000138
22     22  28321.490224
13     13  24346.021809
81     81  13311.733744
15     15  12317.666392
2       2  12076.290588
3       3  12076.290588
23     23   7911.296191
36     36   7720.479234


In [116]:
# K-Best method for normalized data
kbest_train_features_norm, kbest_test_features_norm = kbest(x_train_norm, y_train, x_test_norm)

    Specs        Score
26     26  7185.552254
22     22  6458.853639
16     16  6265.412604
13     13  5162.369647
81     81  2648.775398
15     15  2569.185754
2       2  2378.606418
3       3  2378.606418
23     23  2091.641720
28     28  1758.070006


In [40]:
# K-Best method for rescaled data
kbest_train_features_resc, kbest_test_features_resc = kbest(x_train_resc, y_train, x_test_resc)

    Specs         Score
26     26  32124.682195
16     16  28936.000138
22     22  28321.490224
13     13  24346.021809
81     81  13311.733744
15     15  12317.666392
2       2  12076.290588
3       3  12076.290588
23     23   7911.296191
36     36   7720.479234


In [122]:
# Recursive Feature Elimination method for raw data
rfe_train_features, rfe_test_features = rfe_elimination(LogisticRegression(solver='liblinear'), x_train, y_train, x_test)

    Specs  Score
3       3      1
13     13      1
22     22      1
25     25      1
36     36      1
44     44      1
46     46      1
50     50      1
55     55      1
74     74      1
[3, 13, 22, 25, 36, 44, 46, 50, 55, 74]


In [120]:
# Recursive Feature Elimination method for normalized data
rfe_train_features_norm, rfe_test_features_norm = rfe_elimination(LogisticRegression(solver='liblinear'), x_train_norm, y_train, x_test_norm)

    Specs  Score
13     13      1
15     15      1
18     18      1
22     22      1
26     26      1
36     36      1
44     44      1
46     46      1
69     69      1
73     73      1
[13, 15, 18, 22, 26, 36, 44, 46, 69, 73]


In [121]:
# Recursive Feature Elimination method for rescaled data
rfe_train_features_resc, rfe_test_features_resc = rfe_elimination(LogisticRegression(solver='liblinear'), x_train_resc, y_train, x_test_resc)

    Specs  Score
13     13      1
20     20      1
23     23      1
25     25      1
36     36      1
50     50      1
51     51      1
55     55      1
62     62      1
74     74      1
[13, 20, 23, 25, 36, 50, 51, 55, 62, 74]


In [127]:
# Extra Trees Classifier method for raw data
extree_train_features, extree_test_features = extra_trees_classifier(x_train, y_train, x_test)

    Specs     Score
22     22  0.106122
26     26  0.100525
15     15  0.078155
13     13  0.074158
23     23  0.067266
16     16  0.059308
28     28  0.058513
2       2  0.053410
36     36  0.043418
3       3  0.038511


In [128]:
# Extra Trees Classifier method for normalized data
extree_train_features_norm, extree_test_features_norm = extra_trees_classifier(x_train_norm, y_train, x_test_norm)

    Specs     Score
26     26  0.105521
15     15  0.089479
22     22  0.082283
16     16  0.074157
23     23  0.070151
28     28  0.044706
13     13  0.044182
21     21  0.036853
11     11  0.032722
3       3  0.027792


In [129]:
# Extra Trees Classifier method for rescaled data
extree_train_features_resc, extree_test_features_resc = extra_trees_classifier(x_train_resc, y_train, x_test_resc)

    Specs     Score
22     22  0.107082
26     26  0.098998
15     15  0.091695
16     16  0.082208
28     28  0.079847
23     23  0.067989
36     36  0.060018
13     13  0.055606
81     81  0.036854
11     11  0.034167


MODEL EVALUATION WITH SELECTED FEATURES

In [None]:
# raw data

In [130]:
# compare models using top 10 features from K-Best for raw data
compare_algorithms(kbest_train_features, y_train, kbest_test_features, y_test)

Validation accuracy:
LR: 0.961 (0.001)
LDA: 0.928 (0.001)
KNN: 0.998 (0.000)
SVC: 0.992 (0.001)
LnSVC: 0.960 (0.001)
Test accuracy:
LR: 0.500
LDA: 0.277
KNN: 0.500
SVC: 0.500
LnSVC: 0.500


In [139]:
# compare models using top 10 features from RFE for raw data
compare_algorithms(rfe_train_features, y_train, rfe_test_features, y_test)

Validation accuracy:
LR: 0.985 (0.001)
LDA: 0.969 (0.002)
KNN: 0.999 (0.000)
SVC: 0.997 (0.000)
LnSVC: 0.993 (0.001)
Test accuracy:
LR: 0.861
LDA: 0.820
KNN: 0.531
SVC: 0.521
LnSVC: 0.866


In [140]:
# compare models using top 10 features from Extra Trees for raw data
compare_algorithms(extree_train_features, y_train, extree_test_features, y_test)

Validation accuracy:
LR: 0.961 (0.001)
LDA: 0.932 (0.002)
KNN: 0.998 (0.000)
SVC: 0.992 (0.001)
LnSVC: 0.961 (0.001)
Test accuracy:
LR: 0.975
LDA: 0.910
KNN: 0.533
SVC: 0.527
LnSVC: 0.975


In [None]:
# normalized data

In [141]:
# compare models using top 10 features from K-Best for normalized data
compare_algorithms(kbest_train_features_norm, y_train, kbest_test_features_norm, y_test)

Validation accuracy:
LR: 0.929 (0.002)
LDA: 0.928 (0.002)
KNN: 0.997 (0.000)
SVC: 0.981 (0.002)
LnSVC: 0.941 (0.002)
Test accuracy:
LR: 0.500
LDA: 0.675
KNN: 0.500
SVC: 0.500
LnSVC: 0.499


In [142]:
# compare models using top 10 features from RFE for normalized data
compare_algorithms(rfe_train_features_norm, y_train, rfe_test_features_norm, y_test)

Validation accuracy:
LR: 0.987 (0.001)
LDA: 0.974 (0.002)
KNN: 0.998 (0.000)
SVC: 0.991 (0.001)
LnSVC: 0.988 (0.002)
Test accuracy:
LR: 0.520
LDA: 0.958
KNN: 0.532
SVC: 0.527
LnSVC: 0.519


In [143]:
# compare models using top 10 features from Extra Trees for normalized data
compare_algorithms(extree_train_features_norm, y_train, extree_test_features_norm, y_test)

Validation accuracy:
LR: 0.931 (0.002)
LDA: 0.929 (0.002)
KNN: 0.999 (0.000)
SVC: 0.993 (0.000)
LnSVC: 0.950 (0.001)
Test accuracy:
LR: 0.917
LDA: 0.887
KNN: 0.528
SVC: 0.521
LnSVC: 0.966


In [None]:
# rescaled data

In [43]:
# compare models using top 10 features from K-Best for rescaled data
compare_algorithms(kbest_train_features_resc, y_train, kbest_test_features_resc, y_test)

Validation accuracy:
LR: 0.961 (0.001)
LDA: 0.928 (0.001)
KNN: 0.998 (0.000)
SVC: 0.992 (0.001)
LnSVC: 0.960 (0.001)
Test accuracy:
LR: 0.500
LDA: 0.277
KNN: 0.500
SVC: 0.500
LnSVC: 0.500


In [44]:
# compare models using top 10 features from RFE for rescaled data
compare_algorithms(rfe_train_features_resc, y_train, rfe_test_features_resc, y_test)

Validation accuracy:
LR: 0.992 (0.001)
LDA: 0.980 (0.002)
KNN: 0.999 (0.000)
SVC: 0.997 (0.001)
LnSVC: 0.995 (0.001)
Test accuracy:
LR: 0.813
LDA: 0.783
KNN: 0.535
SVC: 0.494
LnSVC: 0.816


In [45]:
# compare models using top 10 features from Extra Trees for rescaled data
compare_algorithms(extree_train_features_resc, y_train, extree_test_features_resc, y_test)

Validation accuracy:
LR: 0.969 (0.002)
LDA: 0.938 (0.002)
KNN: 0.997 (0.000)
SVC: 0.988 (0.000)
LnSVC: 0.971 (0.002)
Test accuracy:
LR: 0.959
LDA: 0.916
KNN: 0.521
SVC: 0.502
LnSVC: 0.958


USE OF AUTOENCODERS FOR FEATURE EXTRACTION: SIMPLE / DENOISING / SPARSE

---




In [28]:
# creating the encoder model of the simple autoencoder
# for the raw, normalized and rescaled data

encoder1_raw = autoencoder(x_train, x_test)
encoder1_norm = autoencoder(x_train_norm, x_test_norm)
encoder1_resc = autoencoder(x_train_resc, x_test_resc)


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [59]:
# creating the encoder model of the denoising autoencoder
# for the raw, normalized and rescaled data

encoder2_raw = denoising_autoencoder(x_train, x_test)
encoder2_norm = denoising_autoencoder(x_train_norm, x_test_norm)
encoder2_resc = denoising_autoencoder(x_train_resc, x_test_resc)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [134]:
# creating the encoder model of sparse autoencoder
# for the raw, normalized and rescaled data
encoder3_raw = sparse_autoencoder(x_train, x_test)
encoder3_norm = sparse_autoencoder(x_train_norm, x_test_norm)
encoder3_resc = sparse_autoencoder(x_train_resc, x_test_resc) 

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [51]:
# data transformation - encoder1
x_train_enc_1, x_test_enc_1 = encoder_transformation(encoder1_raw, x_train, x_test)
x_train_norm_enc_1, x_test_norm_enc_1 = encoder_transformation(encoder1_norm, x_train_norm, x_test_norm) 
x_train_resc_enc_1, x_test_resc_enc_1 = encoder_transformation(encoder1_resc, x_train_resc, x_test_resc)


In [60]:
# data transformation - encoder2
x_train_enc_2, x_test_enc_2 = encoder_transformation(encoder2_raw, x_train, x_test)
x_train_norm_enc_2, x_test_norm_enc_2 = encoder_transformation(encoder2_norm, x_train_norm, x_test_norm) 
x_train_resc_enc_2, x_test_resc_enc_2 = encoder_transformation(encoder2_resc, x_train_resc, x_test_resc)

In [135]:
# data transformation - encoder3
x_train_enc_3, x_test_enc_3 = encoder_transformation(encoder3_raw, x_train, x_test)
x_train_norm_enc_3, x_test_norm_enc_3 = encoder_transformation(encoder3_norm, x_train_norm, x_test_norm) 
x_train_resc_enc_3, x_test_resc_enc_3 = encoder_transformation(encoder3_resc, x_train_resc, x_test_resc)

In [131]:
print(x_train_enc_3.shape)

(97044, 30)


FITTING MODELS TO EXTRACTED DATA

In [46]:
# encoder1 - raw data
compare_algorithms(x_train_enc_1, y_train, x_test_enc_1, y_test)

Validation accuracy:
LR: 0.973 (0.002)
LDA: 0.950 (0.002)
KNN: 0.999 (0.000)
SVC: 0.997 (0.000)




LnSVC: 0.974 (0.001)
Test accuracy:
LR: 0.569
LDA: 0.943
KNN: 0.534
SVC: 0.534
LnSVC: 0.566




In [30]:
# encoder1 - normalized data
compare_algorithms(x_train_norm_enc_1, y_train, x_test_norm_enc_1, y_test)

Validation accuracy:
LR: 0.985 (0.002)
LDA: 0.967 (0.001)
KNN: 0.999 (0.000)
SVC: 0.995 (0.001)




LnSVC: 0.985 (0.002)
Test accuracy:
LR: 0.868
LDA: 0.924
KNN: 0.518
SVC: 0.500
LnSVC: 0.874




In [52]:
# encoder1 - rescaled data
compare_algorithms(x_train_resc_enc_1, y_train, x_test_resc_enc_1, y_test)

Validation accuracy:
LR: 0.994 (0.001)
LDA: 0.985 (0.001)
KNN: 0.999 (0.000)
SVC: 0.999 (0.000)




LnSVC: 0.994 (0.001)
Test accuracy:
LR: 0.609
LDA: 0.810
KNN: 0.500
SVC: 0.500
LnSVC: 0.534




In [61]:
# encoder2 - raw data
compare_algorithms(x_train_enc_2, y_train, x_test_enc_2, y_test)

Validation accuracy:
LR: 0.991 (0.001)
LDA: 0.980 (0.002)
KNN: 0.999 (0.000)
SVC: 0.995 (0.000)




LnSVC: 0.993 (0.001)
Test accuracy:
LR: 0.552
LDA: 0.502
KNN: 0.531
SVC: 0.525
LnSVC: 0.532




In [63]:
# encoder2 - normalized data
compare_algorithms(x_train_norm_enc_2, y_train, x_test_norm_enc_2, y_test)

Validation accuracy:
LR: 0.985 (0.001)
LDA: 0.953 (0.001)
KNN: 0.999 (0.000)
SVC: 0.980 (0.001)




LnSVC: 0.990 (0.001)
Test accuracy:
LR: 0.945
LDA: 0.500
KNN: 0.740
SVC: 0.951
LnSVC: 0.929




In [65]:
# encoder2 - rescaled data data
compare_algorithms(x_train_resc_enc_2, y_train, x_test_resc_enc_2, y_test)

Validation accuracy:
LR: 0.995 (0.001)
LDA: 0.978 (0.002)
KNN: 0.999 (0.000)
SVC: 0.998 (0.000)




LnSVC: 0.995 (0.001)
Test accuracy:
LR: 0.508
LDA: 0.504
KNN: 0.500
SVC: 0.500
LnSVC: 0.506




In [136]:
# encoder3 - raw data
compare_algorithms(x_train_enc_3, y_train, x_test_enc_3, y_test)

Validation accuracy:
LR: 0.992 (0.001)
LDA: 0.977 (0.002)
KNN: 0.999 (0.000)
SVC: 0.996 (0.001)




LnSVC: 0.992 (0.001)
Test accuracy:
LR: 0.867
LDA: 0.824
KNN: 0.508
SVC: 0.598
LnSVC: 0.512




In [137]:
# encoder3 - normalized data
compare_algorithms(x_train_norm_enc_3, y_train, x_test_norm_enc_3, y_test)

Validation accuracy:
LR: 0.988 (0.002)
LDA: 0.974 (0.002)
KNN: 0.999 (0.000)
SVC: 0.992 (0.001)




LnSVC: 0.990 (0.001)
Test accuracy:
LR: 0.896
LDA: 0.850
KNN: 0.531
SVC: 0.524
LnSVC: 0.866




In [138]:
# encoder3 - rescaled data
compare_algorithms(x_train_resc_enc_3, y_train, x_test_resc_enc_3, y_test)

Validation accuracy:
LR: 0.991 (0.001)
LDA: 0.984 (0.001)
KNN: 0.999 (0.000)
SVC: 0.998 (0.000)




LnSVC: 0.993 (0.000)
Test accuracy:
LR: 0.953
LDA: 0.803
KNN: 0.500
SVC: 0.581
LnSVC: 0.474


