Variables to change:

1. jar_file_loc: this is the file of the .jar postgres file. Change path to where it is located
2. train_data_path & test_data_path: change to path of train/test csv files

In [1]:
# install if necessary: tensorflow version needs to be 2.10.x
# RESTART Kernel after installation

!pip install tensorflow==2.10.0



In [2]:
import tensorflow as tf  # now import the tensorflow module
print(tf.__version__)  # make sure the version is 2.x

caused by: ["[Errno 2] The file to load file system plugin from does not exist.: '/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'"]
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: cannot open shared object file: No such file or directory']


2.10.0


# Task I

In [3]:
jar_file_loc = 'postgresql-42.5.0.jar'

# set up Spark
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("GenericAppName") \
    .config('spark.jars', jar_file_loc) \
    .getOrCreate()


#Access SparkContext from your SparkSession
print("APP Name :"+ spark.sparkContext.appName);
print("Master :"+ spark.sparkContext.master);

sqlContext = SQLContext(spark.sparkContext)


APP Name :GenericAppName
Master :local[*]




In [4]:
# read in data
df_train = spark.read.csv('data_folder/train70_reduced.csv', header = True, inferSchema = True)
df_test = spark.read.csv('data_folder/test30_reduced.csv', header = True, inferSchema = True)

In [5]:
# add column to differentiate b/w train and test sets
from pyspark.sql.functions import col, lit

df_train_cat = df_train.withColumn("data_category", lit("train"))
df_test_cat = df_test.withColumn("data_category", lit("test"))

print('Item Count\n')
print('Train:', df_train_cat.count())
print('Test:', df_test_cat.count())


# combine dfs
df_combined = df_train_cat.union(df_test_cat)
print('Combined:', df_combined.count())

Item Count

Train: 231646
Test: 84351
Combined: 315997


In [6]:
# write into postgresql db

db_properties={}
#update your db username
db_properties['username']="postgres"
#update your db password
db_properties['password']="bigdata"
#make sure you got the right port number here
db_properties['url']= "jdbc:postgresql://host.docker.internal/postgres"
#make sure you had the Postgres JAR file in the right location
db_properties['driver']="org.postgresql.Driver"
db_properties['table']= "mqtt"

# create df with train data 
df_combined.write.format("jdbc")\
.mode("overwrite")\
.option("url", db_properties['url'])\
.option("dbtable", db_properties['table'])\
.option("user", db_properties['username'])\
.option("password", "bigdata")\
.option("Driver", db_properties['driver'])\
.save()

In [7]:
# read db to ensure data has been written in correctly
df_read = sqlContext.read.format("jdbc")\
    .option("url", db_properties['url'])\
    .option("dbtable", db_properties['table'])\
    .option("user", db_properties['username'])\
    .option("password", "bigdata")\
    .option("Driver", db_properties['driver'])\
    .load()

print('Item Count from PostgreSQL Read:', df_read.count())

Item Count from PostgreSQL Read: 315997


# Task III

### Data Processing

In [8]:
# replacing all . with _ in column names to avoid bugs with having . in col name

col_names_underscore = ['tcp_flags','tcp_time_delta','tcp_len','mqtt_conack_flags','mqtt_conack_flags_reserved',
             'mqtt_conack_flags_sp','mqtt_conack_val','mqtt_conflag_cleansess','mqtt_conflag_passwd',
             'mqtt_conflag_qos','mqtt_conflag_reserved','mqtt_conflag_retain','mqtt_conflag_uname',
             'mqtt_conflag_willflag','mqtt_conflags','mqtt_dupflag','mqtt_hdrflags','mqtt_kalive',
             'mqtt_len','mqtt_msg','mqtt_msgid','mqtt_msgtype','mqtt_proto_len','mqtt_protoname',
             'mqtt_qos','mqtt_retain','mqtt_sub_qos','mqtt_suback_qos','mqtt_ver','mqtt_willmsg',
             'mqtt_willmsg_len','mqtt_willtopic','mqtt_willtopic_len','target','data_category']

df_read_underscore = df_read.toDF(*col_names_underscore)

In [9]:
# remove all zero columns & absurd cols

zero_cols = ['mqtt_conack_flags','mqtt_conack_flags_reserved','mqtt_conack_flags_sp','mqtt_conack_val',
            'mqtt_conflag_qos','mqtt_conflag_reserved','mqtt_conflag_retain','mqtt_conflag_willflag',
            'mqtt_retain','mqtt_sub_qos','mqtt_suback_qos','mqtt_willmsg',
             'mqtt_willmsg_len','mqtt_willtopic','mqtt_willtopic_len']

absurd_cols = ['mqtt_msg', 'mqtt_msgid']


df_read_underscore_dropped = df_read_underscore.drop(*zero_cols+absurd_cols)

In [10]:
# split train and test
df_read_train = df_read_underscore_dropped.where(df_read_underscore_dropped['data_category'] == 'train')
df_read_test = df_read_underscore_dropped.where(df_read_underscore_dropped['data_category'] == 'test')


# drop 'data_category' col
df_read_train = df_read_train.drop('data_category')
df_read_test = df_read_test.drop('data_category')

In [11]:
# PySpark
import pyspark
from pyspark.sql import SparkSession, SQLContext
from pyspark.ml import Pipeline,Transformer
from pyspark.ml.feature import Imputer,StandardScaler,StringIndexer,OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression

from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np


In [31]:
# deal with correlations
correlation_matrix = df_read_underscore_dropped.toPandas().corr()
print(correlation_matrix)

                        tcp_time_delta   tcp_len  mqtt_conflag_cleansess  \
tcp_time_delta                1.000000 -0.006952               -0.009565   
tcp_len                      -0.006952  1.000000               -0.013370   
mqtt_conflag_cleansess       -0.009565 -0.013370                1.000000   
mqtt_conflag_passwd          -0.006303 -0.008609                0.658921   
mqtt_conflag_uname           -0.006312 -0.008623                0.659891   
mqtt_dupflag                 -0.018682  0.159388               -0.024359   
mqtt_kalive                  -0.005286 -0.008729                0.552715   
mqtt_len                     -0.036825  0.274375                0.001855   
mqtt_msgtype                  0.283643  0.085299               -0.056337   
mqtt_proto_len               -0.009565 -0.013370                1.000000   
mqtt_qos                     -0.037996  0.271391               -0.044270   
mqtt_ver                     -0.009565 -0.013370                1.000000   

           

  correlation_matrix = df_read_underscore_dropped.toPandas().corr()


In [12]:
col_names_underscore = ['tcp_flags','tcp_time_delta','tcp_len','mqtt_conack_flags','mqtt_conack_flags_reserved',
             'mqtt_conack_flags_sp','mqtt_conack_val','mqtt_conflag_cleansess','mqtt_conflag_passwd',
             'mqtt_conflag_qos','mqtt_conflag_reserved','mqtt_conflag_retain','mqtt_conflag_uname',
             'mqtt_conflag_willflag','mqtt_conflags','mqtt_dupflag','mqtt_hdrflags','mqtt_kalive',
             'mqtt_len','mqtt_msg','mqtt_msgid','mqtt_msgtype','mqtt_proto_len','mqtt_protoname',
             'mqtt_qos','mqtt_retain','mqtt_sub_qos','mqtt_suback_qos','mqtt_ver','mqtt_willmsg',
             'mqtt_willmsg_len','mqtt_willtopic','mqtt_willtopic_len','target'] # 'data_category' removed


nominal_cols = ['tcp_flags','mqtt_conflags','mqtt_hdrflags', 'mqtt_protoname']

continuous_cols = ['tcp_time_delta', 'tcp_len','mqtt_kalive','mqtt_len',  'mqtt_msgtype',
                  'mqtt_proto_len','mqtt_qos', 'mqtt_ver']

binary_cols = ['mqtt_conflag_cleansess', 'mqtt_conflag_passwd','mqtt_conflag_uname', 'mqtt_dupflag']


keys = ['slowite', 'bruteforce','flood', 'malformed', 'dos', 'legitimate']
vals = [0, 1, 2, 3, 4, 5]

global label_dict
label_dict = dict(zip(keys, vals))

# ===========================================================================

class OutcomeCreater(Transformer): # this defines a transformer that creates the outcome column
    
    def __init__(self):
        super().__init__()

    def _transform(self, dataset):
        label_to_multiclass = udf(lambda name: label_dict[name])
        output_df = dataset.withColumn('outcome', label_to_multiclass(col('target'))).drop('target')
        output_df = output_df.withColumn('outcome', col('outcome').cast(DoubleType()))
        return output_df

class FeatureTypeCaster(Transformer): # this transformer will cast the columns as appropriate types  
    def __init__(self):
        super().__init__()

    def _transform(self, dataset):
        output_df = dataset
        for col_name in binary_cols + continuous_cols:
            output_df = output_df.withColumn(col_name,col(col_name).cast(DoubleType()))

        return output_df
    
class ColumnDropper(Transformer): # this transformer drops uannecessary columns
    def __init__(self, columns_to_drop = None):
        super().__init__()
        self.columns_to_drop=columns_to_drop
    def _transform(self, dataset):
        output_df = dataset
        for col_name in self.columns_to_drop:
            output_df = output_df.drop(col_name)
        return output_df

def get_preprocess_pipeline():
    # Stage where columns are casted as appropriate types
    stage_typecaster = FeatureTypeCaster()

    # Stage where nominal columns are transformed to index columns using StringIndexer
    nominal_id_cols = [x+"_index" for x in nominal_cols]
    nominal_onehot_cols = [x+"_encoded" for x in nominal_cols]
    stage_nominal_indexer = StringIndexer(inputCols = nominal_cols, outputCols = nominal_id_cols )
    
    # Stage where the index columns are further transformed using OneHotEncoder
    stage_nominal_onehot_encoder = OneHotEncoder(inputCols=nominal_id_cols, outputCols=nominal_onehot_cols)
    
    # Stage where all relevant features are assembled into a vector (and dropping a few)
    feature_cols = continuous_cols+binary_cols+nominal_onehot_cols
    corelated_cols_to_remove = ['mqtt_conflag_uname','mqtt_qos','mqtt_proto_len', 'mqtt_ver']

    for col_name in corelated_cols_to_remove:
        feature_cols.remove(col_name)
    
    stage_vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol="vectorized_features")

    # Stage where we scale the columns
    stage_scaler = StandardScaler(inputCol= 'vectorized_features', outputCol= 'features')
    

    # Stage for creating the outcome column representing whether there is attack 
    stage_outcome = OutcomeCreater()

    # Removing all unnecessary columns, only keeping the 'features' and 'outcome' columns
    stage_column_dropper = ColumnDropper(columns_to_drop = nominal_cols+nominal_id_cols+
        nominal_onehot_cols+ binary_cols + continuous_cols + ['vectorized_features'])
     
        
    # fit with logistic regression
    lr = LogisticRegression(featuresCol = 'features', labelCol = 'outcome', maxIter=10)
    
    # Connect the columns into a pipeline
    pipeline = Pipeline(stages=[stage_typecaster,stage_nominal_indexer,stage_nominal_onehot_encoder,
        stage_vector_assembler,stage_scaler,stage_outcome,stage_column_dropper])
    return pipeline

In [13]:
# fit and transform
preprocess_pipeline = get_preprocess_pipeline()
preprocess_pipeline_model = preprocess_pipeline.fit(df_read_train)

train_transform = preprocess_pipeline_model.transform(df_read_train)
test_transform = preprocess_pipeline_model.transform(df_read_test)

### Machine Learning - PySpark

The classifiers chosen are: Logistic Regression & Random Forest.

#### Standard Training

In [14]:
# logistic regression

lr = LogisticRegression(featuresCol = 'features', labelCol = 'outcome', maxIter=5)
lr_fit = lr.fit(train_transform)

lr_preds_train = lr_fit.transform(train_transform)
lr_preds_test = lr_fit.transform(test_transform)

# random forest
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'outcome')
rf_fit = rf.fit(train_transform)

rf_preds_train = rf_fit.transform(train_transform)
rf_preds_test = rf_fit.transform(test_transform)

In [15]:
# accuracies

# logistic regression
lr_accuracy_train = (lr_preds_train.filter(lr_preds_train.outcome == lr_preds_train.prediction)
    .count() / float(lr_preds_train.count()))

lr_accuracy_test = (lr_preds_test.filter(lr_preds_test.outcome == lr_preds_test.prediction)
    .count() / float(lr_preds_test.count()))

print('Logistic Regression')
print("Train Accuracy :", lr_accuracy_train)
print("Test Accuracy :", lr_accuracy_test)


# rf
rf_accuracy_train = (rf_preds_train.filter(rf_preds_train.outcome == rf_preds_train.prediction)
    .count() / float(rf_preds_train.count()))

rf_accuracy_test = (rf_preds_test.filter(rf_preds_test.outcome == rf_preds_test.prediction)
    .count() / float(rf_preds_test.count()))

print('\nRandom Forest')
print("Train Accuracy :", rf_accuracy_train)
print("Test Accuracy :", rf_accuracy_test)

Logistic Regression
Train Accuracy : 0.8225481985443306
Test Accuracy : 0.8164692771869925

Random Forest
Train Accuracy : 0.827085293939891
Test Accuracy : 0.8378561012910339


#### Hyperparameter Tuning

In [16]:
# logistic regression

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

lr = LogisticRegression(featuresCol = 'features', labelCol = 'outcome')

# Create ParamGrid for Cross Validation
lr_paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.0001, 1.0])
             .addGrid(lr.maxIter, [10, 50])
             .build())

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', 
    labelCol='outcome', metricName='accuracy')

lr_cv = CrossValidator(estimator=lr, estimatorParamMaps=lr_paramGrid, 
                    evaluator=evaluator, numFolds=5)

lr_cv_fit_train = lr_cv.fit(train_transform)
lr_cv_preds_test = lr_cv_fit_train.transform(test_transform)



In [17]:
# random forest

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'outcome')

rf_paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [10, 15])# maximum depth for each tree
             .addGrid(rf.numTrees,[30, 60])# number of trues
             .build())

evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', 
    labelCol='outcome', metricName='accuracy')

rf_cv = CrossValidator(estimator=rf, estimatorParamMaps=rf_paramGrid, 
                    evaluator=evaluator, numFolds=5)

rf_cv_fit_train = rf_cv.fit(train_transform)

rf_cv_preds_test = rf_cv_fit_train.transform(test_transform)

In [18]:
# logistic regression
lr_accuracy_test_cv = (lr_cv_preds_test.filter(lr_cv_preds_test.outcome == lr_cv_preds_test.prediction)
    .count() / float(lr_cv_preds_test.count()))

# random forest
rf_accuracy_test_cv = (rf_cv_preds_test.filter(rf_cv_preds_test.outcome == rf_cv_preds_test.prediction)
    .count() / float(rf_cv_preds_test.count()))

print('\nLogistic Regression')
print("Pre-CV:", lr_accuracy_test)
print("Post-CV:", lr_accuracy_test_cv)


print("\nRandom Forest")
print("Pre-CV:", rf_accuracy_test)
print("Post-CV:", rf_accuracy_test_cv)



Logistic Regression
Pre-CV: 0.8164692771869925
Post-CV: 0.8269137295349196

Random Forest
Pre-CV: 0.8378561012910339
Post-CV: 0.9015660750909888


### Machine Learning - TensorFlow

The two classifiers chosen are: a shallow NN and a deep NN. The shallow NN only has 2 hidden layers, while the deep NN has 5 hidden layers.

In [19]:
import tensorflow as tf  # now import the tensorflow module
print(tf.__version__)  # make sure the version is 2.x

2.10.0


In [20]:
import numpy as np
from tensorflow import keras

In [21]:
# create tensors

x_train = tf.constant(np.array(train_transform.toPandas()['features'].values.tolist()))
y_train = tf.constant(np.array(train_transform.toPandas()['outcome'].values.tolist()))

x_test = tf.constant(np.array(test_transform.toPandas()['features'].values.tolist()))
y_test = tf.constant(np.array(test_transform.toPandas()['outcome'].values.tolist()))

In [22]:
# Shallow NN (2 hidden layers)
from tensorflow import keras

model_shallow = keras.Sequential()

model_shallow.add(keras.layers.Dense(10, activation = 'relu'))
model_shallow.add(keras.layers.Dense(10, activation = 'relu'))

model_shallow.add(keras.layers.Dense(6))

model_shallow.compile(optimizer = keras.optimizers.SGD(), loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[keras.metrics.SparseCategoricalAccuracy()])
model_shallow.fit(x_train, y_train, epochs = 10, verbose = 2)
model_shallow.evaluate(x_test, y_test, verbose = 2)

Epoch 1/10
7239/7239 - 4s - loss: 0.4974 - sparse_categorical_accuracy: 0.7947 - 4s/epoch - 526us/step
Epoch 2/10
7239/7239 - 4s - loss: 0.4424 - sparse_categorical_accuracy: 0.8144 - 4s/epoch - 524us/step
Epoch 3/10
7239/7239 - 4s - loss: 0.4354 - sparse_categorical_accuracy: 0.8186 - 4s/epoch - 504us/step
Epoch 4/10
7239/7239 - 4s - loss: 0.4307 - sparse_categorical_accuracy: 0.8219 - 4s/epoch - 503us/step
Epoch 5/10
7239/7239 - 4s - loss: 0.4273 - sparse_categorical_accuracy: 0.8237 - 4s/epoch - 575us/step
Epoch 6/10
7239/7239 - 4s - loss: 0.4240 - sparse_categorical_accuracy: 0.8257 - 4s/epoch - 549us/step
Epoch 7/10
7239/7239 - 4s - loss: 0.4226 - sparse_categorical_accuracy: 0.8258 - 4s/epoch - 515us/step
Epoch 8/10
7239/7239 - 4s - loss: 0.4218 - sparse_categorical_accuracy: 0.8260 - 4s/epoch - 504us/step
Epoch 9/10
7239/7239 - 4s - loss: 0.4213 - sparse_categorical_accuracy: 0.8261 - 4s/epoch - 504us/step
Epoch 10/10
7239/7239 - 4s - loss: 0.4210 - sparse_categorical_accuracy: 

[0.4313971996307373, 0.8269611597061157]

In [23]:
# Deep NN (2 hidden layers)
from tensorflow import keras

model_deep = keras.Sequential()

model_deep.add(keras.layers.Dense(10, activation = 'relu'))
model_deep.add(keras.layers.Dense(10, activation = 'relu'))
model_deep.add(keras.layers.Dense(10, activation = 'relu'))
model_deep.add(keras.layers.Dense(10, activation = 'relu'))
model_deep.add(keras.layers.Dense(10, activation = 'relu'))


model_deep.add(keras.layers.Dense(6))

model_deep.compile(optimizer = keras.optimizers.SGD(), loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[keras.metrics.SparseCategoricalAccuracy()])
model_deep.fit(x_train, y_train, epochs = 10, verbose = 2)
model_deep.evaluate(x_test, y_test, verbose = 2)

Epoch 1/10
7239/7239 - 4s - loss: 0.5754 - sparse_categorical_accuracy: 0.7747 - 4s/epoch - 612us/step
Epoch 2/10
7239/7239 - 4s - loss: 0.4409 - sparse_categorical_accuracy: 0.8191 - 4s/epoch - 541us/step
Epoch 3/10
7239/7239 - 4s - loss: 0.4365 - sparse_categorical_accuracy: 0.8215 - 4s/epoch - 529us/step
Epoch 4/10
7239/7239 - 4s - loss: 0.4362 - sparse_categorical_accuracy: 0.8216 - 4s/epoch - 531us/step
Epoch 5/10
7239/7239 - 5s - loss: 0.4340 - sparse_categorical_accuracy: 0.8218 - 5s/epoch - 640us/step
Epoch 6/10
7239/7239 - 4s - loss: 0.4335 - sparse_categorical_accuracy: 0.8219 - 4s/epoch - 603us/step
Epoch 7/10
7239/7239 - 4s - loss: 0.4328 - sparse_categorical_accuracy: 0.8218 - 4s/epoch - 574us/step
Epoch 8/10
7239/7239 - 4s - loss: 0.4320 - sparse_categorical_accuracy: 0.8218 - 4s/epoch - 538us/step
Epoch 9/10
7239/7239 - 4s - loss: 0.4314 - sparse_categorical_accuracy: 0.8217 - 4s/epoch - 537us/step
Epoch 10/10
7239/7239 - 4s - loss: 0.4303 - sparse_categorical_accuracy: 

[0.44072425365448, 0.8159002065658569]

#### Hyperparameter Tuning

In [24]:
# combine the x and y train tensors and shuffle
tf_train = tf.concat([x_train, tf.reshape(y_train, [-1, 1])], axis = 1)
tf_train_shuffle = tf.random.shuffle(tf_train)

In [25]:
def shallow_cross_val_activation_width(k, act_fun, width):
    
    cuts = np.linspace(0, tf_train_shuffle.shape[0]-1, k+1, dtype = int)
    
    model = keras.Sequential()
    model.add(keras.layers.Dense(width, activation = act_fun))
    model.add(keras.layers.Dense(width, activation = act_fun))
    model.add(keras.layers.Dense(6))

    metric = []

    for i in range(k):
        print('k = '+str(i+1)+'\n')
        
        val = tf_train_shuffle[cuts[i]:cuts[i+1]]
            
        mask = np.ones(tf_train_shuffle.shape[0])
        mask[cuts[i]:cuts[i+1]] = 0
        
        train = tf.boolean_mask(tf_train_shuffle, mask)
    
        cur_x_train = train[:,:-1]
        cur_y_train = train[:,-1]
        
        cur_x_val = val[:,:-1]
        cur_y_val = val[:,-1]
        
        model.compile(optimizer = keras.optimizers.SGD(), 
                      loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      metrics=[keras.metrics.SparseCategoricalAccuracy()])
        
        fit_data = model.fit(cur_x_train, cur_y_train, epochs = 5, 
                             verbose = 2,validation_data = (cur_x_val, cur_y_val))
        
        cur_auc = np.mean(fit_data.history['val_sparse_categorical_accuracy'])
        metric.append(cur_auc)
        
        if np.max(metric) == cur_auc:
            best_model = model
            print('\nNew best model saved.')
        
        print('\n')
        
    print(metric)
    return np.mean(metric), best_model.evaluate(x_test, y_test, verbose = 2), best_model


# val_acc, test_res, best_model_shallow = shallow_cross_val_activation_width(k = 4, act_fun = 'relu', width = 5)

# print('\n===========')
# print("Validation Accuracy:", val_acc)
# print("Test Accuracy:", test_res[1])

In [26]:
def deep_cross_val_activation_width(k, act_fun, width):
    
    cuts = np.linspace(0, tf_train_shuffle.shape[0]-1, k+1, dtype = int)
    
    model = keras.Sequential()
    
    model.add(keras.layers.Dense(width, activation = act_fun))
    model.add(keras.layers.Dense(width, activation = act_fun))
    model.add(keras.layers.Dense(width, activation = act_fun))
    model.add(keras.layers.Dense(width, activation = act_fun))
    model.add(keras.layers.Dense(width, activation = act_fun))
    
    model.add(keras.layers.Dense(6))

    metric = []

    for i in range(k):
        print('k = '+str(i+1)+'\n')
        
        val = tf_train_shuffle[cuts[i]:cuts[i+1]]
            
        mask = np.ones(tf_train_shuffle.shape[0])
        mask[cuts[i]:cuts[i+1]] = 0
        
        train = tf.boolean_mask(tf_train_shuffle, mask)
    
        cur_x_train = train[:,:-1]
        cur_y_train = train[:,-1]
        
        cur_x_val = val[:,:-1]
        cur_y_val = val[:,-1]
        
        model.compile(optimizer = keras.optimizers.SGD(), 
                      loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                      metrics=[keras.metrics.SparseCategoricalAccuracy()])
        
        fit_data = model.fit(cur_x_train, cur_y_train, epochs = 5,
                             verbose = 2, validation_data = (cur_x_val, cur_y_val))
        
        cur_auc = np.mean(fit_data.history['val_sparse_categorical_accuracy'])
        metric.append(cur_auc)
        
                
        if np.max(metric) == cur_auc:
            best_model = model
            print('\nNew best model saved.')

        print('\n')
        
    print(metric)
    return np.mean(metric), best_model.evaluate(x_train, y_train, verbose = 2), best_model


# val_acc, test_res, best_model_deep = deep_cross_val_activation_width(k = 4, act_fun = 'relu', width = 5)

# print('\n===========')
# print("Validation Accuracy:", val_acc)
# print("Test Accuracy:", test_res[1])

In [29]:
act_funs = ['relu', 'softmax']
widths = [10, 20]

accuracies = []

for act_fun in act_funs:
    for width in widths:

        run_name = 'run-'+act_fun+' width'+str(width)
        print('')
        print('--- Starting trial: %s' % run_name)

        run_dir = 'logs14813/hparam_tuning_q2_2/' + run_name
        val_acc, test_res, hp_model = shallow_cross_val_activation_width(3, act_fun, width)
        
        accuracies.append(test_res[1])
        
        if np.max(accuracies) == test_res[1]:
            best_hp_shallow_model = hp_model
            best_shallow_run_name = run_name
            print('New Best Model:', best_shallow_run_name)
            
shallow_res = best_hp_shallow_model.evaluate(x_test, y_test, verbose = 2)

print('Best Model Hyperparameters:', best_shallow_run_name)
print('Test Accuracy:', shallow_res[1])


--- Starting trial: run-relu width10
k = 1

Epoch 1/5
4826/4826 - 4s - loss: 0.5097 - sparse_categorical_accuracy: 0.7935 - val_loss: 0.4405 - val_sparse_categorical_accuracy: 0.8218 - 4s/epoch - 819us/step
Epoch 2/5
4826/4826 - 4s - loss: 0.4387 - sparse_categorical_accuracy: 0.8146 - val_loss: 0.4323 - val_sparse_categorical_accuracy: 0.8241 - 4s/epoch - 756us/step
Epoch 3/5
4826/4826 - 4s - loss: 0.4315 - sparse_categorical_accuracy: 0.8197 - val_loss: 0.4290 - val_sparse_categorical_accuracy: 0.8241 - 4s/epoch - 769us/step
Epoch 4/5
4826/4826 - 4s - loss: 0.4260 - sparse_categorical_accuracy: 0.8236 - val_loss: 0.4316 - val_sparse_categorical_accuracy: 0.8241 - 4s/epoch - 858us/step
Epoch 5/5
4826/4826 - 4s - loss: 0.4241 - sparse_categorical_accuracy: 0.8247 - val_loss: 0.4362 - val_sparse_categorical_accuracy: 0.7520 - 4s/epoch - 774us/step

New best model saved.


k = 2

Epoch 1/5
4826/4826 - 4s - loss: 0.4231 - sparse_categorical_accuracy: 0.8248 - val_loss: 0.4234 - val_spars

In [30]:
act_funs = ['relu', 'softmax']
widths = [10, 20]

accuracies = []

for act_fun in act_funs:
    for width in widths:

        run_name = 'run-'+act_fun+' width'+str(width)
        print('')
        print('--- Starting trial: %s' % run_name)
        
        val_acc, test_res, hp_model = deep_cross_val_activation_width(3, act_fun, width)
        
        accuracies.append(test_res[1])
        
        if np.max(accuracies) == test_res[1]:
            best_hp_deep_model = hp_model
            best_deep_run_name = run_name
            print('New Best Model:', best_deep_run_name)


deep_res = best_hp_deep_model.evaluate(x_test, y_test, verbose = 2)

print('Best Model Hyperparameters:', best_deep_run_name)
print('Test Accuracy:', deep_res[1])


--- Starting trial: run-relu width10
k = 1

Epoch 1/5
4826/4826 - 4s - loss: 0.5223 - sparse_categorical_accuracy: 0.7980 - val_loss: 0.4646 - val_sparse_categorical_accuracy: 0.8185 - 4s/epoch - 903us/step
Epoch 2/5
4826/4826 - 4s - loss: 0.4520 - sparse_categorical_accuracy: 0.8161 - val_loss: 0.4432 - val_sparse_categorical_accuracy: 0.8183 - 4s/epoch - 808us/step
Epoch 3/5
4826/4826 - 4s - loss: 0.4431 - sparse_categorical_accuracy: 0.8170 - val_loss: 0.4423 - val_sparse_categorical_accuracy: 0.8202 - 4s/epoch - 791us/step
Epoch 4/5
4826/4826 - 4s - loss: 0.4364 - sparse_categorical_accuracy: 0.8218 - val_loss: 0.4359 - val_sparse_categorical_accuracy: 0.8206 - 4s/epoch - 808us/step
Epoch 5/5
4826/4826 - 4s - loss: 0.4316 - sparse_categorical_accuracy: 0.8236 - val_loss: 0.4350 - val_sparse_categorical_accuracy: 0.8239 - 4s/epoch - 786us/step

New best model saved.


k = 2

Epoch 1/5
4826/4826 - 5s - loss: 0.4281 - sparse_categorical_accuracy: 0.8242 - val_loss: 0.4296 - val_spars