# Imports

In [1]:
import torch
import torch.nn as nn
import torch.utils.data

from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

from os import makedirs
from os.path import join, exists

import numpy as np
import pandas as pd

from itertools import product

from datetime import datetime

# Utils Functions  

In [2]:
# Shared SMOTE over-sampler used by `upsample`. The fixed seed makes the
# synthetic minority samples reproducible across kernel restarts.
smt = SMOTE(random_state = 42)

In [3]:
def epochs(train_dataset, n_iters, batch_size):
    """Translate a total iteration budget into a whole number of epochs.

    One epoch consumes ``len(train_dataset) / batch_size`` iterations
    (a fractional value is allowed); the quotient of ``n_iters`` by that
    figure is truncated towards zero.
    """
    iterations_per_epoch = len(train_dataset) / batch_size
    return int(n_iters / iterations_per_epoch)

In [4]:
def feature_from_correlation(correlation_map, lower_range, upper_range, target_column = "URL_Type_obf_Type"):
    """Select the features whose correlation with the target lies outside a band.

    Parameters
    ----------
    correlation_map : pandas.DataFrame
        Correlation matrix that contains a column named ``target_column``.
    lower_range, upper_range : float
        A row is kept when its correlation, rounded to 4 decimals, is
        strictly below ``lower_range`` or strictly above ``upper_range``.
    target_column : str, optional
        Column of ``correlation_map`` to read the correlations from.
        Defaults to ``"URL_Type_obf_Type"``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Features`` (row labels of ``correlation_map``) and
        ``Correlations`` (the rounded values), in the original row order.
        The target's own row is not filtered out: it is kept whenever its
        self-correlation falls outside the band.
    """
    correlations = correlation_map[target_column].round(4)
    # NaN correlations compare False on both sides and are therefore dropped.
    outside_band = (correlations < lower_range) | (correlations > upper_range)
    selected = correlations[outside_band]

    dataframe = pd.DataFrame(data = {"Features": selected.index.tolist(), "Correlations": selected.tolist()})
    return dataframe

In [5]:
def split_dataset(dataset, training_allocation, testing_allocation):
    """Split a DataFrame into head (training) and tail (testing) float32 tensors.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows are taken in their current order; no shuffling happens here.
    training_allocation : float
        Fraction of rows taken from the top for training.
    testing_allocation : float
        Fraction of rows taken from the bottom for testing.

    Returns
    -------
    (torch.Tensor, torch.Tensor)
        Training and testing tensors of dtype float32.

    Notes
    -----
    Both lengths are truncated with ``int``, so a few middle rows may end up
    in neither split. If the two fractions sum to more than 1 the splits
    overlap. The shapes of both tensors are printed.
    """
    total_rows = len(dataset)

    training_dataset_length = int(training_allocation*total_rows)
    dataset_training = dataset.iloc[:training_dataset_length]
    dataset_training = dataset_training.to_numpy().astype("float32")
    dataset_training = torch.tensor(dataset_training)

    print("Training set shape: {}".format(dataset_training.shape))

    testing_dataset_length = int(testing_allocation*total_rows)
    # Slice from an explicit start index: dataset[-0:] would return the whole
    # frame instead of an empty test set when the length truncates to zero.
    testing_start = max(total_rows - testing_dataset_length, 0)
    dataset_testing = dataset.iloc[testing_start:]
    dataset_testing = dataset_testing.to_numpy().astype("float32")
    dataset_testing = torch.tensor(dataset_testing)

    print("Testing set shape: {}".format(dataset_testing.shape) + "\n")

    return dataset_training, dataset_testing

In [6]:
def down_sample(dataset, column_name):
    """Balance the classes in ``column_name`` by trimming each to the rarest size.

    For every class (visited in ``value_counts`` order) the first
    ``min(class sizes)`` rows are kept and shuffled among themselves; the
    per-class pieces are then stacked and re-indexed from zero.
    """
    class_counts = dataset[column_name].value_counts()
    smallest_class_size = min(class_counts.values.tolist())

    balanced_parts = []
    for label in class_counts.index.tolist():
        class_rows = dataset[dataset[column_name] == label]
        kept_rows = class_rows.iloc[:smallest_class_size, :]
        balanced_parts.append(kept_rows.sample(frac = 1).reset_index(drop = True))

    balanced = pd.concat(balanced_parts, axis = 0)
    return balanced.reset_index(drop = True)

In [7]:
def upsample(dataset):
    """Over-sample ``dataset`` with the module-level ``smt`` resampler.

    The last column of ``dataset`` is treated as the target and all other
    columns as features.

    Returns
    -------
    pandas.DataFrame
        The resampled rows with the same column labels as ``dataset``.
    """
    X_train = dataset.iloc[:, :-1]
    Y_train = dataset.iloc[:, -1]

    # `fit_sample` was renamed to `fit_resample` and later removed from
    # imbalanced-learn; prefer the new name and fall back for old releases.
    resample_fn = getattr(smt, "fit_resample", None)
    if resample_fn is None:
        resample_fn = smt.fit_sample
    features, targets = resample_fn(X_train, Y_train)

    # Newer releases hand back pandas objects for pandas input; normalise to
    # ndarrays so the reshape/hstack below work with either return type.
    features = np.asarray(features)
    targets = np.asarray(targets).reshape((-1, 1))

    column_names = dataset.columns.tolist()
    upsampled_dataset = np.hstack((features, targets))
    upsampled_dataset = pd.DataFrame(columns = column_names, data = upsampled_dataset)
    return upsampled_dataset

# Load Data

In [8]:
# Load the raw CSV. The values -1, "nan" and "Infinity" are parsed as
# missing (NaN) on read.
dataset = pd.read_csv("all.csv", low_memory = False, na_values = [-1, "nan", "Infinity"])

**Explore dataset**

In [9]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36707 entries, 0 to 36706
Data columns (total 80 columns):
Querylength                        36707 non-null int64
domain_token_count                 36707 non-null int64
path_token_count                   36707 non-null int64
avgdomaintokenlen                  36707 non-null float64
longdomaintokenlen                 36707 non-null int64
avgpathtokenlen                    36427 non-null float64
tld                                36707 non-null int64
charcompvowels                     36707 non-null int64
charcompace                        36707 non-null int64
ldl_url                            36707 non-null int64
ldl_domain                         36707 non-null int64
ldl_path                           36707 non-null int64
ldl_filename                       36707 non-null int64
ldl_getArg                         36707 non-null int64
dld_url                            36707 non-null int64
dld_domain                         36707 non-nu

In [10]:
dataset.shape

(36707, 80)

# Clean data

**Replace target categorical data with encoded label**

In [11]:
# Map each class name to its numeric code in one pass; values that are not
# listed are left untouched.
label_encoding = {
    "Defacement": 0.0,
    "benign": 1.0,
    "phishing": 2.0,
    "malware": 3.0,
    "spam": 4.0,
}
dataset["URL_Type_obf_Type"] = dataset["URL_Type_obf_Type"].replace(label_encoding)

**Fill NaN values with zeros**

In [12]:
# Every missing value becomes 0.0 -- this includes the -1 / "nan" /
# "Infinity" entries that read_csv turned into NaN.
dataset = dataset.fillna(0.0)

**Convert entire dataframe to numeric** 

In [13]:
# The output of dataset.info() shows that some columns of the dataset
# are of type object (they hold strings), so every column is passed
# through pd.to_numeric.
dataset = dataset.apply(pd.to_numeric)

In [14]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36707 entries, 0 to 36706
Data columns (total 80 columns):
Querylength                        36707 non-null int64
domain_token_count                 36707 non-null int64
path_token_count                   36707 non-null int64
avgdomaintokenlen                  36707 non-null float64
longdomaintokenlen                 36707 non-null int64
avgpathtokenlen                    36707 non-null float64
tld                                36707 non-null int64
charcompvowels                     36707 non-null int64
charcompace                        36707 non-null int64
ldl_url                            36707 non-null int64
ldl_domain                         36707 non-null int64
ldl_path                           36707 non-null int64
ldl_filename                       36707 non-null int64
ldl_getArg                         36707 non-null int64
dld_url                            36707 non-null int64
dld_domain                         36707 non-nu

**Find all zero columns**

In [15]:
# List of all column labels, iterated by the all-zero check in the next cell.
column_names = dataset.columns.tolist()

In [16]:
# Report columns in which every value is exactly zero. Testing the values
# directly (rather than mean == 0.0) avoids false positives when positive and
# negative entries cancel out.
for name in column_names:
    if (dataset[name] == 0.0).all():
        print("Column >>{}<< is all zeros".format(name))

Column >>isPortEighty<< is all zeros
Column >>ISIpAddressInDomainName<< is all zeros


Since **isPortEighty** and **ISIpAddressInDomainName** are all zeros, they carry no information the model can learn from. Therefore, they are removed.

In [17]:
# Remove the two all-zero columns reported by the check above.
dataset = dataset.drop(columns = ["isPortEighty", "ISIpAddressInDomainName"])

In [18]:
dataset.URL_Type_obf_Type.value_counts()

0.0    7930
1.0    7781
2.0    7586
3.0    6712
4.0    6698
Name: URL_Type_obf_Type, dtype: int64

**Global Variables**

In [19]:
# Number of target classes; used as the output width of every model.
num_classes = 5
# Mini-batch size passed to every training run.
batch_size = 1000
# Numeric class codes and their names, index-aligned. Both lists are passed
# to classification_report inside the Train and Test classes.
encoded_labels = [0.0, 1.0, 2.0, 3.0, 4.0]
unencoded_labels = ["Defacement", "benign", "phishing", "malware", "spam"]

# Balance Dataset

**Down-sample**

In [20]:
# Trim every class to the size of the rarest class.
down_sampled_dataset = down_sample(dataset, "URL_Type_obf_Type")

In [21]:
down_sampled_dataset["URL_Type_obf_Type"].value_counts()

3.0    6698
4.0    6698
2.0    6698
1.0    6698
0.0    6698
Name: URL_Type_obf_Type, dtype: int64

**Up-sample**

In [22]:
# Over-sample with SMOTE; upsample() treats the last column as the target.
up_sampled_dataset = upsample(dataset)

In [23]:
up_sampled_dataset["URL_Type_obf_Type"].value_counts()

3.0    7930
4.0    7930
2.0    7930
1.0    7930
0.0    7930
Name: URL_Type_obf_Type, dtype: int64

# Feature Selection

### PCA Algorithm

**15 PCA Components**

In [26]:
n = 15
dataset_PCA = PCA(n_components=n)
column_names = ["principal_component_{}".format(i) for i in range(n)]

# DOWNSAMPLED SET
# Drop the target before scaling/PCA. A label slice such as
# .loc[:, :"URL_Type_obf_Type"] is inclusive and would feed the label itself
# into the principal components (label leakage).
ds = down_sampled_dataset.drop(columns = "URL_Type_obf_Type").values
ds = StandardScaler().fit_transform(ds)

principal_components_ds = dataset_PCA.fit_transform(ds)
principal_Df_ds = pd.DataFrame(data = principal_components_ds, columns = column_names)
# Re-attach the untouched target as the last column, then shuffle the rows.
pca_15_dataset_ds = pd.concat([principal_Df_ds, down_sampled_dataset.loc[:, "URL_Type_obf_Type"]], axis = 1)
pca_15_dataset_ds = pca_15_dataset_ds.sample(frac = 1).reset_index(drop = True)

# UPSAMPLED SET
us = up_sampled_dataset.drop(columns = "URL_Type_obf_Type").values # features only, target removed
us = StandardScaler().fit_transform(us)

# fit_transform refits the PCA object, so nothing carries over from the
# down-sampled fit above.
principal_components_us = dataset_PCA.fit_transform(us)
principal_Df_us = pd.DataFrame(data = principal_components_us, columns = column_names)
pca_15_dataset_us = pd.concat([principal_Df_us, up_sampled_dataset.loc[:, "URL_Type_obf_Type"]], axis = 1)
pca_15_dataset_us = pca_15_dataset_us.sample(frac = 1).reset_index(drop = True)

**Split Dataset**

In [27]:
# DOWNSAMPLED SET
pca_15_dataset_training_ds, pca_15_dataset_testing_ds = split_dataset(pca_15_dataset_ds, 0.80, 0.20)

# UPSAMPLED SET
pca_15_dataset_training_us, pca_15_dataset_testing_us = split_dataset(pca_15_dataset_us, 0.80, 0.20)

Training set shape: torch.Size([26792, 16])
Testing set shape: torch.Size([6698, 16])

Training set shape: torch.Size([31720, 16])
Testing set shape: torch.Size([7930, 16])



**25 PCA Components**

In [30]:
n = 25
dataset_PCA = PCA(n_components=n)
column_names = ["principal_component_{}".format(i) for i in range(n)]

# DOWNSAMPLED SET
# Drop the target before scaling/PCA. A label slice such as
# .loc[:, :"URL_Type_obf_Type"] is inclusive and would feed the label itself
# into the principal components (label leakage).
ds = down_sampled_dataset.drop(columns = "URL_Type_obf_Type").values
ds = StandardScaler().fit_transform(ds)

principal_components_ds = dataset_PCA.fit_transform(ds)
principal_Df_ds = pd.DataFrame(data = principal_components_ds, columns = column_names)
# Re-attach the untouched target as the last column, then shuffle the rows.
pca_25_dataset_ds = pd.concat([principal_Df_ds, down_sampled_dataset.loc[:, "URL_Type_obf_Type"]], axis = 1)
pca_25_dataset_ds = pca_25_dataset_ds.sample(frac = 1).reset_index(drop = True)

# UPSAMPLED SET
us = up_sampled_dataset.drop(columns = "URL_Type_obf_Type").values # features only, target removed
us = StandardScaler().fit_transform(us)

# fit_transform refits the PCA object, so nothing carries over from the
# down-sampled fit above.
principal_components_us = dataset_PCA.fit_transform(us)
principal_Df_us = pd.DataFrame(data = principal_components_us, columns = column_names)
pca_25_dataset_us = pd.concat([principal_Df_us, up_sampled_dataset.loc[:, "URL_Type_obf_Type"]], axis = 1)
pca_25_dataset_us = pca_25_dataset_us.sample(frac = 1).reset_index(drop = True)

**Split Dataset**

In [31]:
# DOWNSAMPLED SET
pca_25_dataset_training_ds, pca_25_dataset_testing_ds = split_dataset(pca_25_dataset_ds, 0.80, 0.20)

# UPSAMPLED SET
pca_25_dataset_training_us, pca_25_dataset_testing_us = split_dataset(pca_25_dataset_us, 0.80, 0.20)

Training set shape: torch.Size([26792, 26])
Testing set shape: torch.Size([6698, 26])

Training set shape: torch.Size([31720, 26])
Testing set shape: torch.Size([7930, 26])



**30 PCA Components**

In [32]:
n = 30
dataset_PCA = PCA(n_components=n)
column_names = ["principal_component_{}".format(i) for i in range(n)]

# DOWNSAMPLED SET
# Drop the target before scaling/PCA. A label slice such as
# .loc[:, :"URL_Type_obf_Type"] is inclusive and would feed the label itself
# into the principal components (label leakage).
ds = down_sampled_dataset.drop(columns = "URL_Type_obf_Type").values
ds = StandardScaler().fit_transform(ds)

principal_components_ds = dataset_PCA.fit_transform(ds)
principal_Df_ds = pd.DataFrame(data = principal_components_ds, columns = column_names)
# Re-attach the untouched target as the last column, then shuffle the rows.
pca_30_dataset_ds = pd.concat([principal_Df_ds, down_sampled_dataset.loc[:, "URL_Type_obf_Type"]], axis = 1)
pca_30_dataset_ds = pca_30_dataset_ds.sample(frac = 1).reset_index(drop = True)

# UPSAMPLED SET
us = up_sampled_dataset.drop(columns = "URL_Type_obf_Type").values # features only, target removed
us = StandardScaler().fit_transform(us)

# fit_transform refits the PCA object, so nothing carries over from the
# down-sampled fit above.
principal_components_us = dataset_PCA.fit_transform(us)
principal_Df_us = pd.DataFrame(data = principal_components_us, columns = column_names)
pca_30_dataset_us = pd.concat([principal_Df_us, up_sampled_dataset.loc[:, "URL_Type_obf_Type"]], axis = 1)
pca_30_dataset_us = pca_30_dataset_us.sample(frac = 1).reset_index(drop = True)

**Split Dataset**

In [33]:
# DOWNSAMPLED SET
pca_30_dataset_training_ds, pca_30_dataset_testing_ds = split_dataset(pca_30_dataset_ds, 0.80, 0.20)

# UPSAMPLED SET
pca_30_dataset_training_us, pca_30_dataset_testing_us = split_dataset(pca_30_dataset_us, 0.80, 0.20)

Training set shape: torch.Size([26792, 31])
Testing set shape: torch.Size([6698, 31])

Training set shape: torch.Size([31720, 31])
Testing set shape: torch.Size([7930, 31])



**Spearman correlation**

In [36]:
# Pairwise Spearman rank correlation over the cleaned (not re-balanced)
# dataset; the target column is part of the matrix.
correlation_map = dataset.corr(method = "spearman")

In [119]:
correlation_map

Unnamed: 0,Querylength,domain_token_count,path_token_count,avgdomaintokenlen,longdomaintokenlen,avgpathtokenlen,tld,charcompvowels,charcompace,ldl_url,...,SymbolCount_FileName,SymbolCount_Extension,SymbolCount_Afterpath,Entropy_URL,Entropy_Domain,Entropy_DirectoryName,Entropy_Filename,Entropy_Extension,Entropy_Afterpath,URL_Type_obf_Type
Querylength,1.000000,0.216476,0.397880,-0.078390,0.062631,-0.485053,0.216476,0.489252,0.470341,0.230636,...,0.794757,0.814578,0.919061,-0.174685,-0.169994,-0.145407,-0.320017,0.055360,0.766734,0.016351
domain_token_count,0.216476,1.000000,-0.086912,-0.296066,0.039787,-0.124185,1.000000,-0.005741,0.048004,0.108117,...,0.242872,0.202962,0.185896,0.000025,-0.551254,-0.031305,0.072357,0.091447,0.180556,0.257578
path_token_count,0.397880,-0.086912,1.000000,-0.064224,-0.043609,-0.076906,-0.086912,0.736043,0.690169,0.057315,...,0.305165,0.396102,0.435297,-0.604181,0.059103,-0.137376,-0.501098,-0.198693,0.280050,0.006406
avgdomaintokenlen,-0.078390,-0.296066,-0.064224,1.000000,0.898761,-0.017851,-0.296066,-0.055701,-0.038918,-0.030094,...,-0.125744,-0.093333,-0.083837,-0.173386,-0.116090,-0.085439,-0.028294,-0.123944,-0.185197,-0.200548
longdomaintokenlen,0.062631,0.039787,-0.043609,0.898761,1.000000,-0.106487,0.039787,0.013903,0.041623,0.008291,...,0.009758,0.036288,0.029812,-0.219064,-0.283656,-0.097815,-0.059185,-0.109626,-0.097032,-0.115739
avgpathtokenlen,-0.485053,-0.124185,-0.076906,-0.017851,-0.106487,1.000000,-0.124185,0.037986,0.101715,0.132506,...,-0.415065,-0.379183,-0.481583,-0.075008,0.161740,0.261518,0.090083,-0.086361,-0.362024,0.206961
tld,0.216476,1.000000,-0.086912,-0.296066,0.039787,-0.124185,1.000000,-0.005741,0.048004,0.108117,...,0.242872,0.202962,0.185896,0.000025,-0.551254,-0.031305,0.072357,0.091447,0.180556,0.257578
charcompvowels,0.489252,-0.005741,0.736043,-0.055701,0.013903,0.037986,-0.005741,1.000000,0.903817,0.122437,...,0.328450,0.428773,0.439614,-0.725565,0.032077,-0.102337,-0.565696,-0.283268,0.200052,-0.028466
charcompace,0.470341,0.048004,0.690169,-0.038918,0.041623,0.101715,0.048004,0.903817,1.000000,0.264875,...,0.298240,0.395377,0.396098,-0.696096,-0.002549,-0.027480,-0.534470,-0.279340,0.190088,0.079214
ldl_url,0.230636,0.108117,0.057315,-0.030094,0.008291,0.132506,0.108117,0.122437,0.264875,1.000000,...,0.089188,0.111058,0.100537,0.014224,-0.003016,0.138151,-0.114538,-0.008386,0.133413,0.367611


In [38]:
# Keep the features whose correlation with the target is below -0.15 or
# above 0.25.
correlated_features = feature_from_correlation(correlation_map, -0.15, 0.25)

In [39]:
correlated_features

Unnamed: 0,Features,Correlations
0,domain_token_count,0.2576
1,avgdomaintokenlen,-0.2005
2,tld,0.2576
3,ldl_url,0.3676
4,ldl_path,0.3281
5,ldl_getArg,0.3104
6,dld_url,0.3185
7,dld_path,0.3194
8,dld_getArg,0.3199
9,NumberofDotsinURL,0.2668


In [124]:
# Restrict the dataset to the selected columns and shuffle the rows.
# NOTE(review): downstream code treats the LAST column as the label, so this
# relies on the target itself being in `correlated_features` (its
# self-correlation exceeds the upper bound) and being listed last -- confirm.
spearman_correlation_dataset = dataset[correlated_features.Features.tolist()].sample(frac = 1).reset_index(drop = True)

**Split Data**

In [125]:
# 80/20 train/test split of the already-shuffled correlation-filtered frame.
spearman_correlation_dataset_training, spearman_correlation_dataset_testing = split_dataset(spearman_correlation_dataset, 0.80, 0.20)

Training set shape: torch.Size([29365, 20])
Testing set shape: torch.Size([7341, 20])



## General Dataset

In [42]:
# Baseline: all cleaned columns, rows shuffled, no re-balancing or feature selection.
general_shuffled_dataset = dataset.sample(frac = 1).reset_index(drop = True)

**Split Data**

In [43]:
# 80/20 train/test split of the shuffled baseline frame.
general_training_set, general_testing_set = split_dataset(general_shuffled_dataset, 0.80, 0.20)

Training set shape: torch.Size([29365, 78])
Testing set shape: torch.Size([7341, 78])



# Logistic Regression

**Logistic Regression Class**

In [44]:
class LogisticRegressionModel(nn.Module):
    """Multinomial logistic regression expressed as a single linear layer.

    ``forward`` returns the raw class scores (logits); no softmax is applied
    inside the module.
    """

    def __init__(self, in_dim, num_classes):
        """Create the ``in_dim`` -> ``num_classes`` linear layer."""
        super().__init__()
        self.linear = nn.Linear(in_dim, num_classes)

    def forward(self, x):
        """Return the output of the linear layer for the batch ``x``."""
        return self.linear(x)

**Train Class**

In [95]:
class Train(LogisticRegressionModel):
    """Train a LogisticRegressionModel with Adam and checkpoint it under ./models.

    The network that is optimised and saved is ``self.model``; it is moved to
    the first CUDA device when one is available, otherwise it stays on the
    CPU. Creating an instance also creates the ``./models`` directory.
    """

    def __init__(self, in_dim, num_classes):
        """Build the model, loss and target device for ``in_dim`` features."""
        super().__init__(in_dim, num_classes)
        self.input_layer_dim = in_dim
        self.output_layer_dim = num_classes
        self.criterion = nn.CrossEntropyLoss()
        self.model = LogisticRegressionModel(self.input_layer_dim, self.output_layer_dim)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(self.device)

        self.model_directory = "./models"
        makedirs(self.model_directory, exist_ok=True)

    def train_model(self, learning_rate, n_iters, training_data, sub_directory, batch_size, save_iterations = 20):
        """Train ``self.model`` and periodically log metrics / save checkpoints.

        Parameters
        ----------
        learning_rate : float
            Learning rate handed to Adam.
        n_iters : int
            Total iteration budget; converted to epochs with ``epochs()``.
        training_data : torch.Tensor
            2-D tensor whose last column is the class label and whose other
            columns are the features.
        sub_directory : str
            Folder below ``self.model_directory`` for this run; a further
            ``<learning_rate>_<n_iters>`` folder is created inside it.
        batch_size : int
            Mini-batch size of the DataLoader.
        save_iterations : int, optional
            Log every ``save_iterations`` epochs (and on the last epoch).
            Must be a positive integer.

        Returns
        -------
        list of str
            Paths (without the ``.pkl`` suffix) of the saved checkpoints, in
            the order they were written. Empty when nothing was saved.

        Notes
        -----
        The logged metrics are computed on the last mini-batch of the epoch
        only. A checkpoint is written on every logging epoch except epoch 0.
        """
        subset = str(learning_rate)+"_"+str(n_iters)
        sub_dir = join(sub_directory, subset)
        directory = join(self.model_directory, sub_dir)
        makedirs(directory, exist_ok=True)

        model_names = []

        num_epochs = epochs(training_data, n_iters, batch_size)
        last_epoch = num_epochs - 1
        data_loader = torch.utils.data.DataLoader(dataset=training_data, batch_size=batch_size, shuffle=True)
        optimizer = torch.optim.Adam(self.model.parameters(), lr = learning_rate)

        file_name = join(directory, "training_progress.txt")
        with open(file_name, "a") as txt_file:

            time = datetime.strftime(datetime.now(), "%Y-%m-%d_%H:%M:%S")
            txt_file.write("Initiate Training: {}".format(time) + "\n\n")

            txt_file.write("Learning Rate: {}".format(learning_rate) + "\n")
            txt_file.write("Number of  Epochs: {}".format(num_epochs) + "\n")
            txt_file.write("Batch Size: {}".format(batch_size) + "\n")
            txt_file.write(30*"--")
            txt_file.write("\n\n")

            for epoch in range(num_epochs):

                for data in data_loader:
                    # Same casts on CPU and GPU: float features, long labels
                    # (CrossEntropyLoss requires integer class indices).
                    x = data[:, :-1].float().to(self.device)
                    y = data[:, -1].long().to(self.device)

                    out = self.model(x)
                    assert not torch.isnan(out).any(), out
                    loss = self.criterion(out, y)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                if epoch % save_iterations == 0 or epoch == last_epoch:
                    # scikit-learn needs host-side arrays, so move the last
                    # batch's labels and predictions off the device first.
                    y_true = y.cpu().numpy()
                    y_pred = torch.max(out.detach(), 1)[1].cpu().numpy()

                    precision = round(precision_score(y_true, y_pred, average="macro")*100, 3)
                    recall = round(recall_score(y_true, y_pred, average="macro")*100, 3)
                    f1score = round(f1_score(y_true, y_pred, average="macro")*100, 3)
                    accuracy = round(accuracy_score(y_true, y_pred)*100, 3)
                    # labels/target_names are keyword-only in current scikit-learn.
                    report = classification_report(y_true, y_pred, labels=encoded_labels, target_names=unencoded_labels)

                    # Refresh the timestamp so each progress entry shows when
                    # it was written rather than when training started.
                    time = datetime.strftime(datetime.now(), "%Y-%m-%d_%H:%M:%S")

                    txt_file.write("Progress: {}".format(time)+"\n")
                    txt_file.write("Epoch: {}, Running Loss: {}".format(epoch, loss.item())+"\n")
                    txt_file.write("Running overall precision: {}%".format(precision)+"\n")
                    txt_file.write("Running overall recall: {}%".format(recall)+"\n")
                    txt_file.write("Running overall F1-Score: {}%".format(f1score)+"\n")
                    txt_file.write("Running overall accuracy: {}%".format(accuracy)+"\n")
                    txt_file.write("\n")

                    if epoch != 0:
                        model_name = join(directory, time)
                        torch.save(self.model.state_dict(), model_name+".pkl")
                        model_names.append(model_name)

                        txt_file.write("Classification Report:"+"\n")
                        txt_file.write(report)
                        txt_file.write("\n")

                        txt_file.write(40*"#"+"\n")
                        txt_file.write("Model {} saved".format(time)+"\n")
                        txt_file.write(40*"#"+"\n\n")

        # Each checkpoint is listed exactly once; re-appending the last name
        # here would duplicate it and fail when no checkpoint was written.
        return model_names

    def run_train_model(self, n_iterations, learning_rate, dataset, directory, batch_size):
        """Run ``train_model`` for the given setting and collect checkpoint paths.

        The single ``n_iterations`` / ``learning_rate`` pair is wrapped into
        one-element lists and expanded with ``itertools.product``, so exactly
        one training run is performed on ``self.model``.
        """
        model_names = []
        n_iterations = [n_iterations]
        learning_rates = [learning_rate]
        for rate, iterations in product(learning_rates, n_iterations):
            generated_model_names = self.train_model(rate, iterations, dataset, directory, batch_size)
            model_names.extend(generated_model_names)

        return model_names

In [96]:
class Test(LogisticRegressionModel):
    """Load saved LogisticRegressionModel checkpoints and score them on a test set.

    Evaluation runs on the CPU: ``self.model`` is never moved to another
    device, and checkpoints are mapped to the CPU when they are loaded.
    """

    def __init__(self, input_size, num_classes):
        """Create an untrained model with ``input_size`` features."""
        super().__init__(input_size, num_classes)
        self.input_size = input_size
        self.num_classes = num_classes
        self.model = LogisticRegressionModel(self.input_size, self.num_classes)

    def evaluate_model(self, testing_data, path):
        """Evaluate one checkpoint and append the metrics to ``evaluations.txt``.

        Parameters
        ----------
        testing_data : torch.Tensor
            2-D tensor whose last column is the class label and whose other
            columns are the features.
        path : str
            Checkpoint path without the ``.pkl`` suffix. The report file is
            written into the directory formed by the first three
            ``/``-separated components of this path.
        """
        model_name = path+".pkl"
        # map_location lets a checkpoint saved from a GPU run load on a
        # CPU-only machine.
        self.model.load_state_dict(torch.load(model_name, map_location="cpu"))
        self.model.eval()

        # Keep inputs on the same device as the model (CPU).
        x = testing_data[:, :-1].float().cpu()
        y = testing_data[:, -1].long().cpu()

        with torch.no_grad():
            out = self.model(x)
        assert not torch.isnan(out).any(), out
        pred = torch.max(out, 1)[1]

        y_true = y.numpy()
        y_pred = pred.numpy()

        precision = round(precision_score(y_true, y_pred, average="macro")*100, 3)
        recall = round(recall_score(y_true, y_pred, average="macro")*100, 3)
        f1score = round(f1_score(y_true, y_pred, average="macro")*100, 3)
        accuracy = round(accuracy_score(y_true, y_pred)*100, 3)
        # labels/target_names are keyword-only in current scikit-learn.
        report = classification_report(y_true, y_pred, labels=encoded_labels, target_names=unencoded_labels)

        parent_directory = path.split("/")
        parent_directory = "/".join(parent_directory[:3])
        file_name = join(parent_directory, "evaluations.txt")
        with open(file_name, "a") as txt_file:
            txt_file.write(45*"--"+"\n")
            txt_file.write("Model Name: {}".format(model_name)+"\n")
            txt_file.write(45*"--"+"\n")
            txt_file.write("Precision: {}%".format(precision)+"\n")
            txt_file.write("Recall: {}%".format(recall)+"\n")
            txt_file.write("F1-Score: {}%".format(f1score)+"\n")
            txt_file.write("Accuracy: {}%".format(accuracy)+"\n\n")

            txt_file.write("Classification Report:"+"\n")
            txt_file.write(report)
            txt_file.write("\n")

    def run_test_model(self, model_paths, dataset):
        """Call ``evaluate_model`` on ``dataset`` for every path in ``model_paths``."""
        for name in model_paths:
            self.evaluate_model(dataset, name)


### GENERAL DATASET: Training & Testing

**Train Models**

In [122]:
# The input width is the column count minus the trailing label column.
# 10000 iterations at learning rate 0.005; the call returns the paths of the
# saved checkpoints.
general_train = Train(general_training_set.shape[1]-1, num_classes)
general_model_names = general_train.run_train_model(10000, 0.005, general_training_set, "general_shuffled", batch_size)

**Test Models**

In [123]:
# Score every saved checkpoint on the held-out 20% split.
general_test = Test(general_testing_set.shape[1]-1, num_classes)
general_test.run_test_model(general_model_names, general_testing_set)

### PCA TABLE DATASET: Training & Testing Models

**PCA 15**

**Train Models**

- **Learning rate = 0.005**
- **n_iterations = 10000**

In [100]:
# DOWNSAMPLED SET
pca_15_train = Train(pca_15_dataset_training_ds.shape[1]-1, num_classes)
ds_model_names = pca_15_train.run_train_model(10000, 0.005, pca_15_dataset_training_ds, "pca_15_dataset/pca_15_dataset_ds", batch_size)

# UPSAMPLED SET
# Train on the up-sampled split with a fresh trainer, so the weights do not
# start from the model already fitted on the down-sampled data.
pca_15_train_us = Train(pca_15_dataset_training_us.shape[1]-1, num_classes)
us_model_names = pca_15_train_us.run_train_model(10000, 0.005, pca_15_dataset_training_us, "pca_15_dataset/pca_15_dataset_us", batch_size)

**Test Models**

In [101]:
# Score each group of checkpoints on the test split of the same sampling strategy.
pca_15_test = Test(pca_15_dataset_testing_ds.shape[1]-1, num_classes)
pca_15_test.run_test_model(ds_model_names, pca_15_dataset_testing_ds)
pca_15_test.run_test_model(us_model_names, pca_15_dataset_testing_us)

**PCA 25**

**Train Models**

- **Learning rate = 0.005**
- **n_iterations = 10000**

In [104]:
# DOWNSAMPLED SET
pca_25_train = Train(pca_25_dataset_training_ds.shape[1]-1, num_classes)
ds_model_names = pca_25_train.run_train_model(10000, 0.005, pca_25_dataset_training_ds, "pca_25_dataset/pca_25_dataset_ds", batch_size)

# UPSAMPLED SET
# Train on the up-sampled split with a fresh trainer, so the weights do not
# start from the model already fitted on the down-sampled data.
pca_25_train_us = Train(pca_25_dataset_training_us.shape[1]-1, num_classes)
us_model_names = pca_25_train_us.run_train_model(10000, 0.005, pca_25_dataset_training_us, "pca_25_dataset/pca_25_dataset_us", batch_size)

**Test Models**

In [105]:
# Score each group of checkpoints on the test split of the same sampling strategy.
pca_25_test = Test(pca_25_dataset_testing_ds.shape[1]-1, num_classes)
pca_25_test.run_test_model(ds_model_names, pca_25_dataset_testing_ds)
pca_25_test.run_test_model(us_model_names, pca_25_dataset_testing_us)

**PCA 30**

**Train Models**

- **Learning rate = 0.005**
- **n_iterations = 10000**

In [106]:
# DOWNSAMPLED SET
pca_30_train = Train(pca_30_dataset_training_ds.shape[1]-1, num_classes)
ds_model_names = pca_30_train.run_train_model(10000, 0.005, pca_30_dataset_training_ds, "pca_30_dataset/pca_30_dataset_ds", batch_size)

# UPSAMPLED SET
# Train on the up-sampled split with a fresh trainer, so the weights do not
# start from the model already fitted on the down-sampled data.
pca_30_train_us = Train(pca_30_dataset_training_us.shape[1]-1, num_classes)
us_model_names = pca_30_train_us.run_train_model(10000, 0.005, pca_30_dataset_training_us, "pca_30_dataset/pca_30_dataset_us", batch_size)

**Test Models**

In [107]:
# Score each group of checkpoints on the test split of the same sampling strategy.
pca_30_test = Test(pca_30_dataset_testing_ds.shape[1]-1, num_classes)
pca_30_test.run_test_model(ds_model_names, pca_30_dataset_testing_ds)
pca_30_test.run_test_model(us_model_names, pca_30_dataset_testing_us)

### SPEARMAN CORRELATION TABLE DATASET: Training & Testing Models

- **Learning rate = 0.005**
- **n_iterations = 10000**

**Train Models**

In [126]:
# The input width is the column count minus the trailing label column.
spearman_train = Train(spearman_correlation_dataset_training.shape[1]-1, num_classes)
# Pass the training split produced by split_dataset; the previous argument
# name (spearman_correlation_based_dataset_training) is never defined.
spearman_model_names = spearman_train.run_train_model(10000, 0.005, spearman_correlation_dataset_training, "spearman_correlation", batch_size)

**Test Models**

In [127]:
# Score every saved checkpoint on the held-out correlation-filtered split.
spearman_test = Test(spearman_correlation_dataset_testing.shape[1]-1, num_classes)
spearman_test.run_test_model(spearman_model_names, spearman_correlation_dataset_testing)