In [None]:
from scipy.io import arff
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, KFold

In [None]:
%matplotlib inline

In [None]:
def getProcessedDataFrame(filepath):
    """Load an ARFF file and return its byte-string columns as integers.

    Parameters
    ----------
    filepath : str or file-like
        Passed unchanged to ``scipy.io.arff.loadarff``.

    Returns
    -------
    pandas.DataFrame
        Only the columns that load with ``object`` dtype (byte strings), each
        decoded as UTF-8 and cast to ``int``. Columns of any other dtype are
        not part of the result.

    Raises
    ------
    ValueError
        If a decoded value cannot be parsed as an integer.
    """
    records, _meta = arff.loadarff(filepath)
    raw_df = pd.DataFrame(records)

    # `np.object` was removed in NumPy 1.24; the builtin `object` selects the same columns.
    byte_columns = raw_df.select_dtypes(include=[object])

    # Decode and cast column by column instead of a stack/unstack round trip.
    return byte_columns.apply(lambda column: column.str.decode('utf-8').astype(int))

In [None]:
# Load the dataset; getProcessedDataFrame returns the byte-string columns converted to int
complete_training = getProcessedDataFrame("Training Dataset.arff")
print(complete_training.columns)

#We will not include: SSLfinal_State,Domain_registration_length,port,Abnormal_URL,Redirect,popUpWindow,Google_Index,Links_pointing_to_page,Statistical_report
# .copy() gives reduced_df its own data: it is re-encoded in place further down, and
# without the copy that write would target a slice of complete_training
# (SettingWithCopyWarning, and an unreliable write).
reduced_df = complete_training[['having_IP_Address', 'URL_Length', 'Shortining_Service',
       'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix',
       'having_Sub_Domain','Favicon', 'HTTPS_token', 'Request_URL', 'URL_of_Anchor',
       'Links_in_tags', 'SFH', 'Submitting_to_email','on_mouseover', 'RightClick', 'Iframe',
       'age_of_domain', 'DNSRecord', 'web_traffic', 'Page_Rank','Result']].copy()

In [None]:
# Number of rows per class in the target column
complete_training['Result'].value_counts()

## Metrics to evaluate this Project:

Because the classes are roughly balanced (about 55:45), we use ACCURACY as the primary metric. We also want to keep the Type I error low, i.e. minimise the false positive rate (FPR).
A false positive here is an unsafe URL classified as safe, and it is important that the user does not enter an unsafe URL. For the same reason we accept the trade-off of sometimes classifying a safe URL as unsafe.

In [None]:
#We encode the data into positive values as our algorithms work better then
def convertEncodingToPositive(dataframe):
    """Re-encode every cell of ``dataframe`` in place: -1 -> 2, 0 -> 0, 1 -> 1.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose cells are all -1, 0 or 1. It is modified in place, column
        by column, whatever its row index looks like.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If any cell holds a value other than -1, 0 or 1. The check runs before
        anything is written, so the frame is left untouched in that case.
    """
    mapping = {-1: 2, 0: 0, 1: 1}
    allowed = list(mapping)

    # Validate everything first so a bad value cannot leave the frame half-converted.
    for col in dataframe.columns:
        unexpected = dataframe.loc[~dataframe[col].isin(allowed), col]
        if not unexpected.empty:
            raise KeyError(unexpected.iloc[0])

    # Whole-column assignment writes to the frame itself; the former cell-wise
    # `dataframe.loc[i][j] = ...` was a chained assignment that may only update a
    # temporary copy, and it required a 0..n-1 row index.
    for col in dataframe.columns:
        dataframe[col] = dataframe[col].map(mapping)

In [None]:
# Re-encode reduced_df in place (the function returns nothing), then display the frame
convertEncodingToPositive(reduced_df)
reduced_df

In [None]:
# Features are every column except the target. Selecting by name instead of the
# hard-coded positions 0:21 keeps the split right if the feature list changes.
X_reduced = reduced_df.drop(columns=['Result'])
y_reduced = reduced_df['Result']

In [None]:
# Hold out 20% of the rows as the test set. Stratifying on the target keeps the
# class proportions alike in both parts; random_state makes the split repeatable.
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(
    X_reduced,
    y_reduced,
    test_size=0.2,
    random_state=7,
    stratify=y_reduced,
)

# Container type and shapes as returned by train_test_split
print(type(X_train_red))
print(X_train_red.shape, X_test_red.shape, y_train_red.shape, sep="\n")
print(y_test_red.shape,"\n")

# Convert to NumPy arrays so rows can later be picked with the fold index arrays
X_train_red, X_test_red = X_train_red.to_numpy(), X_test_red.to_numpy()
y_train_red, y_test_red = y_train_red.to_numpy(), y_test_red.to_numpy()

# Same report after the conversion
print(type(X_train_red))
print(X_train_red.shape, X_test_red.shape, y_train_red.shape, y_test_red.shape, sep="\n")

In [None]:
# 5-fold cross-validation splitter (shuffled, fixed seed); the cells below apply it to the training split
kf = KFold(n_splits=5, shuffle=True, random_state=786)

In [None]:
# Sanity check: shapes of the fit/validation parts for each of the folds
for train, test in kf.split(X_train_red):
    print(X_train_red[train].shape, y_train_red[train].shape, X_train_red[test].shape, y_train_red[test].shape)

In [None]:
#Confusion matrix assesses the performance of a classification model (F score types) (Precision and recall)
from sklearn.metrics import confusion_matrix

In [None]:
def plot_cm(y_val, y_pred, labels=None):
    """Draw a confusion-matrix heat map for one set of predictions.

    Parameters
    ----------
    y_val : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y_val``.
    labels : sequence, optional
        Class labels in the order they should appear on both axes. When
        omitted, the sorted set of labels found in ``y_val`` and ``y_pred``
        is used.

    Returns
    -------
    None
        A new matplotlib figure is created; it is neither shown nor closed here.
    """
    if labels is None:
        # Derive the classes from the data. The former hard-coded [1, 0] made
        # confusion_matrix ignore every sample whose label was neither 0 nor 1.
        labels = np.unique(np.concatenate([np.ravel(y_val), np.ravel(y_pred)])).tolist()
    else:
        labels = list(labels)

    # `labels` is keyword-only in current scikit-learn; passing it positionally raises TypeError.
    cm = confusion_matrix(y_val, y_pred, labels=labels)

    fig, ax = plt.subplots()
    cax = ax.matshow(cm)
    ax.set_title('Confusion matrix of the classifier')
    fig.colorbar(cax)

    # Fix the tick positions first so each tick label lines up with its matrix row/column.
    positions = list(range(len(labels)))
    ax.set_xticks(positions)
    ax.set_yticks(positions)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    
#tp=true positive, fn=false negative, fp=false positive, tn=true negative

In [None]:
import pickle
from sklearn.preprocessing import OneHotEncoder
# Dense one-hot encoding. The keyword was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Fit on the training split only; the test split reuses the learned categories.
X_train_red_onehot = encoder.fit_transform(X_train_red)
X_test_red_onehot = encoder.transform(X_test_red)

# Persist the fitted encoder; the context manager closes the file handle.
with open("One_Hot_Encoder", 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)

## Categorical Naive Bayes

In [None]:
from sklearn import metrics
from pprint import pprint
from sklearn.naive_bayes import CategoricalNB

In [None]:
def to_use_GNB(X_train_red, y_train_red):
    """Cross-validate a Categorical Naive Bayes classifier over the folds of the notebook-level ``kf``.

    Parameters
    ----------
    X_train_red : numpy.ndarray
        Feature matrix, indexed by row with the fold index arrays.
    y_train_red : numpy.ndarray
        Target labels aligned with ``X_train_red``.

    Returns
    -------
    numpy.ndarray
        Fold averages of (accuracy, precision, recall, FPR, F1), in that order.
    """
    fold_metrics = []

    for fit_idx, val_idx in kf.split(X_train_red):
        model = CategoricalNB().fit(X_train_red[fit_idx], y_train_red[fit_idx])
        predicted = model.predict(X_train_red[val_idx])

        # Cells of the 2x2 matrix in row-major order; the first row/column is
        # treated as the positive class.
        tp, fn, fp, tn = metrics.confusion_matrix(y_train_red[val_idx], predicted).ravel()

        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        fold_metrics.append((
            (tn+tp)/(fp+fn+tp+tn),
            precision,
            recall,
            fp/(fp+tn),
            2*precision*recall/(precision + recall),
        ))

    return np.mean(fold_metrics, axis=0)

# Fold-averaged (accuracy, precision, recall, FPR, F1) for Naive Bayes on the one-hot training data
metric_GNB = to_use_GNB(X_train_red_onehot, y_train_red)
print(metric_GNB)

## K-Nearest Neighbours

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
def gridsearch_KNN(X_train_red_onehot, y_train_red):
    """Grid-search the neighbour count and distance metric for KNN with 3-fold CV.

    Returns
    -------
    dict
        The best parameter combination found (``best_params_`` of the search).
    """
    search_space = dict(
        n_neighbors=[3, 5, 7, 9, 15],
        metric=['euclidean', 'manhattan'],
    )
    search = GridSearchCV(KNeighborsClassifier(), search_space, cv = 3)
    return search.fit(X_train_red_onehot, y_train_red).best_params_

# Best KNN hyper-parameters according to the grid search
bestparams_knn = gridsearch_KNN(X_train_red_onehot, y_train_red)
print(bestparams_knn)

In [None]:
def to_use_KNN(X_train_red, y_train_red):
    """Cross-validate a 3-nearest-neighbour (euclidean) classifier over the folds of the notebook-level ``kf``.

    Returns
    -------
    numpy.ndarray
        Fold averages of (accuracy, precision, recall, FPR, F1), in that order.
    """
    per_fold = []
    for fit_rows, val_rows in kf.split(X_train_red):
        knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
        knn.fit(X_train_red[fit_rows], y_train_red[fit_rows])

        guessed = knn.predict(X_train_red[val_rows])

        # Row-major cells of the 2x2 matrix; the first row/column is the positive class.
        cells = metrics.confusion_matrix(y_train_red[val_rows], guessed)
        tp, fn, fp, tn = cells.ravel()

        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        row = (
            (tn+tp)/(fp+fn+tp+tn),
            precision,
            recall,
            fp/(fp+tn),
            2*precision*recall/(precision + recall),
        )
        per_fold.append(row)

    return np.mean(per_fold, axis=0)

# Fold-averaged (accuracy, precision, recall, FPR, F1) for KNN on the one-hot training data
metric_KNN = to_use_KNN(X_train_red_onehot, y_train_red)
print(metric_KNN)

## XGBoost

In [None]:
#!pip install xgboost
from xgboost import XGBClassifier

In [None]:
def to_use_XGB(X_train_red, y_train_red):
    """Cross-validate the XGBoost classifier over the folds of the notebook-level ``kf``.

    Parameters
    ----------
    X_train_red : numpy.ndarray
        Feature matrix, indexed by row with the fold index arrays.
    y_train_red : numpy.ndarray
        Target labels aligned with ``X_train_red``; any label values are
        accepted, they are encoded internally.

    Returns
    -------
    numpy.ndarray
        Fold averages of (accuracy, precision, recall, FPR, F1), in that order.
    """
    # Recent XGBoost releases only accept class labels 0..n_classes-1 and reject
    # e.g. {1, 2}. Train on encoded labels and translate predictions back, so the
    # metrics are still computed on the original label values.
    label_encoder = LabelEncoder()
    y_codes = label_encoder.fit_transform(y_train_red)

    accuracy_scores_xgb = []
    for train, val in kf.split(X_train_red):

        # `verbosity` replaces the deprecated `silent` argument (1 = warnings only).
        clf_xgb = XGBClassifier(verbosity=1,
                        scale_pos_weight=1,
                        learning_rate=0.01,
                        colsample_bytree = 0.4,
                        subsample = 0.8,
                        objective='binary:logistic',
                        n_estimators=1000,
                        reg_alpha = 0.3,
                        max_depth=4,
                        gamma=10)

        #Train Model
        clf_xgb = clf_xgb.fit(X_train_red[train], y_codes[train])

        #Predict, then map the encoded predictions back to the original labels
        predictions = label_encoder.inverse_transform(clf_xgb.predict(X_train_red[val]))

        # Row-major cells of the 2x2 matrix; the first row/column is the positive class.
        tp, fn, fp, tn = metrics.confusion_matrix(y_train_red[val], predictions).ravel()
        accuracy = (tn+tp)/(fp+fn+tp+tn)
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        fpr = fp/(fp+tn)
        f1 = 2*precision*recall/(precision + recall)
        accuracy_scores_xgb.append((accuracy, precision, recall, fpr, f1))

    return np.mean(accuracy_scores_xgb, axis=0)
    
# Fold-averaged (accuracy, precision, recall, FPR, F1) for XGBoost on the one-hot training data
metric_xgb = to_use_XGB(X_train_red_onehot, y_train_red)
print(metric_xgb)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
def grid_search_decision_tree(X_train_red, y_train_red):
    """Grid-search ``max_depth`` (1 to 19) for a decision tree with a fixed random state.

    Returns
    -------
    dict
        The best parameter combination found (``best_params_`` of the search).
    """
    search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=5),
        param_grid={"max_depth": range(1,20), "random_state":[5]},
    )
    return search.fit(X_train_red, y_train_red).best_params_

# Best decision-tree hyper-parameters according to the grid search
best_params_decision_tree = grid_search_decision_tree(X_train_red_onehot, y_train_red)
print(best_params_decision_tree)

In [None]:
def to_use_decisiontree(X_train_red, y_train_red):
    """Cross-validate a depth-18 decision tree over the folds of the notebook-level ``kf``.

    Returns
    -------
    numpy.ndarray
        Fold averages of (accuracy, precision, recall, FPR, F1), in that order.
    """
    fold_rows = []
    for fit_rows, val_rows in kf.split(X_train_red):
        tree = DecisionTreeClassifier(random_state=5, max_depth=18)
        tree.fit(X_train_red[fit_rows], y_train_red[fit_rows])

        # Row-major cells of the 2x2 matrix; the first row/column is the positive class.
        cells = metrics.confusion_matrix(y_train_red[val_rows], tree.predict(X_train_red[val_rows]))
        tp, fn, fp, tn = cells.ravel()

        prec = tp/(tp+fp)
        rec = tp/(tp+fn)
        fold_rows.append((
            (tn+tp)/(tp+tn+fp+fn),
            prec,
            rec,
            fp/(fp+tn),
            2*prec*rec/(prec+rec),
        ))

    return np.mean(fold_rows, axis=0)

# Fold-averaged (accuracy, precision, recall, FPR, F1) for the decision tree on the one-hot training data
metric_decision_tree = to_use_decisiontree(X_train_red_onehot, y_train_red)
print(metric_decision_tree)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def grid_search_random_forest(X_train_red_onehot, y_train_red):
    """Grid-search the tree count and ``max_features`` for a random forest with 5-fold CV.

    Parameters
    ----------
    X_train_red_onehot : array-like
        Feature matrix the search is fitted on.
    y_train_red : array-like
        Target labels aligned with ``X_train_red_onehot``.

    Returns
    -------
    dict
        The best parameter combination found (``best_params_`` of the search).
    """
    rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True)
    # 'auto' is left out: for classifiers it was an alias of 'sqrt', and the alias
    # was removed in scikit-learn 1.3.
    param_grid = {
      'n_estimators': [200, 700],
      'max_features': ['sqrt', 'log2']
    }
    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
    # Fit on the argument; this used to read the notebook-level X_train_red and so
    # ignored the matrix that was passed in.
    CV_rfc.fit(X_train_red_onehot, y_train_red)
    return CV_rfc.best_params_

# Best random-forest hyper-parameters according to the grid search
best_params_rfc = grid_search_random_forest(X_train_red_onehot, y_train_red)
print(best_params_rfc)

In [None]:
def to_use_rfc(X_train_red_onehot, y_train_red):
    """Cross-validate a 200-tree random forest over the folds of the notebook-level ``kf``.

    Parameters
    ----------
    X_train_red_onehot : numpy.ndarray
        Feature matrix, indexed by row with the fold index arrays.
    y_train_red : numpy.ndarray
        Target labels aligned with ``X_train_red_onehot``.

    Returns
    -------
    numpy.ndarray
        Fold averages of (accuracy, precision, recall, FPR, F1), in that order.
    """
    accuracy_scores_forest = []
    # Build the folds from the argument; this used to split the notebook-level
    # X_train_red, which only worked because both matrices have the same row count.
    for train, val in kf.split(X_train_red_onehot):
        # 'sqrt' is what the removed 'auto' alias meant for classifiers.
        rforest = RandomForestClassifier(max_features= 'sqrt' ,n_estimators=200)
        rforest = rforest.fit(X_train_red_onehot[train],y_train_red[train])
        # Row-major cells of the 2x2 matrix; the first row/column is the positive class.
        tp1, fn1, fp1, tn1 = metrics.confusion_matrix(y_train_red[val],rforest.predict(X_train_red_onehot[val])).ravel()
        r_accuracy = (tn1 + tp1)/(tn1+tp1+fn1+fp1)
        r_prec = tp1/(tp1+fp1)
        r_rec = tp1/(tp1+fn1)
        r_fpr = fp1/(fp1+tn1)
        r_f1 = 2*(r_prec)*r_rec/(r_prec+r_rec)
        accuracy_scores_forest.append((r_accuracy,r_prec,r_rec,r_fpr,r_f1))

    return np.mean(accuracy_scores_forest, axis=0)
  

# Fold-averaged (accuracy, precision, recall, FPR, F1) for the random forest on the one-hot training data
metric_rforest = to_use_rfc(X_train_red_onehot, y_train_red)
print(metric_rforest)

## Support Vector Machines

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

In [None]:
def grid_search_svm(X_train_red_onehot, y_train_red):
    """Grid-search C, gamma and kernel for an SVM classifier with 3-fold CV.

    Returns
    -------
    dict
        The best parameter combination found (``best_params_`` of the search).
    """
    candidates = dict(
        C=[0.1, 1, 10, 100],
        gamma=[1, 0.1, 0.01, 0.001],
        kernel=['rbf', 'linear'],
    )
    search = GridSearchCV(svm.SVC(), candidates, cv = 3)
    return search.fit(X_train_red_onehot, y_train_red).best_params_

# Best SVM hyper-parameters according to the grid search
bestparams_svm = grid_search_svm(X_train_red_onehot, y_train_red)
print(bestparams_svm)

In [None]:
def to_use_SVM(X_train_red_onehot, y_train_red):
    """Cross-validate the RBF SVM over the folds of the notebook-level ``kf``.

    A confusion-matrix figure is drawn for every fold via ``plot_cm``.

    Parameters
    ----------
    X_train_red_onehot : numpy.ndarray
        Feature matrix, indexed by row with the fold index arrays.
    y_train_red : numpy.ndarray
        Target labels aligned with ``X_train_red_onehot``.

    Returns
    -------
    numpy.ndarray
        Fold averages of (accuracy, precision, recall, FPR, F1), in that order.
    """
    accuracy_scores_svm = []
    for train, val in kf.split(X_train_red_onehot):
        svm_clf = svm.SVC(kernel='rbf', gamma = 0.1, C = 10, probability=True)
        svm_clf = svm_clf.fit(X_train_red_onehot[train], y_train_red[train])

        # Predict once and reuse the result for both the metrics and the plot.
        predictions = svm_clf.predict(X_train_red_onehot[val])

        # Row-major cells of the 2x2 matrix; the first row/column is the positive class.
        tp, fn, fp, tn = metrics.confusion_matrix(y_train_red[val], predictions).ravel()
        plot_cm(y_train_red[val], predictions)

        accuracy = (tn+tp)/(fp+fn+tp+tn)
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        fpr = fp/(fp+tn)
        f1 = 2*precision*recall/(precision + recall)
        accuracy_scores_svm.append((accuracy, precision, recall, fpr, f1))

    return np.mean(accuracy_scores_svm, axis=0)

# Fold-averaged (accuracy, precision, recall, FPR, F1) for the SVM on the one-hot training data
metric_svm = to_use_SVM(X_train_red_onehot, y_train_red)
print(metric_svm)


In [None]:
print("\t===============================================Training Metrics===============================================\n")
labels = ["Accuracy: ", "Precision: ", "Recall: ", "FPR: ", "F1: "]

# One (heading, metric vector) pair per model, in display order
training_reports = [
    ("Naive Bayes\n", metric_GNB),
    ("KNN\n", metric_KNN),
    ("XGBoost\n", metric_xgb),
    ("Decision Trees\n", metric_decision_tree),
    ("Random Forest\n", metric_rforest),
    ("Support Vector Machines\n", metric_svm),
]

for position, (heading, values) in enumerate(training_reports):
    # Blank-line separator between consecutive models (none before the first)
    if position > 0:
        print("\n")
    print(heading)
    for i in range(len(values)):
        print(labels[i],values[i])

In [None]:
from sklearn.metrics import plot_confusion_matrix


def to_test_KNN(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red):
    """Fit the KNN model on the training split and score it on the test split.

    The test-set confusion matrix is also plotted.

    Returns
    -------
    list
        A one-element list holding the tuple (accuracy, precision, recall, FPR, F1).
    """
    clf_neigh = KNeighborsClassifier(n_neighbors=3, metric='euclidean')

    #Train model
    clf_neigh = clf_neigh.fit(X_train_red_onehot, y_train_red)

    #Make predictions (once; reused for the metrics and for the plot)
    predictions = clf_neigh.predict(X_test_red_onehot)

    # Pin the label order to the classes the model was fitted on, so the matrix
    # is laid out the same way for the metrics and for the plot.
    cm = metrics.confusion_matrix(y_test_red, predictions, labels=clf_neigh.classes_)
    tp, fn, fp, tn = cm.ravel()
    accuracy = (tn+tp)/(fp+fn+tp+tn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    fpr = fp/(fp+tn)
    f1 = 2*precision*recall/(precision + recall)

    # plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
    # draws the same figure from the matrix computed above.
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf_neigh.classes_).plot()
    plt.show()

    return [(accuracy, precision, recall, fpr, f1)]

# KNN: fit on the training split, evaluate on the held-out test split
test_accuracy_knn = to_test_KNN(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red)


def to_test_XGB(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red):
    """Fit the XGBoost model on the training split and score it on the test split.

    The test-set confusion matrix is also plotted.

    Returns
    -------
    list
        A one-element list holding the tuple (accuracy, precision, recall, FPR, F1).
    """
    # Recent XGBoost releases only accept class labels 0..n_classes-1; train on
    # encoded labels and translate the predictions back to the original values.
    label_encoder = LabelEncoder()
    y_train_codes = label_encoder.fit_transform(y_train_red)

    # `verbosity` replaces the deprecated `silent` argument (1 = warnings only).
    clf_xgb = XGBClassifier(verbosity=1, scale_pos_weight=1,learning_rate=0.01, colsample_bytree = 0.4,subsample = 0.8, objective='binary:logistic',  n_estimators=1000, reg_alpha = 0.3,max_depth=4, gamma=10)

    #Train Model
    clf_xgb = clf_xgb.fit(X_train_red_onehot, y_train_codes)

    predictions = label_encoder.inverse_transform(clf_xgb.predict(X_test_red_onehot))

    # One matrix, in the original label order, for both the metrics and the plot.
    cm = metrics.confusion_matrix(y_test_red, predictions, labels=label_encoder.classes_)
    tp, fn, fp, tn = cm.ravel()
    accuracy = (tn+tp)/(fp+fn+tp+tn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    fpr = fp/(fp+tn)
    f1 = 2*precision*recall/(precision + recall)

    # plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
    # draws the same figure from the matrix computed above.
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_).plot()
    plt.show()

    return [(accuracy, precision, recall, fpr, f1)]

# XGBoost: fit on the training split, evaluate on the held-out test split
test_accuracy_xgb = to_test_XGB(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red)

def to_test_decisiontree(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red):
    """Fit the depth-18 decision tree on the training split and score it on the test split.

    The test-set confusion matrix is also plotted.

    Returns
    -------
    list
        A one-element list holding the tuple (accuracy, precision, recall, FPR, F1).
    """
    dt = DecisionTreeClassifier(random_state=5, max_depth=18)
    dt = dt.fit(X_train_red_onehot, y_train_red)

    # Predict once; the same matrix feeds the metrics and the plot.
    predictions = dt.predict(X_test_red_onehot)
    cm = metrics.confusion_matrix(y_test_red, predictions, labels=dt.classes_)
    tp, fn, fp, tn = cm.ravel()
    dt_accuracy = (tn+tp)/(tp+tn+fp+fn)
    dt_prec = tp/(tp+fp)
    dt_rec = tp/(tp+fn)
    dt_fpr = fp/(fp+tn)
    dt_f1 = 2*dt_prec*dt_rec/(dt_prec+dt_rec)

    # plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
    # draws the same figure from the matrix computed above.
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=dt.classes_).plot()
    plt.show()

    return [(dt_accuracy,dt_prec,dt_rec,dt_fpr,dt_f1)]

# Decision tree: fit on the training split, evaluate on the held-out test split
test_accuracy_dt = to_test_decisiontree(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red)

def to_test_rfc(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red):
    """Fit the 200-tree random forest on the training split and score it on the test split.

    The fitted model is pickled to ``RF_Final_Model.pkl`` and the test-set
    confusion matrix is plotted.

    Returns
    -------
    list
        A one-element list holding the tuple (accuracy, precision, recall, FPR, F1).
    """
    rforest = RandomForestClassifier(max_features= 'sqrt' ,n_estimators=200)
    rforest = rforest.fit(X_train_red_onehot,y_train_red)

    # Predict once; the same matrix feeds the metrics and the plot.
    predictions = rforest.predict(X_test_red_onehot)
    cm = metrics.confusion_matrix(y_test_red, predictions, labels=rforest.classes_)
    tp, fn, fp, tn = cm.ravel()
    accuracy = (tn+tp)/(fp+fn+tp+tn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    fpr = fp/(fp+tn)
    f1 = 2*precision*recall/(precision + recall)

    # The context manager closes the file handle once the model is written.
    with open("RF_Final_Model.pkl", 'wb') as model_file:
        pickle.dump(rforest, model_file)

    # plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
    # draws the same figure from the matrix computed above.
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rforest.classes_).plot()
    plt.show()

    return [(accuracy, precision, recall, fpr, f1)]
  

# Random forest: fit on the training split, evaluate on the held-out test split (also saves the model)
test_accuracy_rfc = to_test_rfc(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red)


def to_test_SVM(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red):
    """Fit the RBF SVM on the training split and score it on the test split.

    The fitted model is pickled to ``SVM_Final_Model.pkl`` and the test-set
    confusion matrix is plotted.

    Returns
    -------
    list
        A one-element list holding the tuple (accuracy, precision, recall, FPR, F1).
    """
    svm_clf = svm.SVC(kernel='rbf', gamma = 0.1, C = 10, probability=True)
    svm_clf = svm_clf.fit(X_train_red_onehot, y_train_red)

    # Predict once; the same matrix feeds the metrics and the plot.
    predictions = svm_clf.predict(X_test_red_onehot)
    cm = metrics.confusion_matrix(y_test_red, predictions, labels=svm_clf.classes_)
    tp, fn, fp, tn = cm.ravel()
    accuracy = (tn+tp)/(fp+fn+tp+tn)
    precision = tp/(tp+fp)
    recall = tp/(tp+fn)
    fpr = fp/(fp+tn)
    f1 = 2*precision*recall/(precision + recall)

    # The context manager closes the file handle once the model is written.
    with open("SVM_Final_Model.pkl", 'wb') as model_file:
        pickle.dump(svm_clf, model_file)

    # plot_confusion_matrix was removed in scikit-learn 1.2; ConfusionMatrixDisplay
    # draws the same figure from the matrix computed above.
    metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=svm_clf.classes_).plot()
    plt.show()

    return [(accuracy, precision, recall, fpr, f1)]

# SVM: fit on the training split, evaluate on the held-out test split (also saves the model)
test_accuracy_svm = to_test_SVM(X_test_red_onehot, y_test_red, X_train_red_onehot, y_train_red)


print("\t==============================================Testing Metrics==================================================\n")

# One line per model: heading followed by its [(accuracy, precision, recall, FPR, F1)] list
testing_reports = (
    ("KNN: \n", test_accuracy_knn),
    ("XGBoost: \n", test_accuracy_xgb),
    ("Decision Tree: \n", test_accuracy_dt),
    ("Random Forest: \n", test_accuracy_rfc),
    ("Support Vector Machines: \n", test_accuracy_svm),
)
for heading, scores in testing_reports:
    print(heading,scores)

In [None]:
# Reload the two saved models and report accuracy, recall and FPR.
# NOTE(review): both models are scored on the training data they were fitted on,
# so these figures describe fit rather than generalisation - confirm this is intended
# before relying on the conclusion printed at the end of the cell.
# The pickles are produced by this notebook; never unpickle files from an untrusted source.

# --- SVM ---
filename_dt = 'SVM_Final_Model.pkl'
with open(filename_dt, 'rb') as model_file:  # closes the handle after loading
    loaded_model = pickle.load(model_file)
tp, fn, fp, tn = metrics.confusion_matrix(y_train_red, loaded_model.predict(X_train_red_onehot)).ravel()
accuracy = (tn+tp)/(fp+fn+tp+tn)
recall = tp/(tp+fn)
fpr = fp/(fp+tn)
print(accuracy)
print(recall)
print(fpr,"\n\n")

# --- Random forest ---
filename_dt = 'RF_Final_Model.pkl'
with open(filename_dt, 'rb') as model_file:  # closes the handle after loading
    loaded_model = pickle.load(model_file)
tp, fn, fp, tn = metrics.confusion_matrix(y_train_red, loaded_model.predict(X_train_red_onehot)).ravel()
accuracy = (tn+tp)/(fp+fn+tp+tn)
recall = tp/(tp+fn)
fpr = fp/(fp+tn)
print(accuracy)
print(recall)
print(fpr,"\n")

print("Random Forest performs best with least no of false positives and high accuracy and recall, hence we will go forward with this algorithm")


In [None]:
from sklearn.manifold import TSNE

def tsne_analysis():
    """Visualise the saved random forest's training-set class probabilities with t-SNE.

    Loads the pickled model from ``RF_Final_Model.pkl``, takes its
    ``predict_proba`` output for the notebook-level ``X_train_red_onehot``,
    embeds it in two dimensions and shows a scatter plot coloured by the true
    label from ``y_train_red``.

    Returns
    -------
    None
    """
    # The model is saved as 'RF_Final_Model.pkl'; the extension was missing here,
    # so the load failed with FileNotFoundError.
    filename_dt = 'RF_Final_Model.pkl'
    with open(filename_dt, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    class_probabilities = loaded_model.predict_proba(X_train_red_onehot)
    df = pd.DataFrame(class_probabilities)
    df['y'] = y_train_red

    # The iteration count is left at its default of 1000; the keyword for it was
    # renamed in recent scikit-learn, so omitting it works on old and new versions.
    tsne = TSNE(n_components = 2, perplexity = 50, verbose = 1)
    tsne_res = tsne.fit_transform(class_probabilities)

    df['TSNE 1st component'] = tsne_res[:,0]
    df['TSNE 2nd component'] = tsne_res[:,1]

    sns.scatterplot(
        x = "TSNE 1st component", y = "TSNE 2nd component",
        hue = 'y',
        palette = sns.color_palette("hls", 2),
        data = df,
        legend = "full",
        alpha = 0.3
    )
    plt.show()

# Run the t-SNE visualisation for the saved random forest
tsne_analysis()