## Importing libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

# data manipulation and numeric operations
import pandas as pd
import numpy as np

# save and load serialized objects
import pickle

# track progress of function execution
from tqdm import tqdm
import os

# metrics
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

In [2]:
test_df = pd.read_csv('test_dataset.csv')

test_df.shape

(4587, 27)

In [3]:
# randomly selecting one legitimate and one phishing url
test_legi_df = test_df[test_df.result == 0]
url_legi = test_legi_df.sample().url.values[0]
print('random legitimate url: {}'.format(url_legi))

test_phish_df = test_df[test_df.result == 1]
url_phish = test_phish_df.sample().url.values[0]
print('random phishing url: {}'.format(url_phish))

random legitimate url: http://mitsubishielectric.co.uk
random phishing url: https://emsi-lobo.firebaseapp.com/


In [4]:
model = pickle.load(open('final_model.sav', 'rb'))

In [5]:
# !pip install import_ipynb # https://newbedev.com/ipynb-import-another-ipynb-file
import import_ipynb
import FeatureExtraction

importing Jupyter notebook from FeatureExtraction.ipynb


In [6]:
def function1(url):
  '''This function contains the steps to feature extraction, preprocessing and model prediction'''
  # getting featured dataframe from url
  X_test = FeatureExtraction.extract_all_features(url)
  # preprocessing
  X_test['statistical_report'] = X_test['statistical_report'].fillna(-1)
  X_test['page_favicon'] = X_test['page_favicon'].fillna(-1)
  X_test['redirection_count'] = X_test['redirection_count'].fillna(1)

# drop textual feature, dependent feature
  X_test.drop(columns = ['url', 'url_google_index'], inplace= True)

  # removing constant features
  X_test.drop(columns = ['url_having_IP_Address', "domain_registration_length"], inplace= True)

  # applying FE
  X_test['length_depth'] = X_test.url_length +  X_test.url_depth
  X_test['port_redirection'] = X_test.url_standard_port +  X_test.redirection_count
  X_test['var_median'] = X_test.median(axis = 1)
  X_test['var_max'] = X_test.max(axis = 1)
  X_test['var_std'] = X_test.std(axis = 1)
  X_test['var_sum'] = X_test.sum(axis = 1)

  return model.predict(X_test)

In [7]:
def checker(pred):
    if pred == 0:
        print('Legitimate')
    else:
        print('Phishing')

In [8]:
# checking a random legitimate url
pred = function1(url_legi)
checker(pred)

Error trying to connect to socket: closing socket
Legitimate


In [9]:
# checking a random phishing url
pred = function1(url_phish)
checker(pred)

Phishing


In [16]:
def function2(X_test, y):
  '''This function returns the predicted value and its probability as its metric'''
  # preprocessing
  X_test['statistical_report'] = X_test['statistical_report'].fillna(-1)

  X_test['page_favicon'] = X_test['page_favicon'].fillna(-1)

  # filling na with 1    
  X_test['redirection_count'] = X_test['redirection_count'].fillna(1)

  # drop textual feature
  X_test.drop(columns = ['url', 'url_google_index'], inplace= True)

  # removing constant features
  X_test.drop(columns = ['url_having_IP_Address', "domain_registration_length"], inplace= True)

  # applying FE
  X_test['length_depth'] = X_test.url_length +  X_test.url_depth
  X_test['port_redirection'] = X_test.url_standard_port +  X_test.redirection_count
  X_test['var_median'] = X_test.median(axis = 1)
  X_test['var_max'] = X_test.max(axis = 1)
  X_test['var_std'] = X_test.std(axis = 1)
  X_test['var_sum'] = X_test.sum(axis = 1)

  return [model.predict(X_test), model.predict_proba(X_test)]

In [17]:
test_sample = test_df.sample()
X = test_sample.drop(columns = ['result'])
y = test_sample.result

print('Actual value: {}, predicted value and probability score: {}'.format(y, function2(X, y)))

Actual value: 1001    1
Name: result, dtype: int64, predicted value and probability score: [array([1], dtype=int64), array([[0.012, 0.988]])]
