In [3]:
import os
import re
from urllib.parse import urlparse

import joblib
import pandas as pd
import requests
import tldextract
from nltk.tokenize import RegexpTokenizer
from sklearn.base import BaseEstimator, TransformerMixin

ModuleNotFoundError: No module named 'pandas'

In [3]:
def parse_url(url: str):
    """Split ``url`` into its components.

    URLs that do not start with ``http://`` or ``https://`` are parsed as if
    they were prefixed with ``http://`` so that the host ends up in
    ``netloc``; for those the returned ``scheme`` is ``None``.

    Args:
        url: The URL string to split.

    Returns:
        A dict with the keys ``url`` (the input, unchanged), ``scheme``,
        ``netloc``, ``path``, ``params``, ``query`` and ``fragment``, or
        ``None`` when ``url`` is not a string or cannot be parsed.
    """
    try:
        # Case-insensitive so that e.g. "HTTP://host/x" is not mistaken for a
        # schemeless URL (which would push the host into ``path``).
        has_scheme = url.lower().startswith(('http://', 'https://'))
        parsed_url = urlparse(url if has_scheme else f"http://{url}")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: ``url`` is not a str.
        # ValueError: urlparse rejected the input (e.g. malformed IPv6 host).
        return None

    return {
        "url": url,
        "scheme": parsed_url.scheme if has_scheme else None,
        "netloc": parsed_url.netloc,
        "path": parsed_url.path,
        "params": parsed_url.params,
        "query": parsed_url.query,
        "fragment": parsed_url.fragment,
    }

In [4]:
def get_num_subdomains(netloc: str):
    """Count the dot-separated labels in the subdomain part of ``netloc``.

    The subdomain is whatever ``tldextract.extract`` reports as
    ``.subdomain``; an empty subdomain counts as 0.
    """
    subdomain_part = tldextract.extract(netloc).subdomain
    # N dots separate N + 1 labels, which is exactly what split() yields.
    return len(subdomain_part.split('.')) if subdomain_part != "" else 0

In [5]:
def tokenize_domain(netloc: str):
    """Return the tokens of the host name, without its suffix, as one string.

    The subdomain and domain reported by ``tldextract.extract`` are joined
    with a dot, tokenized with ``RegexpTokenizer(r'[A-Za-z]+')`` and the
    resulting tokens are joined with single spaces.
    """
    parts = tldextract.extract(netloc)
    host_without_suffix = f"{parts.subdomain}.{parts.domain}"
    alpha_tokens = RegexpTokenizer(r'[A-Za-z]+').tokenize(host_without_suffix)
    return " ".join(str(token) for token in alpha_tokens)


In [6]:
def predict_url(model, url: str):
    """Classify a single URL with ``model``.

    Builds a one-row feature frame from the URL (length, TLD, IP flag,
    punctuation counts, subdomain count and letter-only tokens of the host
    and path) and passes it to the model.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
            NOTE(review): presumably a pipeline that expects exactly the
            columns built here - confirm against the training code.
        url: The URL to score.

    Returns:
        ``(label, confidence)`` where ``label`` is the first predicted label
        and ``confidence`` is the largest class probability for that row.

    Raises:
        ValueError: If ``url`` cannot be parsed by ``parse_url``.
    """
    parsed_url = parse_url(url)
    if parsed_url is None:
        # Fail with a clear message instead of an opaque pandas error later on.
        raise ValueError(f"Could not parse URL: {url!r}")

    # Local tokenizer so the function does not depend on a notebook-level
    # global that is only defined in a later cell.
    path_tokenizer = RegexpTokenizer(r'[A-Za-z]+')

    data = pd.DataFrame.from_records([parsed_url])
    data["length"] = data.url.str.len()
    data["tld"] = data.netloc.apply(lambda nl: tldextract.extract(nl).suffix)
    # Hosts without a recognised suffix get the literal string 'None'.
    data['tld'] = data['tld'].replace('', 'None')
    # True when the host is four dot-separated groups of digits.
    data["is_ip"] = data.netloc.str.fullmatch(r"\d+\.\d+\.\d+\.\d+")
    data['domain_hyphens'] = data.netloc.str.count('-')
    data['domain_underscores'] = data.netloc.str.count('_')
    data['path_hyphens'] = data.path.str.count('-')
    data['path_underscores'] = data.path.str.count('_')
    data['slashes'] = data.path.str.count('/')
    # NOTE(review): Series.str.count takes a regular expression, so '.' matches
    # every character and this column holds the path length, not the number of
    # dots. Left unchanged on purpose: the pickled model was presumably trained
    # on the same feature. Switch to r'\.' only together with retraining.
    data['full_stops'] = data.path.str.count('.')
    data['num_subdomains'] = data['netloc'].apply(get_num_subdomains)
    data['domain_tokens'] = data['netloc'].apply(tokenize_domain)
    data['path_tokens'] = data['path'].apply(
        lambda path: " ".join(map(str, path_tokenizer.tokenize(path)))
    )

    # Drop the raw URL parts; only the engineered columns are fed to the model.
    features = data.drop(
        columns=['url', 'scheme', 'netloc', 'path', 'params', 'query', 'fragment']
    )

    pred = model.predict(features)
    pred_proba = model.predict_proba(features)

    return pred[0], max(pred_proba[0])

In [7]:
class Converter(BaseEstimator, TransformerMixin):
    """Stateless transformer that flattens its input's values with ``ravel()``."""

    def fit(self, x, y=None):
        """Learn nothing; return this instance unchanged."""
        return self

    def transform(self, data_frame):
        """Return ``data_frame.values`` flattened via ``ravel()``."""
        underlying_values = data_frame.values
        flattened = underlying_values.ravel()
        return flattened

In [8]:
# Notebook-level RegexpTokenizer built from the pattern [A-Za-z]+
# (the same pattern tokenize_domain uses for host names).
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [9]:
# Load the persisted artefacts; both paths are relative to the working directory.
# NOTE(review): joblib.load unpickles arbitrary Python objects - only load
# files that come from a trusted source.
# NOTE(review): presumably the pickles reference the Converter class, so that
# cell has to run before this one - TODO confirm.
clf = joblib.load('naive_bayes_classifier.pkl')
# NOTE(review): `preprocessor` is not referenced by any other visible cell.
preprocessor = joblib.load('preprocessor.pkl')

In [11]:
API_URL = "https://api-inference.huggingface.co/models/Jagannath/phishNet"

# The access token is read from the environment instead of being hard-coded.
# NOTE(review): a token was previously committed in this cell; treat it as
# leaked and revoke it in the Hugging Face account settings.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN", "")
# Send no Authorization header at all rather than an empty bearer token.
headers = {"Authorization": f"Bearer {HF_API_TOKEN}"} if HF_API_TOKEN else {}


def query(payload, timeout=30):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
    # Surface HTTP errors here instead of handing an error payload to callers.
    response.raise_for_status()
    return response.json()

In [12]:
def extract_urls(text):
    """Return every URL found in ``text``, in order of appearance.

    A URL is a run of non-whitespace characters starting with ``http://``,
    ``https://`` or ``www.``. Trailing sentence punctuation, quotes and
    closing brackets are stripped so that "see http://example.com." yields
    ``http://example.com`` rather than a URL ending in a dot.

    Args:
        text: Free text to scan; ``None`` or an empty string yields ``[]``.

    Returns:
        A list of URL strings (duplicates are kept).
    """
    if not text:
        return []

    # Matches URLs starting with 'http://', 'https://' or 'www.'
    url_pattern = r'\b(https?://[^\s]+|www\.[^\s]+)'
    # Characters that usually belong to the surrounding sentence, not the URL.
    trailing_noise = '.,;:!?)]}>\'"'

    urls = []
    for candidate in re.findall(url_pattern, text):
        cleaned = candidate.rstrip(trailing_noise)
        if cleaned:
            urls.append(cleaned)
    return urls

In [14]:
def get_predictions(url, text):
    """Score a message by its URL(s) and by its text.

    The primary ``url`` is classified with ``clf``. Every URL found in
    ``text`` is classified as well and a majority vote ('good' vs. anything
    else, counted as 'bad') replaces the primary label when the vote is not
    tied. The text itself is classified by the remote model behind
    ``query``; ``LABEL_0`` is mapped to "good", any other label to "bad".

    Args:
        url: Primary URL associated with the message.
        text: Message body.

    Returns:
        ``(url_prediction, url_confidence, text_prediction, text_confidence)``.

    Raises:
        RuntimeError: If the text model's response is not a non-empty list.
    """
    url_prediction, url_confidence = predict_url(clf, url)

    # Collect the confidences per label; the list lengths are the vote counts.
    good_confidences = []
    bad_confidences = []
    for embedded_url in extract_urls(text):
        prediction, confidence = predict_url(clf, embedded_url)
        if prediction == 'good':
            good_confidences.append(confidence)
        else:
            bad_confidences.append(confidence)

    if len(good_confidences) > len(bad_confidences):
        voted_label, voted_confidences = 'good', good_confidences
    elif len(bad_confidences) > len(good_confidences):
        voted_label, voted_confidences = 'bad', bad_confidences
    else:
        # Tie (including "no URLs in the text"): keep the primary prediction.
        voted_label, voted_confidences = None, []

    if voted_label is not None:
        if voted_label != url_prediction:
            # The primary confidence describes the *other* label, so report
            # the mean confidence of the URLs that actually won the vote.
            url_confidence = sum(voted_confidences) / len(voted_confidences)
        url_prediction = voted_label

    # Text prediction
    output = query({
        "inputs": text,
        "parameters": {
            "truncation": "only_first"
        }
    })

    # An error payload (e.g. while the remote model is loading) is not a list
    # of label scores; fail with a readable message instead of KeyError: 0.
    if not isinstance(output, list) or not output:
        raise RuntimeError(f"Unexpected response from text model: {output!r}")

    max_confidence_label = max(output[0], key=lambda x: x['score'])
    predicted_label = "good" if max_confidence_label['label'] == 'LABEL_0' else "bad"
    text_confidence_score = max_confidence_label['score']

    return url_prediction, url_confidence, predicted_label, text_confidence_score

In [15]:
# Sample message body and URL used to exercise get_predictions and
# ensemble_prediction in the cells below.
# NOTE(review): the line before "Copyright" contains mis-encoded characters
# (probably a copyright sign that was decoded with the wrong charset); it is
# left untouched because the string is fed to the models as-is.
text = '''Commbank
     Dear Commonwealth Bank Client
      We are contacting you to inform you that on October 7, 2006 our Account
   Review Team identified some usual activity in your account. In accordance
   with User Agreement and to ensure that your account has not been
   compromised, access to your account was limited. Your account access will
   remain limited until this issue has been resolved.
   We encourage you to log in and perform the steps  necessary to restore your
   account access as soon as possible. Allowing your account access to remain
   limited for an extended period of time may result in further limitations on
   the use of your account and possible account closure.
   In order to confirm your account and to preserve the account stability, you
   are required to login to your account using the following link below:
   [1]http://www.commonwealth-updatesystem.com
   This procedure is performed one time only and it does not require further
   actions on the customer side. This is an automated message, no reply or
   confirmation is required. Thank you for using Commonwealth NetBank!
        Ã‚Â© Copyright Commonwealth Bank of Australia 2005 ABN 48 123 123 124

References

   1. http://www.commonwealth-updatesystem.com/'''
url_test = "http://www.commonwealth-updatesystem.com/"

In [16]:
# Displays (url label, url confidence, text label, text confidence) for the sample.
get_predictions(url_test, text)

('good', 0.682154050869551, 'bad', 0.9621559977531433)

In [20]:
def _combine_predictions(url_prediction, url_confidence, text_prediction, text_confidence):
    """Merge the URL verdict and the text verdict into one ``(label, score)``."""
    # Accept both a real None and the literal string "None" as "no verdict".
    url_missing = url_prediction is None or url_prediction == "None"
    text_missing = text_prediction is None or text_prediction == "None"
    if url_missing and text_missing:
        return None, None
    if url_missing:
        return text_prediction, text_confidence
    if text_missing:
        return url_prediction, url_confidence

    # NOTE(review): when the two models disagree this still averages the
    # confidences of *opposing* labels; kept as-is to preserve the reported
    # numbers, but the score is then not a probability of the printed label.
    score = (url_confidence + text_confidence) / 2
    if url_prediction == text_prediction or url_confidence > text_confidence:
        return url_prediction, score
    return text_prediction, score


def ensemble_prediction(url, text):
    """Print the combined verdict for a message as ``"<label> : <score>"``.

    Calls ``get_predictions(url, text)``. If both models agree, their shared
    label is printed; otherwise the label of the more confident model wins.
    The score is the mean of the two confidences. If one model has no
    verdict, the other model's label and confidence are printed on their own.

    Args:
        url: Primary URL associated with the message.
        text: Message body.

    Returns:
        None; the result is printed.
    """
    label, score = _combine_predictions(*get_predictions(url, text))
    print(str(label) + " : " + str(score))

In [21]:
# Prints the combined "<label> : <score>" verdict for the sample message.
ensemble_prediction(url_test,text)

bad : 0.8221550243113471
