# Functions

In [None]:
!pip install tldextract #If required
!apt install libzbar0 #If required
!pip install pyzbar #If required

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libzbar0 is already the newest version (0.10+doc-10.1build2).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 20 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## QR Code Scanner

In [None]:
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode
from IPython.display import Image
from google.colab.patches import cv2_imshow
import cv2
import numpy as np
from pyzbar.pyzbar import decode

# function to convert the JavaScript object into an OpenCV image
def js_to_image(js_reply):
  """
  Convert the data-URL string returned by the browser into an OpenCV image.

  Params:
          js_reply: string of the form "<header>,<base64 payload>"; only the
                    segment right after the first comma is decoded
  Returns:
          img: OpenCV BGR image produced by cv2.imdecode (flags=1)
  """
  # The payload is the piece that follows the first comma of the data URL.
  encoded_payload = js_reply.split(',')[1]
  raw_buffer = np.frombuffer(b64decode(encoded_payload), dtype=np.uint8)
  return cv2.imdecode(raw_buffer, flags=1)

def take_photo(show=0, quality=1):
    """Capture one webcam frame in Colab and return the QR code text it contains.

    Params:
        show: when truthy, outline the QR code found by OpenCV's detector and
              display the annotated frame with cv2_imshow.
        quality: JPEG quality forwarded to ``canvas.toDataURL`` in the browser.
    Returns:
        The payload of the first symbol found by pyzbar, decoded as UTF-8.
    Raises:
        ValueError: if pyzbar finds no code in the captured frame.
    """
    js = Javascript('''
        async function takePhoto(quality) {
        const div = document.createElement('div');
        const capture = document.createElement('button');
        capture.textContent = 'Capture';
        div.appendChild(capture);

        const video = document.createElement('video');
        video.style.display = 'block';
        const stream = await navigator.mediaDevices.getUserMedia({video: true});

        document.body.appendChild(div);
        div.appendChild(video);
        video.srcObject = stream;
        await video.play();

        // Resize the output to fit the video element.
        google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

        // Wait for Capture to be clicked.
        await new Promise((resolve) => capture.onclick = resolve);

        const canvas = document.createElement('canvas');
        canvas.width = video.videoWidth;
        canvas.height = video.videoHeight;
        canvas.getContext('2d').drawImage(video, 0, 0);
        stream.getVideoTracks()[0].stop();
        div.remove();
        return canvas.toDataURL('image/jpeg', quality);
        }
        ''')
    display(js)

    # Blocks until the user clicks "Capture"; the reply is a JPEG data URL.
    photo_data_url = eval_js('takePhoto({})'.format(quality))
    img = js_to_image(photo_data_url)

    if show:
        # Draw the outline reported by OpenCV's QR detector (display only).
        detector = cv2.QRCodeDetector()
        _, points, _ = detector.detectAndDecode(img)

        if points is not None:
            corners = points[0]
            corner_count = len(corners)
            for i in range(corner_count):
                pt1 = tuple(int(val) for val in corners[i])
                # Wrap around using the real corner count instead of assuming 4.
                pt2 = tuple(int(val) for val in corners[(i + 1) % corner_count])
                cv2.line(img, pt1, pt2, color=(255, 0, 0), thickness=3)

            cv2_imshow(img)

    # pyzbar returns one entry per detected symbol; an empty list means nothing
    # was readable, so fail with a clear message instead of an IndexError.
    symbols = decode(img)
    if not symbols:
        raise ValueError("No QR code detected in the captured image")

    data_str = symbols[0].data.decode("utf-8")
    print('Decoded data: ', data_str)

    return data_str


## Phishing Identifiers

## Jie Sheng

In [125]:
import nltk
nltk.download('punkt')
import tldextract
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from nltk.util import ngrams
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import csr_matrix

#Using IP Addresses within Address Bar
def isIPAddress(url):
    """Return 1 if ``url`` contains four dot-separated groups of 1-3 digits, else 0.

    The groups are not range-checked, so e.g. ``999.999.999.999`` also counts.
    """
    dotted_quad = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', url)
    return 0 if dotted_quad is None else 1

#Long URL (Characters)
def urlLength(url):
    """Return the number of characters in ``url``."""
    char_total = len(url)
    return char_total

#URL with @ Symbol
def hasAt(url):
    """Return 1 when ``url`` contains an '@' character, otherwise 0."""
    return int('@' in url)

#URL with multiple "//" for redirecting
def hasDoubleSlash(url):
    """Return 1 when ``url`` contains '//' anywhere, otherwise 0."""
    return 1 if url.find('//') != -1 else 0

#URL Domain with - Symbol
def hasDash(url):
    """Return 1 when the text before the first '.' contains a '-', otherwise 0.

    If ``url`` has no '.', the whole string is inspected.
    """
    first_label = url.partition('.')[0]
    return int('-' in first_label)

#URL with multiple "." for subdomains
def hasMultipleDots(url):
    """Return 1 when ``url`` contains at least one '.', otherwise 0.

    NOTE(review): despite the name, a single dot is enough to return 1.
    """
    return int(url.count('.') > 0)

#URL with "?"
def hasQuestion(url):
    """Return 1 when ``url`` contains a '?' character, otherwise 0."""
    found = '?' in url
    return 1 if found else 0

#URL with "cmd"
def hasCmd(url):
    """Return 1 when the substring 'cmd' occurs in ``url`` (case-sensitive), else 0."""
    return int(url.find('cmd') >= 0)

#URL with ".php"
def hasPhp(url):
    """Return 1 when the substring '.php' occurs in ``url``, otherwise 0."""
    return 1 if url.count('.php') else 0

#URL with HTTPS in domain
def hasHTTPorHTTPS(url):
    """Return 1 when the substring 'http' occurs anywhere in ``url``, otherwise 0.

    Because 'https' contains 'http', both tokens are covered by one check.
    """
    return int('http' in url)

#Total Digits Domain
def digitsDomain(url):
    """Count the ASCII digits in the part of ``url`` before its first '/'."""
    host_part = url.partition("/")[0]
    digits_only = re.sub("[^0-9]", "", host_part)
    return len(digits_only)

#Total Digits Path
def digitsPath(url):
    """Count the ASCII digits in everything after the first '/' of ``url``.

    Returns 0 when ``url`` contains no '/'.
    """
    _, slash, remainder = url.partition("/")
    if not slash:
        return 0
    return len(re.sub("[^0-9]", "", remainder))

def generate_url_ngrams(n: int, url: str):
    """Build a presence dictionary of the tokens found in ``url``.

    A space is inserted before every character whose index is a multiple of
    ``n``; the spaced text is tokenized with ``nltk.word_tokenize`` and grouped
    into n-grams of ``n`` tokens. Every token that appears in any n-gram
    becomes a key mapped to 1.
    """
    spaced_url = ''.join(
        (' ' + char) if index % n == 0 else char
        for index, char in enumerate(url)
    )
    token_grams = ngrams(sequence=nltk.word_tokenize(spaced_url), n=n)
    return {token: 1 for gram in token_grams for token in gram}


def get_fields_url(url: str):
    """Extract per-component lexical features from a scheme-less URL.

    The URL is split with ``tldextract`` (domain, subdomain, suffix) and with
    ``urlparse`` after prefixing ``http://`` (path, params, query, fragment).

    Returns:
        dict containing
        * the token keys produced by ``generate_url_ngrams(2, domain)``,
        * ``len_<part>`` and ``char<symbol>-<part>`` for every part and every
          symbol in ``-@_?=&./,``,
        * ``letter_len_<part>`` / ``number_len_<part>`` (ASCII letters and
          digits) for the domain and the path.

    Raises:
        Any exception raised while splitting the URL propagates to the caller;
        a non-dict value is never returned in place of the feature dict.
    """
    extracted = tldextract.extract(url)
    parsed = urlparse(f"http://{url}")
    components = {
        "domain": extracted.domain,
        "subdomain": extracted.subdomain,
        "suffix": extracted.suffix,
        "path": parsed.path,
        "params": parsed.params,
        "query": parsed.query,
        "fragment": parsed.fragment,
    }

    features = dict(generate_url_ngrams(2, extracted.domain))

    for name, text in components.items():
        features[f'len_{name}'] = len(text)
        for symbol in "-@_?=&./,":
            features[f'char{symbol}-{name}'] = text.count(symbol)

        # Letter/digit totals are only computed for the domain and the path.
        if name in ("domain", "path"):
            lowered = text.lower()
            features[f'letter_len_{name}'] = sum(
                lowered.count(letter) for letter in "abcdefghijklmnopqrstuvwxyz"
            )
            features[f'number_len_{name}'] = sum(
                lowered.count(digit) for digit in "0123456789"
            )
    return features

def get_features(url):
    """Assemble the complete feature dictionary for one URL.

    A leading ``http://`` or ``https://`` is removed first; the component
    features from ``get_fields_url`` are then extended with the whole-URL
    indicator and count features below, all computed on the stripped URL.
    """
    for scheme_prefix in ("http://", "https://"):
        if url.startswith(scheme_prefix):
            url = url[len(scheme_prefix):]
            break

    features_json = get_fields_url(url)

    whole_url_extractors = (
        ('ip_addr', isIPAddress),
        ('url_len', urlLength),
        ('has_at', hasAt),
        ('has_double_slash', hasDoubleSlash),
        ('has_dash', hasDash),
        ('has_multiple_dots', hasMultipleDots),
        ('has_question_mark', hasQuestion),
        ('has_cmd', hasCmd),
        ('has_php', hasPhp),
        ('has_http', hasHTTPorHTTPS),
        ('digits_domain', digitsDomain),
        ('digits_path', digitsPath),
    )
    for feature_name, extractor in whole_url_extractors:
        features_json[feature_name] = extractor(url)
    return features_json

def feature_to_vector(features_json, pre_processor):
    """Vectorize one feature dict and return it as a SciPy CSR matrix.

    ``pre_processor`` only needs a ``transform`` method that accepts a list
    holding the single feature dict.
    """
    transformed = pre_processor.transform([features_json])
    return csr_matrix(transformed).tocsr()

def get_prediction(vector, lg, xgb, rfc, nn):
    """Combine four model outputs into one 0/1 decision.

    ``nn`` contributes a vote only when its prediction equals 1; the outputs
    of ``lg``, ``xgb`` and ``rfc`` are added as returned. The result is 1 when
    the total is greater than 2, otherwise 0.

    NOTE(review): if ``nn.predict`` yields probabilities rather than hard
    labels, the equality test will rarely be true - confirm the model output.
    """
    nn_vote = 1 if nn.predict(vector) == 1 else 0
    vote_total = lg.predict(vector) + xgb.predict(vector) + rfc.predict(vector) + nn_vote
    return 1 if vote_total > 2 else 0

def url_phishing_predictor_js(url, pre_processor, lg, xgb, rfc, nn):
    """Run the pipeline for one URL: features -> sparse vector -> combined vote."""
    url_features = get_features(url)
    url_vector = feature_to_vector(url_features, pre_processor)
    return get_prediction(url_vector, lg, xgb, rfc, nn)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Jun Long

In [135]:
#!pip install tld
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from tld import get_tld, is_tld

def process_tld(url):
    """Return the network location that ``tld.get_tld`` parses from ``url``.

    Returns None when ``get_tld`` fails for the given URL. Only ordinary
    exceptions are swallowed, so KeyboardInterrupt/SystemExit still propagate.
    """
    try:
        res = get_tld(url, as_object=True, fail_silently=False, fix_protocol=True)
    except Exception:
        return None
    return res.parsed_url.netloc

def abnormal_url(url):
    """Return 1 when the hostname parsed from ``url`` occurs literally in ``url``.

    ``urlparse(url).hostname`` is lower-cased by the standard library and is
    None for URLs without a ``//`` authority part; it is converted with
    ``str`` before the check, so such URLs are tested against the text 'None'.

    The hostname is compared as plain text rather than used as a regular
    expression, so hosts containing characters such as '(' or '+' neither
    raise ``re.error`` nor match unintended text.
    """
    hostname = str(urlparse(url).hostname)
    return 1 if hostname in url else 0

def httpSecure(url):
    """Return 1 when the scheme parsed from ``url`` is exactly 'https', else 0."""
    scheme_text = str(urlparse(url).scheme)
    return int(scheme_text == 'https')

def digit_count(url):
    """Count the characters of ``url`` for which ``str.isnumeric`` is true."""
    return sum(1 for character in url if character.isnumeric())
    
def letter_counter(url):
    """Count the alphabetic characters (``str.isalpha``) in ``url``.

    The whole string is scanned, and an empty string yields 0.

    NOTE(review): models fitted on a 'letters' column computed differently
    would see a shifted feature - confirm against the training pipeline.
    """
    return sum(1 for character in url if character.isalpha())

def Shortining_Service(url):
    """Return 1 when ``url`` contains the name of a known URL-shortening service.

    The check is an unanchored, case-sensitive regex search over the whole
    URL. The pattern is written with raw strings so that ``\\.`` is a regex
    escape and not an invalid Python string escape.
    """
    match = re.search(r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      r'tr\.im|link\.zip\.net',
                      url)
    return 1 if match else 0

def having_ip_address(url):
    """Return 1 when ``url`` contains something that looks like an IP address.

    The alternatives are searched anywhere in the string:
    dotted-decimal IPv4 followed by '/', hexadecimal IPv4 followed by '/',
    a full eight-group IPv6 address, a dotted quad followed by ':port', and
    a dotted quad with an optional '/prefix'.

    Every alternative is separated by '|', so the hexadecimal IPv4 and IPv6
    forms each match on their own.
    """
    ip_pattern = (
        # dotted-decimal IPv4 followed by '/'
        r'(([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])\.'
        r'([01]?\d\d?|2[0-4]\d|25[0-5])\.([01]?\d\d?|2[0-4]\d|25[0-5])/)|'
        # IPv4 written as four hexadecimal bytes, followed by '/'
        r'((0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})\.(0x[0-9a-fA-F]{1,2})/)|'
        # full (uncompressed) IPv6
        r'(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}|'
        # dotted quad with a port
        r'([0-9]+(?:\.[0-9]+){3}:[0-9]+)|'
        # dotted quad with an optional /prefix-length
        r'((?:(?:\d|[01]?\d\d|2[0-4]\d|25[0-5])\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d|\d)(?:/\d{1,2})?)'
    )
    return 1 if re.search(ip_pattern, url) else 0

'''
['url', 'url_len', 'domain', '@', '?', '-', '=', '.',
       '#', '%', '+', '$', '!', '*', ',', '//', 'abnormal_url', 'https',
       'digits', 'letters', 'Shortining_Service', 'having_ip_address'],

'''
def feature_to_input(link):
    """Build the single-row feature frame for one URL.

    Every occurrence of 'www' is removed from the URL first. The returned
    frame holds, in order: ``url_len``, one count column per symbol in the
    list below, ``abnormal_url``, ``https``, ``digits``, ``letters``,
    ``Shortining_Service`` and ``having_ip_address``. The helper ``url`` and
    ``domain`` columns are dropped before returning.
    """
    data = pd.DataFrame({'url': [link]})
    data["url"] = data["url"].replace("www", "", regex=True)
    urls = data['url']

    data['url_len'] = urls.apply(lambda text: len(str(text)))
    data['domain'] = urls.apply(process_tld)

    counted_symbols = ['@', '?', '-', '=', '.', '#', '%', '+', '$', '!', '*', ',', '//']
    for symbol in counted_symbols:
        data[symbol] = urls.apply(lambda text, needle=symbol: text.count(needle))

    derived_columns = (
        ('abnormal_url', abnormal_url),
        ('https', httpSecure),
        ('digits', digit_count),
        ('letters', letter_counter),
        ('Shortining_Service', Shortining_Service),
        ('having_ip_address', having_ip_address),
    )
    for column_name, extractor in derived_columns:
        data[column_name] = urls.apply(extractor)

    data = data.drop(columns=['url', 'domain'])
    return data.iloc[:, :22]

def url_phishing_predictor_jl(link, dt, rf, et):
    """Return 1 when the predictions of the three models sum to at least 2, else 0."""
    features = feature_to_input(link)
    vote_total = dt.predict(features) + rf.predict(features) + et.predict(features)
    return 1 if vote_total >= 2 else 0

## Wee Din

In [127]:
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from urllib.parse import urlparse
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

class Converter(BaseEstimator, TransformerMixin):
    """Stateless transformer that flattens a data frame into a 1-D array."""

    def fit(self, x, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, data_frame):
        """Return ``data_frame.values`` flattened with ``ravel()``."""
        flattened = data_frame.values.ravel()
        return flattened

# Numeric columns are rescaled with MinMaxScaler.
numeric_features = ['length', 'domain_hyphens', 'domain_underscores', 'path_hyphens', 'path_underscores', 'slashes', 'full_stops', 'num_subdomains']
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler())])

# Categorical columns are one-hot encoded; unknown categories are ignored.
categorical_features = ['tld', 'is_ip']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Token columns are flattened to 1-D and then TF-IDF vectorized.
vectorizer_features = ['domain_tokens','path_tokens']
vectorizer_transformer = Pipeline(steps=[
    ('con', Converter()),
    ('tf', TfidfVectorizer())])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('domvec', vectorizer_transformer, ['domain_tokens']),
        ('pathvec', vectorizer_transformer, ['path_tokens'])
    ])

# The three pipelines below all reference the same `preprocessor` object.
svc_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LinearSVC())])

log_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

nb_clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', MultinomialNB())])


# Tokens are maximal runs of ASCII letters.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')
def tokenize_domain(netloc: str) -> str:
    """Return the alphabetic tokens of the subdomain and domain, space-separated.

    The public suffix reported by ``tldextract`` is left out.
    """
    parts = tldextract.extract(netloc)
    host_without_suffix = f"{parts.subdomain}.{parts.domain}"
    return " ".join(str(token) for token in tokenizer.tokenize(host_without_suffix))

def get_num_subdomains(netloc: str) -> int:
    """Return how many dot-separated labels the subdomain part of ``netloc`` has.

    Returns 0 when ``tldextract`` reports an empty subdomain.
    """
    subdomain = tldextract.extract(netloc).subdomain
    return subdomain.count('.') + 1 if subdomain != "" else 0

def parse_url(url: str):
    """Split ``url`` into its ``urlparse`` components.

    When ``url`` does not start with ``http://`` or ``https://`` it is parsed
    as if prefixed with ``http://`` and the reported ``scheme`` is None.

    Returns:
        dict with the keys ``scheme``, ``netloc``, ``path``, ``params``,
        ``query`` and ``fragment``, or None when the URL cannot be parsed
        (for example a non-string input or a malformed bracketed host).
    """
    try:
        has_scheme = url.startswith(('https://', 'http://'))
        parsed_url = urlparse(url if has_scheme else f"http://{url}")
    except Exception:
        # Best effort: unparseable input is reported as None, not raised.
        return None

    return {
        "scheme": parsed_url.scheme if has_scheme else None,
        "netloc": parsed_url.netloc,
        "path": parsed_url.path,
        "params": parsed_url.params,
        "query": parsed_url.query,
        "fragment": parsed_url.fragment,
    }

def feature_as_input(link):
    """Build the single-row feature frame for one URL.

    The URL is parsed with ``parse_url``; rows without a netloc are removed.
    The result keeps the first 12 columns that remain after the raw URL
    components are dropped: length, tld, is_ip, the hyphen/underscore counts,
    slashes, full_stops, num_subdomains, domain_tokens and path_tokens.

    NOTE(review): ``str.count`` takes a regular expression, so the '.'
    pattern used for ``full_stops`` counts every character of the path, not
    only literal dots. Behaviour is kept unchanged here; confirm how the
    fitted models were trained before altering it.
    """
    df_grp = pd.DataFrame({'url': [link]})
    df_grp["parsed_url"] = df_grp.url.apply(parse_url)

    # Expand the parsed dict into one column per URL component.
    expanded_parts = df_grp['parsed_url'].apply(pd.Series)
    df_grp = pd.concat([df_grp.drop(['parsed_url'], axis=1), expanded_parts], axis=1)
    df_grp = df_grp[~df_grp.netloc.isnull()]

    df_grp["length"] = df_grp.url.str.len()
    df_grp["tld"] = df_grp.netloc.apply(lambda nl: tldextract.extract(nl).suffix)
    df_grp['tld'] = df_grp['tld'].replace('','None')
    df_grp["is_ip"] = df_grp.netloc.str.fullmatch(r"\d+\.\d+\.\d+\.\d+")

    # (new column, source column, pattern handed to Series.str.count)
    count_specs = (
        ('domain_hyphens', 'netloc', '-'),
        ('domain_underscores', 'netloc', '_'),
        ('path_hyphens', 'path', '-'),
        ('path_underscores', 'path', '_'),
        ('slashes', 'path', '/'),
        ('full_stops', 'path', '.'),
    )
    for new_column, source_column, pattern in count_specs:
        df_grp[new_column] = df_grp[source_column].str.count(pattern)

    df_grp['num_subdomains'] = df_grp['netloc'].apply(get_num_subdomains)
    df_grp['domain_tokens'] = df_grp['netloc'].apply(tokenize_domain)
    df_grp['path_tokens'] = df_grp['path'].apply(
        lambda path: " ".join(str(token) for token in tokenizer.tokenize(path))
    )

    raw_columns = ['url', 'scheme', 'netloc', 'path', 'params', 'query', 'fragment']
    df_grp = df_grp.drop(columns=raw_columns)

    return df_grp.iloc[:, :12]

def url_phishing_predictor_wd(link, lr, svc, nb):
    """Return 1 when at least two of the three models predict 1, else 0.

    Each model's first prediction is converted with ``int`` before summing.
    """
    features = feature_as_input(link)
    vote_total = 0
    for model in (lr, svc, nb):
        vote_total += int(model.predict(features)[0])
    return 1 if vote_total >= 2 else 0

## Hui Xian

In [128]:
# import pandas as pd
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.neural_network import MLPClassifier
# from xgboost import XGBClassifier
# from sklearn.svm import SVC
# import pickle

# dataset = pd.read_csv('Datasets/urldata.csv')
# dataset = dataset.sample(frac=1).reset_index(drop=True)

# y = dataset.Label
# X = dataset.iloc[:, 1:-1]
# X = X.drop(["Web_Traffic"], axis=1)
# X_train, X_test, y_train, y_test = \
#  train_test_split(X, y, test_size=0.2, random_state=12)

# # XGBoost
# xgb = XGBClassifier(learning_rate=0.4, max_depth=7)

# xgb.fit(X_train, y_train)

# y_test_xgb = xgb.predict(X_test)
# y_train_xgb = xgb.predict(X_train)

# acc_train_xgb = accuracy_score(y_train, y_train_xgb)
# acc_test_xgb = accuracy_score(y_test, y_test_xgb)

# joblib.dump(xgb, "Models/XGBoostClassifierNew")

In [129]:
#!pip install whois
import pandas as pd
from urllib.parse import urlparse,urlencode
import ipaddress
import re
import sys
import whois
from datetime import datetime
import requests

# Domain of the URL
def getDomain(url):
    """Return the network location of ``url`` without a leading ``www.``.

    The netloc is returned for every URL, not only for hosts that start
    with ``www.``.
    """
    domain = urlparse(url).netloc
    prefix = "www."
    if domain.startswith(prefix):
        domain = domain[len(prefix):]
    return domain

# Checks for IP address in URL (Have_IP)
def havingIP(url):
    """Return 1 when ``url`` as a whole is a valid IPv4/IPv6 address literal, else 0.

    NOTE(review): the entire string is handed to ``ipaddress.ip_address``, so
    a full URL such as ``http://1.2.3.4/`` is not recognised and yields 0.
    Confirm whether the host should be extracted first; behaviour is kept
    because a fitted model consumes this feature.
    """
    try:
        ipaddress.ip_address(url)
    except ValueError:
        # ip_address raises ValueError for anything that is not an address.
        return 0
    return 1

# Checks the presence of @ in URL (Have_At)
def haveAtSign(url):
    """Return 1 when ``url`` contains '@', otherwise 0."""
    return int(url.find("@") != -1)

# Finding the length of URL and categorizing (URL_Length)
def getLength(url):
    """Return 0 for URLs shorter than 54 characters and 1 otherwise."""
    return 1 if len(url) >= 54 else 0

# Gives number of '/' in URL (URL_Depth)
def getDepth(url):
    """Return the number of non-empty '/'-separated segments in the URL path."""
    path_segments = urlparse(url).path.split('/')
    return sum(1 for segment in path_segments if len(segment) != 0)

# Checking for redirection '//' in the url (Redirection)
def redirection(url):
    """Return 1 when the last '//' in ``url`` starts beyond index 7, else 0.

    A '//' at index 7 or earlier (or no '//' at all) yields 0.
    """
    last_double_slash = url.rfind('//')
    return 1 if last_double_slash > 7 else 0

# Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
    """Return 1 when the network location of ``url`` contains 'https', else 0."""
    netloc = urlparse(url).netloc
    return int('https' in netloc)

# Regex alternation naming the known URL-shortening services.
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)

# Tiny_URL feature: is a shortening service named anywhere in the URL?
def tinyURL(url):
    """Return 1 when ``shortening_services`` matches anywhere in ``url``, else 0."""
    return 0 if re.search(shortening_services, url) is None else 1

# Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    """Return 1 when the network location of ``url`` contains a '-', else 0."""
    return int('-' in urlparse(url).netloc)

# Survival time of domain: The difference between termination time and creation time (Domain_Age)  
def domainAge(domain_name):
    """Flag domains whose registration span is shorter than six 30-day months.

    Params:
        domain_name: object exposing ``creation_date`` and ``expiration_date``
            (datetimes, 'YYYY-MM-DD' strings, lists or None).
    Returns:
        1 when either date is missing, is a list, cannot be parsed, or when
        the span between the two dates is under six 30-day months; else 0.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date

    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (TypeError, ValueError):
            # TypeError: one of the two values is not a string;
            # ValueError: the text does not follow YYYY-MM-DD.
            return 1

    if expiration_date is None or creation_date is None:
        return 1
    if type(expiration_date) is list or type(creation_date) is list:
        return 1

    ageofdomain = abs((expiration_date - creation_date).days)
    return 1 if (ageofdomain / 30) < 6 else 0

# End time of domain: The difference between termination time and current time (Domain_End)
def domainEnd(domain_name):
    """Classify the distance between the domain's expiration date and now.

    Params:
        domain_name: object exposing ``expiration_date`` (datetime,
            'YYYY-MM-DD' string, list or None).
    Returns:
        1 when the date is missing, is a list or cannot be parsed. Otherwise
        0 when the absolute distance to the current time is under six 30-day
        months, and 1 when it is longer.

    NOTE(review): a near expiry maps to 0 here, while a short span maps to 1
    in ``domainAge`` - confirm the intended polarity.
    """
    expiration_date = domain_name.expiration_date

    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            # The text does not follow YYYY-MM-DD.
            return 1

    if expiration_date is None:
        return 1
    if type(expiration_date) is list:
        return 1

    days_to_end = abs((expiration_date - datetime.now()).days)
    return 0 if (days_to_end / 30) < 6 else 1

# IFrame Redirection (iFrame)
def iframe(response):
    """Return the iFrame feature for an HTTP response.

    Returns 1 when ``response`` is the empty string (no response obtained) or
    when the pattern finds nothing in ``response.text``; returns 0 otherwise.

    NOTE(review): the pattern is a bracketed character class, so it matches
    any single character among ``<>|`` and the letters of 'iframe' and
    'frameBorder' - not the literal tags. Behaviour is kept as is because a
    fitted model consumes this feature; confirm before changing it.
    """
    if response == "":
        return 1
    hits = re.findall(r"[<iframe>|<frameBorder>]", response.text)
    return 0 if hits else 1

# Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(response):
    """Return the Mouse_Over feature for an HTTP response.

    Returns 1 when ``response`` is the empty string or when a ``<script>``
    element mentioning ``onmouseover`` is found on a single line of
    ``response.text``; returns 0 otherwise.
    """
    if response == "":
        return 1
    script_hits = re.findall("<script>.+onmouseover.+</script>", response.text)
    return 1 if script_hits else 0

# Checks the status of the right click attribute (Right_Click)
def rightClick(response):
    """Return the Right_Click feature for an HTTP response.

    Returns 1 when ``response`` is the empty string or when the page text
    has no ``event.button == 2`` test; returns 0 when such a test is found.
    """
    if response == "":
        return 1
    button_checks = re.findall(r"event.button ?== ?2", response.text)
    return 0 if button_checks else 1

# Checks the number of forwardings (Web_Forwards)    
def forwarding(response):
    """Return the Web_Forwards feature for an HTTP response.

    Returns 1 when ``response`` is the empty string or when
    ``response.history`` holds more than two entries; returns 0 otherwise.
    """
    if response == "":
        return 1
    return 1 if len(response.history) > 2 else 0

def featureExtraction(url, request_timeout=10):
    """Compute the 15 features consumed by the XGBoost model for one URL.

    Params:
        url: the URL to analyse.
        request_timeout: seconds to wait for the HTTP request before giving
            up, so that an unresponsive host cannot block the call forever.
    Returns:
        list of 15 values in this order: Have_IP, Have_At, URL_Length,
        URL_Depth, Redirection, https_Domain, TinyURL, Prefix/Suffix,
        DNS_Record, Domain_Age, Domain_End, iFrame, Mouse_Over, Right_Click,
        Web_Forwards.
    """
    # Address bar based features (8)
    features = [
        havingIP(url),
        haveAtSign(url),
        getLength(url),
        getDepth(url),
        redirection(url),
        httpDomain(url),
        tinyURL(url),
        prefixSuffix(url),
    ]

    # Domain based features (3): a failed WHOIS lookup sets all three to 1.
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except Exception:
        dns = 1

    features.append(dns)
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))

    # HTML & Javascript based features (4): an unreachable page is passed on
    # as the empty string, which every helper below maps to 1.
    try:
        response = requests.get(url, timeout=request_timeout)
    except Exception:
        response = ""
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))

    return features

# converting the list to dataframe
def get_feature(link):
    """Return the extracted features of ``link`` as a one-row, 15-column frame."""
    feature_names = ['Have_IP', 'Have_At', 'URL_Length', 'URL_Depth', 'Redirection', 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over', 'Right_Click', 'Web_Forwards']
    feature_row = featureExtraction(link)
    features_hx = pd.DataFrame([feature_row], columns=feature_names)
    return features_hx.iloc[:, :15]

def url_phishing_predictor_hx(link, xgb):
    """Return the first prediction of ``xgb`` for the features of ``link``."""
    predictions = xgb.predict(get_feature(link))
    return predictions[0]


# Application (Loading Models to Test on Individual URLs)

In [130]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/DLW Hackathon'
#Feel free to change to your path to this folder

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/DLW Hackathon


Load Models

In [None]:
!pip install --no-cache-dir joblib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [146]:
# from fe_js import url_phishing_predictor_js
# from fe_wd import url_phishing_predictor_wd
# from fe_jl import url_phishing_predictor_jl
# from fe_hx import url_phishing_predictor_hx
from keras.models import load_model
import joblib

# Folder (relative to the working directory) that holds the saved models.
# NOTE(review): this name shadows the built-in dir(); it is left unchanged here.
dir = "Models/"

#JS
rf_imported_js = joblib.load(f"{dir}RandomForest")
lg_imported_js = joblib.load(f"{dir}LogisticRegression")
xgb_imported_js = joblib.load(f"{dir}XGBoost")
nn_imported_js = load_model(f"{dir}NeuralNet.h5")
pre_processor_imported_js = joblib.load(f"{dir}Preprocessor")

#JL
DT_imported_jl = joblib.load(f"{dir}DT")
RF_imported_jl = joblib.load(f"{dir}RF")
ET_imported_jl = joblib.load(f"{dir}ET")

#WD
LR_imported_wd = joblib.load(f"{dir}LR_WD")
SVC_imported_wd = joblib.load(f"{dir}SVC_WD")
NB_imported_wd = joblib.load(f"{dir}NB_WD")

#HX
XGB_imported_hx = joblib.load(f"{dir}XGBoostClassifierNew")

In [140]:
# One link per line of the file; trailing whitespace is stripped from each.
with open("Datasets/ALL-phishing-links.txt") as file:
    links_list = [line.rstrip() for line in file]

print(links_list[:5])

['ftp://188.128.111.33/IPTV/TV1324/view.html', 'ftp://188.128.111.33/web/sec.htm', 'ftp://me@createkindlebooks.org:Noobasshole@createkindlebooks.org/index.html', 'http://00000000000000000000000000000000000000000.xyz', 'http://00000000000000000000000000000000000000dfjjjhv.000webhostapp.com/Yahoo/YahooAttt/global/attverzon/login.php?.intl=us&.lang=en-US&https://login.yahoo.com/?.src=ym']


In [149]:
import warnings
warnings.filterwarnings('ignore')

def get_prediction_from_link(link):
    """Return the sum of the individual predictor outputs for ``link``.

    Four predictors currently contribute; the fifth entry is a placeholder
    that always adds 0.
    """
    votes = [
        url_phishing_predictor_js(link, pre_processor_imported_js, lg_imported_js, xgb_imported_js, rf_imported_js, nn_imported_js),
        url_phishing_predictor_wd(link, LR_imported_wd, SVC_imported_wd, NB_imported_wd),
        url_phishing_predictor_jl(link, DT_imported_jl, RF_imported_jl, ET_imported_jl),
        url_phishing_predictor_hx(link, XGB_imported_hx),
        0,  # Add SK Code here
    ]
    return sum(votes)

# Counters keyed by ensemble score (0-4); nothing below updates them yet.
dict_res = dict.fromkeys(range(5), 0)
# Print the ensemble score of the first ten links.
for link in links_list[:10]:
    link_score = get_prediction_from_link(link)
    print(link_score)








4
4
4
4
4
4
3
4
3
4


In [None]:
try:
    # Scan a QR code with the webcam and classify the decoded link.
    link = take_photo(show=1)
    # url_phishing_predictor_js is the predictor whose signature matches the
    # preprocessor and the four models passed here.
    if url_phishing_predictor_js(link, pre_processor_imported_js, lg_imported_js, xgb_imported_js, rf_imported_js, nn_imported_js):
        print("This is a Phishing Link! Do not click!")
    else:
        print("This link is safe. Feel free to enter.")

except Exception as err:
    # Errors will be thrown if the user does not have a webcam or if they do not
    # grant the page permission to access it.
    print(str(err))

<IPython.core.display.Javascript object>

KeyboardInterrupt: ignored