## Importing libraries

In [1]:
# for querying and parsing domain registration (WHOIS) info.
# LINK=> https://pypi.org/project/python-whois/
!pip install python-whois



In [2]:
# https://docs.python.org/3/library/ipaddress.html
import ipaddress
# https://docs.python.org/3/library/urllib.request.html
import urllib.request
from urllib.parse import urlparse, urlencode
# disabling the urllib warnings
import urllib3
urllib3.disable_warnings()

# data scraping-> https://beautiful-soup-4.readthedocs.io/en/latest/
from bs4 import BeautifulSoup
import socket
import requests
# https://pypi.org/project/googlesearch-python/
from googlesearch import search
import whois
from datetime import date, datetime
import time
from dateutil.parser import parse as date_parse

import re
import random
import numpy as np
# data manipulation and analysis
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

C:\Users\sys.ai\anaconda3\lib\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll
C:\Users\sys.ai\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


## Feature Extraction

Next, we extract the features described in this [Paper](https://arxiv.org/pdf/2009.11116v1.pdf), sub-categorized as follows:

1. Url based features

2. DNS Records based features

3. Page Content/ html based features 

* Some (not all) of the functions are adapted from this [link](https://github.com/fafal-abnir/phishing_detection/blob/master/feature_extraction.py).


In [3]:
from urllib.parse import urlparse

# https://stackoverflow.com/questions/44113335/extract-domain-from-url-in-python
def _get_domain_from_url(url):
  ''' function to get domain name from url '''
  if not re.match(r"^https?", url): # checking if prefix like http and https are in url or not
    url = "http://" + url           # if not, appending http:// string from starting in url to avoid schema issue

  domain = urlparse(url).netloc
  return domain 

In [4]:
import logging
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# source: https://stackoverflow.com/a/68583332/5994461

class requests_fetcher:
  """Thin wrapper around a shared ``requests.Session`` with a large connection pool.

  ``get_response`` fetches one URL and returns the ``requests.Response``, or
  the empty string ``""`` when the request fails for any reason (callers in
  this notebook compare the result against ``""``).
  ``download`` fetches many URLs concurrently with a thread pool.
  """
  THREAD_POOL = 1024

  # This is how to create a reusable connection pool with python requests.
  # The adapter is mounted for both schemes: the notebook prefixes scheme-less
  # URLs with "http://", so mounting only "https://" left most requests on the
  # default (small, retry-less) adapter.
  session = requests.Session()
  session.mount(
      'https://',
      requests.adapters.HTTPAdapter(pool_maxsize=THREAD_POOL,
                                    max_retries=3,
                                    pool_block=True)
  )
  session.mount(
      'http://',
      requests.adapters.HTTPAdapter(pool_maxsize=THREAD_POOL,
                                    max_retries=3,
                                    pool_block=True)
  )

  def get_response(self, url, verify = False):
    """Fetch ``url`` with a 2 second timeout.

    url: absolute URL to request.
    verify: passed to ``requests``; the default ``False`` disables TLS
      certificate verification.
    returns: the ``requests.Response`` on success, ``""`` on any failure.
    """
    try:
      response = self.session.get(url, timeout = 2, verify = verify)
      if 500 <= response.status_code < 600:
        # server is overloaded? give it a break
        time.sleep(5)
    except Exception:
      # best effort: any request/connection/parsing error maps to the "" sentinel
      response = ""

    # Return the local value rather than re-reading self.response: when this
    # method runs on several threads (see download), another thread may have
    # overwritten the attribute in the meantime.
    self.response = response
    return response

  def download(self, urls, cl = None):
    """Fetch every URL in ``urls`` concurrently.

    urls: iterable of URL strings.
    cl: optional object whose ``get_response(url)`` is used instead of this
      instance's method.
    returns: list with one ``get_response`` result per URL, in input order.
    """
    fetcher = cl if cl else self
    with ThreadPoolExecutor(max_workers=self.THREAD_POOL) as executor:
      # wrap in a list() to wait for all requests to complete
      return list(executor.map(fetcher.get_response, urls))

### Url based features



**IP Address**

Phishing-> If the IP address is used instead of domain name address.

Legitimate-> Else.

In [5]:
def url_having_IP_Address(url):
  """Return 1 if the host part of ``url`` is a literal IPv4/IPv6 address, else 0.

  The whole string is tried first (so a bare ``"1.2.3.4"`` still works), then
  the hostname extracted from the URL. Passing the full URL straight to
  ``ipaddress.ip_address`` can never succeed once a scheme or path is present.
  """
  candidates = [url]
  if isinstance(url, str):
    try:
      # Without a scheme urlparse puts the host into `path`; a leading "//"
      # makes it parse the string as an authority instead.
      has_scheme = re.match(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://", url)
      host = urlparse(url if has_scheme else "//" + url).hostname
    except ValueError:  # e.g. unbalanced IPv6 brackets
      host = None
    if host:
      candidates.append(host)

  for candidate in candidates:
    try:
      ipaddress.ip_address(candidate)
      return 1
    except (ValueError, TypeError):
      continue
  return 0

**Length**

Phisher can use long URL to hide the query i.e. doubtful part.

In [6]:
def url_length(url):
  """Return the number of characters in ``url``."""
  n_chars = len(url)
  return n_chars

**Shortening Service**

Phishers can hide a long or suspicious URL behind a link from a URL-shortening service (e.g. bit.ly, tinyurl).


In [7]:
# Regex alternation of known URL-shortening service domains (used with re.search).
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)

In [8]:
def url_sortining_service(url):
  """Return 1 if ``url`` is hosted on a known URL-shortening service, else 0.

  The ``shortening_services`` alternation is matched against the hostname
  only, anchored to the whole host or a trailing ``.domain`` part. Searching
  the full URL without anchors flags unrelated hosts that merely contain a
  short pattern, e.g. ``microsoft.com`` (contains ``t.co``) or ``box.com``
  (contains ``x.co``).
  """
  try:
    # A leading "//" lets urlparse treat a scheme-less string as host[/path].
    has_scheme = re.match(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://", url)
    host = urlparse(url if has_scheme else "//" + url).hostname or ""
  except ValueError:  # malformed authority, e.g. unbalanced IPv6 brackets
    host = ""

  # urlparse lower-cases the hostname, while the list has mixed-case entries
  # ("BudURL.com", "Just.as"), hence IGNORECASE.
  anchored = r"(?:^|\.)(?:" + shortening_services + r")$"
  return 1 if re.search(anchored, host, re.IGNORECASE) else 0

**Depth of url**

Calculation of number of sub pages based on '/'.



In [9]:
def url_depth(url):
  """Count the non-empty '/'-separated segments in the path of ``url``."""
  segments = urlparse(url).path.split('/')
  return sum(1 for segment in segments if segment)

**"@" Symbol**

Phishing-> If @ symbol is in url

Legitimate-> Else

In [10]:
def url_having_at_symbol(url):
  """Return 1 when ``url`` contains an '@' character, otherwise 0."""
  return int("@" in url)

**"//" in URL**

Used to redirect to another website. This comes under Redirection [attack](https://www.virtuesecurity.com/kb/url-redirection-attack-and-defense/).

Phishing-> If "//" is in URL

Legitimate-> Else.

In [11]:
def is_redirected(url):
  """Return the index of the last '//' in ``url``, or -1 if there is none.

  ``str.rfind`` returns the highest index at which the substring starts.
  ref. https://www.tutorialspoint.com/python/string_rfind.htm
  """
  last_double_slash = url.rfind('//')
  return last_double_slash

In [12]:
import collections

def top_frequent_values(urls, n):
  """Print the frequency table of ``urls`` and its ``n`` most common entries.

  urls: iterable of hashable values (here: per-URL measurements).
  n: how many of the most frequent values to report.
  returns: dict mapping each of the ``n`` most common values to its count, so
    the result can be reused instead of only being printed.
  """
  counter = collections.Counter(urls)
  print(counter)

  top_counts = dict(counter.most_common(n))
  # Report the requested n; the label used to say "five" whatever n was.
  print(f"Top {n} highest frequent values: ", top_counts)
  return top_counts

Analyzing which position threshold to use for the `url_redirection` measure so that it helps distinguish phishing from legitimate websites.


In [13]:
def url_redirection(url):
  """Return 1 if ``url`` contains a '//' beyond its own scheme separator, else 0.

  The '//' of ``http://`` sits at index 5 and that of ``https://`` at index 6,
  so comparing the last '//' position with ``>= 5`` marked every URL that
  carries a scheme. The scheme prefix, when present, is now skipped before
  looking for a further '//'.
  """
  scheme = re.match(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://", url)
  search_from = scheme.end() if scheme else 0
  return 1 if url.find('//', search_from) != -1 else 0

**Prefix Suffix**

Phishers tend to add prefixes or suffixes separated by (-) to the domain name so that users feel that they are dealing with a legitimate website. For example http://www.Confirme-paypal.com

Phishing-> If yes

Legitimate-> Else.

In [14]:
from urllib.parse import urlparse

In [15]:
def url_prefix_suffix(url):
  """Return 1 if the network-location part of ``url`` contains a '-', else 0.

  ``urlparse(url).netloc`` is the host[:port] portion, see
  https://docs.python.org/3/library/urllib.parse.html
  """
  host_part = urlparse(url).netloc
  return int('-' in host_part)

**HTTPS token**

Phishing-> If url contains https token

Legitimate-> Else.

In [16]:
def url_http_domain(url):
      """Return 1 when ``url`` starts with the literal prefix ``https://``, else 0.

      NOTE(review): the accompanying text describes an "https token" feature;
      this function only inspects the scheme prefix - confirm that is the
      intended meaning.
      """
      starts_with_https = re.match(r"^https://", url) is not None
      return 1 if starts_with_https else 0

**Using non standard port**

In [17]:
def url_standard_port(url):
    """Return 1 if ``url`` names an explicit non-standard port, else 0.

    A port counts as standard when it is absent, or equals the default for the
    URL's scheme (80 for http, 443 for https; either one when the scheme is
    missing or unknown). Splitting on ':' and taking element 1 returned the
    '//host...' remainder for any URL with a scheme, so the feature was 1 for
    practically every input.

    Unparseable ports (non-numeric / out of range) return 0, matching the
    previous fallback on errors.
    """
    default_ports = {"http": 80, "https": 443}
    try:
        # A leading "//" lets urlparse read "host:port/..." without a scheme.
        has_scheme = re.match(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://", url)
        parsed = urlparse(url if has_scheme else "//" + url)
        port = parsed.port  # raises ValueError for an invalid port
    except (ValueError, TypeError):
        return 0

    if port is None:
        return 0
    expected = default_ports.get(parsed.scheme)
    if expected is None:
        return 0 if port in default_ports.values() else 1
    return 0 if port == expected else 1

**Google Index**

Run a Google search for the URL and inspect the first 5 indexed results.

If any returned link contains the word "phishing" -> Phishing

Else -> Legitimate

In [18]:
def url_google_index(url):
  """Search Google for ``url`` and look for the word 'phishing' in the results.

  returns: 1 if any of the first 5 result links contains 'phishing'
    (case-insensitive), 0 if none does, -1 if the search could not be run.

  NOTE(review): the keyword arguments used here (``query``, ``stop``,
  ``user_agent``) and ``get_random_user_agent`` look like the API of the
  ``google`` package's ``googlesearch`` module, while the import cell links to
  ``googlesearch-python`` - confirm which package is installed.
  """
  try:
    # Only `search` is imported at module level, so the name `googlesearch`
    # was unbound and every call fell into the -1 branch via NameError.
    import googlesearch
    sites = list(search(query = url, stop = 5, user_agent = googlesearch.get_random_user_agent()))
    return 1 if any('phishing' in site.lower() for site in sites) else 0
  except Exception:
    # best effort: network errors, rate limiting or API mismatches -> "unknown"
    return -1

> **Creating a url_feature_extractor function**

In [19]:
def url_based_feature_extractor(url):
  """Build the URL-based feature row for ``url``.

  returns: list starting with the URL itself, followed by one value per
    URL-based feature, in the order expected by ``url_col_names``.
  """
  return [
      url,
      url_having_IP_Address(url),
      url_having_at_symbol(url),
      url_length(url),
      url_depth(url),
      url_redirection(url),
      url_http_domain(url),
      url_sortining_service(url),
      url_prefix_suffix(url),
      url_standard_port(url),
      url_google_index(url),
  ]

In [20]:
# column names for dataframe 
url_col_names = [
    'url',
    'url_having_IP_Address',
    'url_having_at_symbol',
    'url_length',
    'url_depth',
    'url_redirection',
    'url_http_domain',
    'url_sortining_service',
    'url_prefix_suffix',
    'url_standard_port',
    'url_google_index',
]

In [21]:
def extract_url_features(url):
    """Return a one-row DataFrame with the URL-based features of ``url``.

    A URL that does not start with "http"/"https" is prefixed with "http://".
    The sibling extract_* functions apply the same check, which keeps the
    'url' column identical across their frames.
    """
    has_http_prefix = re.match(r"^https?", url)
    if not has_http_prefix:
        url = "http://" + url
    row = url_based_feature_extractor(url)
    return pd.DataFrame([row], columns = url_col_names)

### DNS Record based features


**Age of Domain**

This feature can be extracted from WHOIS database. Most phishing websites live for a short period of time. The minimum age of the legitimate domain is considered to be 6 months for this project. 

If age of domain < 6 months, the value of this feature is 1 (phishing) 

Else 0 (legitimate).

In [22]:
from datetime import date

def domain_age(whois_response, min_age_days = 180):
    """Return 0 if the domain is at least ``min_age_days`` old, else 1.

    whois_response: object exposing ``creation_date`` as a ``datetime``, a
      ``date``, or a list of them (the first entry is used).
    min_age_days: minimum age for a domain to count as legitimate
      (default 180, i.e. about 6 months).
    returns: 0 (legitimate) or 1 (phishing). Any missing or unusable creation
      date yields 1.
    """
    try:
      registration_date = whois_response.creation_date
      if isinstance(registration_date, (list, tuple)):
        # several dates reported: use the first one as the registration date
        registration_date = registration_date[0]
      if isinstance(registration_date, datetime):
        # plain `date` values have no .date(); only datetimes need converting
        registration_date = registration_date.date()

      # No abs(): a creation date in the future is not evidence of an old domain.
      age_days = (date.today() - registration_date).days
    except Exception:
      return 1

    return 0 if age_days >= min_age_days else 1


 **Domain Registration Length**

In [23]:
# dissimilar to domain age, this feature uses expiration date 

def domain_registration_length(whois_reponse):
  """Return 1 if the domain expires within one year (or is unknown), else 0.

  whois_reponse: object exposing ``expiration_date`` as a ``datetime``, a
    ``date``, or a list of them (the first entry is used). The parameter keeps
    its original spelling so existing keyword callers continue to work.
  returns: 0 (legitimate) when more than 365 days remain, otherwise 1.
  """
  try:
    # The body used to read `whois_response`, a name that is not the
    # parameter, so the lookup failed and the function always returned 1.
    expiration_date = whois_reponse.expiration_date
    if isinstance(expiration_date, (list, tuple)):
      expiration_date = expiration_date[0]
    if isinstance(expiration_date, datetime):
      expiration_date = expiration_date.date()

    # No abs(): a domain that expired long ago must not look long-lived.
    remaining_days = (expiration_date - date.today()).days
  except Exception:
    return 1

  return 1 if remaining_days / 365 <= 1 else 0

**Statistical Report**

If host belongs to ip_match i.e. top IPs selected based on statistical measures from Phishtank, it is Phishing

Else-> Legitimate

In [24]:
# Host fragments flagged as suspicious (searched anywhere in the URL).
_STATISTICAL_REPORT_URL_PATTERN = re.compile(
    r'at\.ua|usa\.cc|baltazarpresentes\.com\.br|pe\.hu|esy\.es|hol\.es|sweddy\.com|myjino\.ru|96\.lt|ow\.ly')

# IPs flagged as suspicious. Kept as a set so an address must match exactly;
# a regex search also hit longer addresses (e.g. "154.72.9.51" contains "54.72.9.51").
_STATISTICAL_REPORT_IPS = frozenset({
    '146.112.61.108', '213.174.157.151', '121.50.168.88', '192.185.217.116', '78.46.211.158',
    '181.174.165.13', '46.242.145.103', '121.50.168.40', '83.125.22.219', '46.242.145.98',
    '107.151.148.44', '107.151.148.107', '64.70.19.203', '199.184.144.27', '107.151.148.108',
    '107.151.148.109', '119.28.52.61', '54.83.43.69', '52.69.166.231', '216.58.192.225',
    '118.184.25.86', '67.208.74.71', '23.253.126.58', '104.239.157.210', '175.126.123.219',
    '141.8.224.221', '10.10.10.10', '43.229.108.32', '103.232.215.140', '69.172.201.153',
    '216.218.185.162', '54.225.104.146', '103.243.24.98', '199.59.243.120', '31.170.160.61',
    '213.19.128.77', '62.113.226.131', '208.100.26.234', '195.16.127.102', '195.16.127.157',
    '34.196.13.28', '103.224.212.222', '172.217.4.225', '54.72.9.51', '192.64.147.141',
    '198.200.56.183', '23.253.164.103', '52.48.191.26', '52.214.197.72', '87.98.255.18',
    '209.99.17.27', '216.38.62.18', '104.130.124.96', '47.89.58.141', '54.86.225.156',
    '54.82.156.19', '37.157.192.102', '204.11.56.48', '110.34.231.42',
})


def statistical_report(url, domain):
    """Return 1 if the URL or its resolved IP is on the suspicious lists, else 0.

    url: full URL, searched for the suspicious host fragments.
    domain: host name resolved with ``socket.gethostbyname``.
    returns: 1 when the URL matches, when the IP is listed, or when the host
      cannot be resolved; 0 otherwise. (The failure branch previously
      evaluated ``1`` without returning it, so the function returned None.)
    """
    # Check the URL first: it needs no network and must not be lost when DNS fails.
    if _STATISTICAL_REPORT_URL_PATTERN.search(url):
        return 1
    try:
        ip_address = socket.gethostbyname(domain)
    except Exception:
        # unresolvable host is treated as suspicious
        return 1
    return 1 if ip_address in _STATISTICAL_REPORT_IPS else 0

> **Creating a domain_feature_extractor function**

In [25]:
import logging
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# source: https://stackoverflow.com/a/68583332/5994461

# Upper bound on worker threads for the ThreadPoolExecutor in domain_based_feature_extractor.
THREAD_POOL = 1024

  
def get_domain_based_features(url):
  """Build the domain/WHOIS based feature row for ``url``.

  returns: list ``[url, dns_record, domain_age, domain_registration_length,
    statistical_report]``. ``dns_record`` is 1 when the WHOIS lookup failed,
    in which case the two WHOIS-derived features are set to 1 as well.
  """
  # Parse the domain separately from the WHOIS lookup: a WHOIS failure used to
  # reset the domain to '', so statistical_report resolved an empty host name.
  try:
    domain = _get_domain_from_url(url)
  except Exception:
    domain = ''

  dns_record = 0
  try:
    whois_response = whois.whois(domain)
  except Exception:
    whois_response = ''
    dns_record = 1

  features = [url, dns_record]
  features.append(1 if dns_record == 1 else domain_age(whois_response))
  features.append(1 if dns_record == 1 else domain_registration_length(whois_response))
  features.append(statistical_report(url, domain))
  return features

def domain_based_feature_extractor( urls):
    """Compute the domain feature row of every URL in ``urls`` on a thread pool.

    returns: list of rows from ``get_domain_based_features``, in input order.
    """
    pool = ThreadPoolExecutor(max_workers=THREAD_POOL)
    with pool:
        # list() drains the lazy map, i.e. waits for all lookups to complete
        rows = list(pool.map(get_domain_based_features, urls))
    return rows

In [26]:
# column names for dataframe 
domain_col_names = [
    'url',
    'dns_record',
    'domain_age',
    'domain_registration_length',
    'statistical_report',
]

In [27]:
def extract_domain_features(url):
    """Return a one-row DataFrame with the domain-based features of ``url``.

    A URL that does not start with "http"/"https" is prefixed with "http://".
    The sibling extract_* functions apply the same check, which keeps the
    'url' column identical across their frames.
    """
    if re.match(r"^https?", url) is None:
       url = "http://" + url
    rows = domain_based_feature_extractor([url])
    return pd.DataFrame(rows, columns = domain_col_names)

### Page content/ HTML based features

**Status bar customization(Mouse over)**

In [28]:
# checks the effect of mouse over on status bar (Mouse_Over)
def _page_mouse_over(response): 
  if response == "" :
    return 1
  else:
    if re.findall("<script>.+onmouseover.+</script>", response.text):
      return 1
    else:
      return 0

**Disabling Right Click**

In [29]:
def _page_right_click_disable(response):
    if response == "":
      return 1
    else:
     if re.findall(r"event.button ?== ?2", response.text):
       return 0
     else:
       return 1

**Favicon**

In [1]:
def _page_favicon(soup, domain, url):  
  if soup == -999:
      return 1
  else:
      try:
        for head in soup.find_all('head'):
            for head.link in soup.find_all('link', href=True):
                dots = [x.start(0)
                        for x in re.finditer('\.', head.link['href'])]
                if url in head.link['href'] or len(dots) == 1 or domain in head.link['href']:
                    return 0
                else:
                    return 1
      except StopIteration:
          pass

**Using pop-up window**

In [31]:
 def _page_pop_up(response): 
    if response == "":
        return 1
    else:
        if re.findall(r"alert\(", response.text):
            return 0
        else:
            return 1

**IFrame**

In [32]:
def _page_iframe(response):
  if response == "":
    return 1
  else:
    if re.findall(r'[<iframe>|<frameBorder>]', response.text):
      return 0
    else:
      return 1

**Website forwarding**

We find that legitimate websites have been redirected one time max. On the other hand, phishing websites containing this feature have been redirected at least 4 times.

In [33]:
def _page_website_forwarding(response):
  if response == "":
    return 1
  else:
    if len(response.history) <= 2:
      return 0
    else:
      return 1

**Link pointing to the page**

In [34]:
def _link_pointing_to_the_page(response):
    if response == "":
        return 1
    else:
        number_of_links = len(re.findall(r"<a href=", response.text))
        if number_of_links == 0:
            return 0
        elif number_of_links <= 2:
            return -1
        else:
           return 1

**Submitting Information to email**

In [35]:
def _submit_email(response):
  if response == "":
        return 1
  else:
      if re.findall(r"[mail\(\)|mailto:?]", response.text):
          return 1
      else:
          return 0

**Website Redirection Count**

In [36]:
def _redirection_count(response):
    if response == "":
        return 1
    else:
        if len(response.history) <= 1: # https://www.geeksforgeeks.org/response-history-python-requests/
            return 1
        elif len(response.history) <= 4:
            return -1
        else:
            return 0

**Server Form Handler (SFH)**

If a form's `action` is empty or `about:blank` -> Phishing

Elif the `action` refers to a different domain -> Suspicious

Else -> Legitimate

In [37]:
def _server_from_handler(soup, domain, url):
    if soup == -999:
      return 1
    elif len(soup.find_all('form', action=True))==0:
      return 0
    else :
      for form in soup.find_all('form', action=True):
          if form['action'] == "" or form['action'] == "about:blank":
              return 1
          elif url not in form['action'] and domain not in form['action']:
              return -1
          else:
              return 0

**SSL State**

In [38]:
def _ssl_state(response):
  if response == "":
    return 1
  else:
    return 0

> **Creating a page_feature_extractor function**

In [39]:
class page_response_request(requests_fetcher):
  """Fetcher whose ``get_response`` returns page-content features instead of the raw response.

  The returned row is ``[url, mouse_over, right_click_disable, pop_up, iframe,
  website_forwarding, link_pointing_to_the_page, submit_email,
  redirection_count]``.
  """
  curr_itr =0

  def get_response(self, url, verify = False):
    """Fetch ``url`` and return its page-feature row.

    The row is built from local variables: keeping the per-request state only
    on ``self`` let concurrent calls (``download`` runs this method on a
    thread pool) overwrite each other's response and feature list.
    ``self.response`` / ``self.features`` are still set afterwards for code
    that reads them.
    """
    response = super().get_response(url, verify = verify)
    features = [url] + self._page_features_for(response)
    self.response = response
    self.features = features

    # progress indicator every 500 processed URLs
    self.curr_itr += 1
    if(self.curr_itr%500==0):
      print('i: ', self.curr_itr)
    return features

  @staticmethod
  def _page_features_for(response):
    """Compute the eight response-based page features for one response."""
    return [
        _page_mouse_over(response),
        _page_right_click_disable(response),
        _page_pop_up(response),
        _page_iframe(response),
        _page_website_forwarding(response),
        _link_pointing_to_the_page(response),
        _submit_email(response),
        _redirection_count(response),
    ]

  def page_features(self):
    """Append the page features of ``self.response`` to ``self.features``."""
    self.features.extend(self._page_features_for(self.response))

In [40]:
page_col_names = [
    'url',
    'page_mouse_over',
    'page_right_click_disable',
    'page_pop_up',
    'page_iframe',
    'page_website_forwarding',
    'link_pointing_to_the_page',
    'submit_email',
    'redirection_count',
    'server_from_handler',
    'page_favicon',
    'ssl_state',
]

In [41]:
# When run through the executor, the features/urls came out repeated, so the soup-based features are computed in a separate function.

def soupbased(url):
  """Fetch ``url`` and compute the features that need the parsed HTML.

  returns: ``[server_from_handler, page_favicon, ssl_state]``. When the page
    cannot be fetched or parsed, the first two are 1 and ``ssl_state`` is
    computed from the "" failure sentinel.
  """
  fetcher = requests_fetcher()
  try:
    response = fetcher.get_response(url)
    # a failed fetch yields "", whose missing .text lands in the except branch
    soup = BeautifulSoup(response.text, 'html.parser')
  except:
    soup, response = -999, ""

  domain = _get_domain_from_url(url)
  form_handler = 1 if soup == -999 else _server_from_handler(soup, domain = domain, url = url)
  favicon = 1 if soup == -999 else _page_favicon(soup, domain= domain, url = url)
  return [form_handler, favicon, _ssl_state(response)]

In [42]:
# Module-level fetcher instance; extract_page_features calls its get_response for every URL.
page_requests_obj = page_response_request()

def extract_page_features(url):
    """Return a one-row DataFrame with the page-content features of ``url``.

    A URL that does not start with "http"/"https" is prefixed with "http://".
    The sibling extract_* functions apply the same check, which keeps the
    'url' column identical across their frames.
    """
    if re.match(r"^https?", url) is None:
       url = "http://" + url
    row = page_requests_obj.get_response(url)
    # extend the response-based row in place with the soup-based features
    row += soupbased(url)
    return pd.DataFrame([row], columns = page_col_names)

### Generating features

In [1]:
def extract_all_features(url):
    """Return one DataFrame row with the URL, domain and page features of ``url``.

    The URL is normalised once, before the three extractors run, so that all
    of them see the same string and the merges on 'url' line up:
    surrounding whitespace is removed, an existing ``http(s)://`` prefix is
    lower-cased, and a scheme-less input gets ``http://`` prepended. The
    complete ``scheme://`` prefix is required here because checking only for
    a leading "http" leaves hosts such as ``httpbin.org`` without a scheme.
    """
    url = url.strip()
    scheme = re.match(r"^https?://", url, re.IGNORECASE)
    if scheme:
        url = scheme.group(0).lower() + url[scheme.end():]
    else:
        url = "http://" + url

    url_df = extract_url_features(url)
    domain_df = extract_domain_features(url)
    page_df = extract_page_features(url)
    df = url_df.merge(domain_df, on='url')
    df = df.merge(page_df, on='url')
    return df