# Author: Olli Jokinen


- The main task is to extract data from a given url and create a set of features that could predict a potential malicious web page.
- Read also readme.md to get a high-level view on the project

In [56]:
import requests
import json
from urllib.parse import urlparse
import io
import pandas as pd
import re
import numpy as np
from bs4 import BeautifulSoup
import socket

## Define functions:
- Parse URL
- Extract data from urls https://input.payapi.io/v1/api/fraud/domain/ + "URL"
- Extractors return None, if there is no information or an error occurs
- Create dictionary per URL
- Create a dataset based on features of a given url
- Save a file to .csv format and read it back to Jupyter from a local disk

In [57]:
def get_domain_data(domain):
    show = "https://input.payapi.io/v1/api/fraud/domain/" + domain
    #return None if error occurs
    try:
        data = requests.get(show).json()
        return data
    except Exception:
        print("ERROR")
        return None

    
def parse_domain_from_url(url):
    t = urlparse(url).netloc
    return '.'.join(t.split('.')[-2:])

def get_ip(domain):
    try:
        ip = socket.gethostbyname(domain)
        return ip
    except Exception:
        return None
    
def get_age(data):
    return data["result"] if 'result' in data else np.nan

def get_is_ip_blacklisted(ip,ip_blacklist):
    #use IPSum for this
    try:
        return ip in ip_blacklist.index
    except Exception:
        return "EI OLE" #np.nan is an indicator that     

def get_is_ip_proxy(data):
    return data["isIpProxy"] if "isIpProxy" in data else np.nan

def get_country(data):
    return data["countryName"] if "countryName" in data else np.nan

def get_error(data):
    return data["error"] if "error" in data else np.nan

def count_redirects(url):
    #number of redirects
    try:
        response = requests.get(url, timeout=5)
        if response.history:
            return len(response.history)
        else:
            return 0
    except Exception:
        return np.nan
    
def url_to_fqdn(url):
    result = "".join(url.split("://")[1:]) #remove protocol
    result = result.split("/")[0] #remove path
    return result

In [58]:
# extract num_of_href_links
def num_of_hrefs(url):
    try:
        headers = requests.utils.default_headers()
        req = requests.get(url, headers)
        soup = BeautifulSoup(req.content, 'html.parser')
        links = [link for link in soup.find_all(href=True)] #find all hrefs
        
        return len(links)
    except Exception:
        return np.nan

In [59]:
#return features based on url
def features_from_url(url,ip_blacklist):
    
    #protocol
    protocol = url.split("://")[0] #http:// or https://
    
    #url to domain
    domain = parse_domain_from_url(url)

    #get data by domain: age, is ip proxy, is ip blacklisted
    domain_data = get_domain_data(domain)
    
    #fqdn
    fqdn = url_to_fqdn(url)
    
    fqdn_length = len(fqdn)
    
    
    #blacklists
    isIpProxy = get_is_ip_proxy(domain_data)
    
    ip = get_ip(domain)
    blacklist = get_is_ip_blacklisted(ip,ip_blacklist)
    
    
    #url length
    url_length = len(url)
    
    #number of dots in fqdn
    url_num_of_dots = fqdn.count(".")
    
    #web scrape: num of hrefs
    num_of_links = num_of_hrefs(url)
    
    #ownership details
    age = get_age(domain_data)
    country = get_country(domain_data)
    
    #num_of_redirects
    redirects = count_redirects(url)
    
    #error from input.payapi
    error = get_error(domain_data)

    
    
    #build a dict of url features
    url_dict = {
        'url': url,
        'protocol': protocol,
        'url_length' : url_length,
        'fqdn_length' : fqdn_length,
        'num_of_dots_fqdn' : url_num_of_dots,
        'num_of_links (href)' : num_of_links,
        'age' : age,
        'country' : country,
        'blacklist' : blacklist, #is ip found from a blacklist
        'isIpProxy' : isIpProxy, 
        'redirects' : redirects, #number of redirections
        'error' : error #error message from input.payapi
        
    }
    
    return url_dict

### Let's extract
- example_urls is the dataset. 
- features extracted from urls are merged together into a single dataframe

In [60]:
# list of urls -  from phishtank
example_urls = ["http://carnavalacabomaistemosessaslindas.com/2.2.19-beta/mobile/login.php?enc=1",
                "https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus",
                "http://cartaobndes.gov.br.cv31792.tmweb.ru/",
                "https://paypal.co.uk.yatn.eu/m/",
                "http://college-eisk.ru/cli/",
                "https://dotpay-platnosc3.eu/dotpay/",
                "https://yakbstains.cloudaccess.host/index.html",
                "https://muskdrop.vip",
                "http://unvisored-destinati.000webhostapp.com/",
                "http://gabungvideoviral77.000webhostapp.com/login.php",
                "https://www.google.com/url?sa=t&ectrans=1&cd=3&rct=j&cad=rja&usg=AOvVaw0_9EI52uq3vapizMNgByf0&url=http%3A//nextsteppk.com/courselist&q=&ved=2ahUKEwi-rviTieHnAhXF7eAKHVLFD6AQFjACegQIBhAB&source=web&uact=8&esrc=s",
                "https://www.tiktok.com/ar/",
                "https://www.disneyworld.eu/login/?returnUrl=/profile/",
                "https://dvla.co.uk-refund-form-feb27.info/",
                "http://getrufd.net/index.ph= p?id=3Dkoi@%cd%lkf.aspx?54fa654fa65a654fa6a654fa6a654fa6a654fa6"
               ]


In [61]:
#read blacklisted IPs
url="https://raw.githubusercontent.com/stamparm/ipsum/master/ipsum.txt"

ip_blacklist = pd.read_csv(url, skiprows=4, sep="\t", header=None)
ip_blacklist.columns = ["IP", "blacklist"]

#set IP as index -> faster to check if ip is in the blacklist
ip_blacklist = ip_blacklist.set_index('IP')

In [62]:
#set columns in advance to avoid automatic ordering of columns
columns =["url", 'protocol', 'url_length',"num_of_dots_fqdn","fqdn_length","num_of_links (href)","age","country","blacklist","isIpProxy","redirects","error"]
df = pd.DataFrame(columns=columns)


# merge features per url to a dataframe
for url in example_urls:
    new_row = features_from_url(url,ip_blacklist)
    df = df.append(new_row,ignore_index=True)

In [63]:
df

Unnamed: 0,url,protocol,url_length,num_of_dots_fqdn,fqdn_length,num_of_links (href),age,country,blacklist,isIpProxy,redirects,error
0,http://carnavalacabomaistemosessaslindas.com/2...,http,79,1,37,40.0,2.0,Singapore,False,False,0.0,
1,https://www.slideshare.net/weaveworks/client-s...,https,76,2,18,135.0,5078.0,United States,False,False,0.0,
2,http://cartaobndes.gov.br.cv31792.tmweb.ru/,http,43,5,35,,4993.0,Russia,False,False,,
3,https://paypal.co.uk.yatn.eu/m/,https,31,4,20,,,,False,,,NoDomainAgeData
4,http://college-eisk.ru/cli/,http,27,1,15,0.0,3061.0,Russia,False,False,0.0,
5,https://dotpay-platnosc3.eu/dotpay/,https,35,1,19,,,,False,,,NoDomainAgeData
6,https://yakbstains.cloudaccess.host/index.html,https,46,2,27,2.0,,,False,,0.0,InternalServerError
7,https://muskdrop.vip,https,20,1,12,,6.0,,False,,,
8,http://unvisored-destinati.000webhostapp.com/,http,45,2,37,8.0,1389.0,United States,False,False,0.0,
9,http://gabungvideoviral77.000webhostapp.com/lo...,http,53,2,36,8.0,1389.0,United States,False,False,0.0,


### Storing the data
- Use Pandas library
- import pandas as pd

##### Save to local as .csv
- df.to_csv("file_name.csv", sep="\t")
- tab (\t) as a delimiter

##### Read .csv file back to programming environment
- df_name = pd.read_csv("file_name.csv", sep="\t")
- you can also give parameters to give more detailed reading instructions
- see pandas documentation: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

In [64]:
#example:
df.to_csv("hoxhunt.csv",sep="\t")

In [65]:
#read from local
df_name = pd.read_csv("hoxhunt.csv",sep="\t")
df_name.head()

Unnamed: 0.1,Unnamed: 0,url,protocol,url_length,num_of_dots_fqdn,fqdn_length,num_of_links (href),age,country,blacklist,isIpProxy,redirects,error
0,0,http://carnavalacabomaistemosessaslindas.com/2...,http,79,1,37,40.0,2.0,Singapore,False,False,0.0,
1,1,https://www.slideshare.net/weaveworks/client-s...,https,76,2,18,135.0,5078.0,United States,False,False,0.0,
2,2,http://cartaobndes.gov.br.cv31792.tmweb.ru/,http,43,5,35,,4993.0,Russia,False,False,,
3,3,https://paypal.co.uk.yatn.eu/m/,https,31,4,20,,,,False,,,NoDomainAgeData
4,4,http://college-eisk.ru/cli/,http,27,1,15,0.0,3061.0,Russia,False,False,0.0,


### Afterword

- all seems ok - No errors and all the data is stored in a dataframe!
- See readme.md for future discussion.