# Author: Olli Jokinen


- The main task is to extract data from a given url and create a set of features that could predict potential malicious web page.
- Read also readme.md to get a high-level view on the project

In [362]:
import requests
import json
from urllib.parse import urlparse
import io
import pandas as pd
import re
from urllib import parse

## Define functions:
- Parse urls
- extract data with https://input.payapi.io/v1/api/fraud/domain/
- extractors give None if there is no information or an error occurs
- create dictionary per url
- create a dataset based on features of a given url -> merge all together
- save file into .csv format and read it back from local to jupyter

In [364]:
def get_domain_data(domain):
    show = "https://input.payapi.io/v1/api/fraud/domain/" + domain
    #handle errors
    try:
        data = requests.get(show).json()
        return data
    except Exception:
        print("ERROR")
        return None
    
def parse_domain_from_url(url):
    t = urlparse(url).netloc
     #avoid wrong format e.g. https://www.slideshare.net/.... -> slideshare.net instead of www.slideshare.net
    return '.'.join(t.split('.')[-2:])

def get_ip(data):
    return data["ip"] if "ip" in data else None

def get_age(data):
    return data["result"] if 'result' in data else None

def get_is_ip_blacklisted(data):
    return data["isIpBlacklisted"] if "isIpBlacklisted" in data else None

def get_is_ip_proxy(data):
    return data["isIpProxy"] if "isIpProxy" in data else None

def get_country(data):
    return data["countryName"] if "countryName" in data else None

def get_error(data):
    return data["error"] if "error" in data else None

def count_redirects(url):
    #number of redirects
    try:
        response = requests.get(url, timeout=5)
        if response.history:
            return len(response.history)
        else:
            return 0
    except Exception:
        return '?'
    
def url_to_fqdn(url):
    result = "".join(url.split("://")[1:]) #remove protocol
    result = result.split("/")[0] #remove path
    return result

In [365]:
#return several values based on url
def dict_from_url(url):
    
    #protocol
    protocol = url.split("://")[0] #http:// or https://
    
    #url to domain
    domain = parse_domain_from_url(url)

    #get data by domain: age, is ip proxy, is ip blacklisted
    domain_data = get_domain_data(domain)
    
    #fqdn
    fqdn = url_to_fqdn(url)
    
    fqdn_length = len(fqdn)
    
    
    #blacklists
    isIpProxy = get_is_ip_proxy(domain_data)
    isIpBlacklisted = get_is_ip_blacklisted(domain_data)
    
    
    #url length
    url_length = len(url)
    
    #number of dots in fqdn
    url_num_of_dots = fqdn.count(".")
    
    #ownership details
    age = get_age(domain_data)
    country = get_country(domain_data)
    
    #num_of_redirects
    redirects = count_redirects(url)
    
    #error
    error = get_error(domain_data)

    
    
    #build a dict of url features
    url_dict = {
        'url': url,
        'protocol': protocol,
        'url_length' : url_length,
        'fqdn_length' : fqdn_length,
        'num_of_dots_fqdn' : url_num_of_dots,
        'age' : age,
        'country' : country,
        'isIpBlacklisted' : isIpBlacklisted ,
        'isIpProxy' : isIpProxy,
        'redirects' : redirects,
        'error' : error
        
    }
    
    return url_dict

### Let's extract
- example_urls is the dataset. 
- features extracted from urls are merged together into a single dataframe

In [366]:
# Note some of these urls are live phishing sites (as of 2019-03-21) use with caution! More can be found at https://www.phishtank.com/
example_urls = ["https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus",
                "http://cartaobndes.gov.br.cv31792.tmweb.ru/",
                "https://paypal.co.uk.yatn.eu/m/",
                "http://college-eisk.ru/cli/",
                "https://dotpay-platnosc3.eu/dotpay/",
                "https://yakbstains.cloudaccess.host/index.html",
                "https://muskdrop.vip",
                "http://unvisored-destinati.000webhostapp.com/",
                "http://gabungvideoviral77.000webhostapp.com/login.php",
                "https://www.google.com/url?sa=t&ectrans=1&cd=3&rct=j&cad=rja&usg=AOvVaw0_9EI52uq3vapizMNgByf0&url=http%3A//nextsteppk.com/courselist&q=&ved=2ahUKEwi-rviTieHnAhXF7eAKHVLFD6AQFjACegQIBhAB&source=web&uact=8&esrc=s",
                "https://www.tiktok.com/ar/",
                "https://www.disneyworld.eu/login/?returnUrl=/profile/",
                "https://dvla.co.uk-refund-form-feb27.info/",
                "http://getrufd.net/index.ph= p?id=3Dkoi@%cd%lkf.aspx?54fa654fa65a654fa6a654fa6a654fa6a654fa6"
               ]


In [367]:
columns =["url", 'protocol', 'url_length',"num_of_dots_fqdn","fqdn_length","age","country","isIpBlacklisted","isIpProxy","redirects","error"]
df = pd.DataFrame(columns=columns)
for url in example_urls:
    new_row = dict_from_url(url)
    df = df.append(new_row,ignore_index=True)

In [368]:
df

Unnamed: 0,url,protocol,url_length,num_of_dots_fqdn,fqdn_length,age,country,isIpBlacklisted,isIpProxy,redirects,error
0,https://www.slideshare.net/weaveworks/client-s...,https,76,2,18,5077.0,United States,False,False,0,
1,http://cartaobndes.gov.br.cv31792.tmweb.ru/,http,43,5,35,4991.0,Russia,False,False,?,
2,https://paypal.co.uk.yatn.eu/m/,https,31,4,20,,,,,?,NoDomainAgeData
3,http://college-eisk.ru/cli/,http,27,1,15,3059.0,Russia,False,False,0,
4,https://dotpay-platnosc3.eu/dotpay/,https,35,1,19,,,,,?,NoDomainAgeData
5,https://yakbstains.cloudaccess.host/index.html,https,46,2,27,,,,,0,InternalServerError
6,https://muskdrop.vip,https,20,1,12,4.0,,,,?,
7,http://unvisored-destinati.000webhostapp.com/,http,45,2,37,1387.0,United States,False,False,0,
8,http://gabungvideoviral77.000webhostapp.com/lo...,http,53,2,36,1387.0,United States,False,False,0,
9,https://www.google.com/url?sa=t&ectrans=1&cd=3...,https,210,2,14,8201.0,United States,False,False,0,


### Storing the data
- Use Pandas library
- import pandas as pd

##### Save to local as .csv
- df.to_csv("file_name.csv", sep="\t")
- tab as a delimiter

##### Read .csv file back to programming environment
- df_name = pd.read_csv("file_name.csv", sep="\t")
- you can also give parameters to give more detailed reading instructions
- see pandas documentation: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html

In [356]:
#example:
df.to_csv("hoxhunt.csv",sep="\t")

In [358]:
#read from local
df_name = pd.read_csv("hoxhunt.csv",sep="\t")
df_name.head()

Unnamed: 0.1,Unnamed: 0,url,protocol,url_length,num_of_dots_fqdn,fqdn_length,age,country,isIpBlacklisted,isIpProxy,redirects,error
0,0,https://www.slideshare.net/weaveworks/client-s...,https,76,2,18,5077.0,United States,False,False,0,
1,1,http://cartaobndes.gov.br.cv31792.tmweb.ru/,http,43,5,35,4991.0,Russia,False,False,?,
2,2,https://paypal.co.uk.yatn.eu/m/,https,31,4,20,,,,,?,NoDomainAgeData
3,3,http://college-eisk.ru/cli/,http,27,1,15,3059.0,Russia,False,False,0,
4,4,https://dotpay-platnosc3.eu/dotpay/,https,35,1,19,,,,,?,NoDomainAgeData


In [361]:
#all seems ok! See readme.md for future discussion.