In [75]:
import pandas as pd
# Load the URL dataset; the preview below shows the columns
# 'Unnamed: 0', 'url' and 'result'.
allUrls = pd.read_csv(filepath_or_buffer="allUrls.csv")
allUrls.head(n=5)

Unnamed: 0,url,result
0,http://intego3.info/EXEL/index.php,1
1,https://www.mathopenref.com/segment.html,0
2,https://www.computerhope.com/issues/ch000254.htm,0
3,https://www.investopedia.com/terms/n/next-elev...,0
4,https://jobs.emss.org.uk/lcc.aspx,0


In [76]:
# Load the list of known-legitimate domains; the preview below shows the
# columns 'Unnamed: 0' and 'url'.
legitUrls = pd.read_csv(filepath_or_buffer="legitimate.csv")
legitUrls.head(n=5)

Unnamed: 0,url
0,google.com
1,youtube.com
2,facebook.com
3,netflix.com
4,microsoft.com


In [77]:
# importing required packages for this section
from urllib.parse import urlparse,urlencode
import ipaddress
import re

In [78]:
# 0.Domain of the URL (Domain) 
def getDomain(url):
  """Return the host name of ``url`` with one leading ``www.`` removed.

  Args:
    url: URL string to inspect.

  Returns:
    The host name reported by ``urlparse(url).hostname`` without a leading
    ``www.`` label, or an empty string when no host can be parsed (for
    example a URL that lacks the ``scheme://`` prefix).
  """
  domain = urlparse(url).hostname
  if not domain:
    # hostname is None when the URL has no network location; calling
    # string/regex functions on it would raise TypeError.
    return ""
  # Strip only the leading label. str.replace would also delete any later
  # "www." inside the host (e.g. "www.awww.com" -> "acom").
  if domain.startswith("www."):
    domain = domain[len("www."):]
  return domain

In [79]:
# 1.Checks for IP address in URL (Have_IP)
def havingIP(url):
  """Tell whether the host part of ``url`` is a literal IP address.

  Args:
    url: URL string to inspect.

  Returns:
    1 if ``urlparse(url).hostname`` parses as an IPv4 or IPv6 address,
    otherwise 0 (including when the URL has no host or cannot be parsed).
  """
  try:
    ipaddress.ip_address(urlparse(url).hostname)
  except ValueError:
    # ip_address raises ValueError for anything that is not an IP address
    # (including None); urlparse raises it for malformed bracketed hosts.
    # Catching only ValueError keeps KeyboardInterrupt and real bugs visible.
    return 0
  return 1

In [80]:
# 2.Checks the presence of @ in URL (Have_At)
def haveAtSign(url):
  """Return 1 when the URL string contains an '@' character, else 0."""
  return int("@" in url)

In [81]:
# 3.Finding the length of URL and categorizing (URL_Length)
def getLength(url):
  """Categorize the URL by length: 0 below 54 characters, 1 otherwise."""
  return 0 if len(url) < 54 else 1

In [82]:
# 4. Counts the non-empty path segments in the URL (URL_Depth)
def getDepth(url):
  """Count the non-empty '/'-separated segments in the path of ``url``."""
  segments = urlparse(url).path.split('/')
  # Empty strings come from leading, trailing or doubled slashes; skip them.
  return sum(1 for segment in segments if segment)

In [83]:
# 5. Checking for redirection '//' in the url (Redirection)
def redirection(url):
  """Return 1 when the last '//' in ``url`` starts after index 7, else 0.

  The '//' that follows 'http:' or 'https:' sits at index 5 or 6, so a later
  occurrence is treated as an embedded redirection.
  """
  last_double_slash = url.rfind('//')
  return 1 if last_double_slash > 7 else 0

In [84]:
# 6. Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
  """Return 1 when the network-location part of ``url`` contains 'https'."""
  netloc = urlparse(url).netloc
  return int('https' in netloc)

In [85]:
#listing shortening services
# Regular expression matching known URL-shortening domains.
#  * The look-behind/look-ahead require the match to start and end on a
#    host-label boundary, so "t.co" no longer matches inside "microsoft.com"
#    and "x.co" no longer matches inside "netflix.com".
#  * The scoped (?i:...) flag makes matching case-insensitive, so mixed-case
#    entries such as BudURL.com also match lower-case URLs.
#  * Exact duplicate alternatives from the original list were removed.
shortening_services = (
    r"(?<![A-Za-z0-9-])(?i:"
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|bitly\.com|cur\.lv|tinyurl\.com|ity\.im|q\.gs|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"link\.zip\.net"
    r")(?![A-Za-z0-9-])"
)

In [86]:
# 7. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    """Return 1 when the module-level ``shortening_services`` pattern is
    found anywhere in ``url``, else 0."""
    return 1 if re.search(shortening_services, url) else 0

In [87]:
# 8.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    """Return 1 (phishing) when the network-location part of ``url``
    contains a '-', else 0 (legitimate)."""
    netloc = urlparse(url).netloc
    return int('-' in netloc)

In [88]:
# 9. Checking Scheme of URL
def checkScheme(url):
  """Return 0 when the URL scheme is 'https', otherwise 1."""
  return 0 if urlparse(url).scheme == 'https' else 1

In [89]:
# 10. Check Using Non-Standard Port
def checkPort(url):
  """Flag URLs that specify a port other than 80 or 443.

  Args:
    url: URL string to inspect.

  Returns:
    0 when no port is given or the port is 80 or 443; 1 for any other port,
    including one that is non-numeric or outside the valid range.
  """
  parsed = urlparse(url)
  try:
    port = parsed.port
  except ValueError:
    # Reading .port raises ValueError for a non-numeric or out-of-range
    # port. Such a port is certainly non-standard, and letting the error
    # escape would abort feature extraction for the whole dataset.
    return 1
  if port is None or port in (80, 443):
    return 0
  return 1

In [90]:
# 11. Checking Dots for Multi-Domains
def checkDots(url):
  """Return 1 when the network-location part of ``url`` contains more than
  two '.' characters, else 0."""
  dot_count = urlparse(url).netloc.count('.')
  return 1 if dot_count > 2 else 0

In [91]:
# 12. Check Query Proportion
def checkQuery(url):
  """Return 1 when the query string makes up more than 22% of the URL's
  length, else 0 (also 0 when there is no query string)."""
  query_length = len(urlparse(url).query)
  if not query_length:
    return 0
  share = (query_length/len(url))*100
  return int(share > 22)

In [92]:
# 13. Statistical Reports Based Feature
def checkStatistics(url):
  """Return 0 when the URL's domain (via ``getDomain``) appears among the
  values of the ``legitUrls`` frame, else 1."""
  known = getDomain(url) in legitUrls.values
  return 0 if known else 1

In [93]:
# Feature Extraction
def extractFeatures(url,label):
  """Build one feature row for ``url``.

  Returns a list holding the domain, the thirteen URL-based features in the
  order they are evaluated below, and finally ``label``.
  """
  return [
    getDomain(url),
    havingIP(url),
    haveAtSign(url),
    getLength(url),
    getDepth(url),
    redirection(url),
    httpDomain(url),
    tinyURL(url),
    prefixSuffix(url),
    checkScheme(url),
    checkPort(url),
    checkDots(url),
    checkQuery(url),
    checkStatistics(url),
    label,
  ]

In [94]:
# Scanning All Links
# Build one feature row per entry of allUrls, pairing each URL with its label.
features = []

url_column = allUrls['url']
label_column = allUrls['result']
for i in range(len(allUrls)):
  url = url_column[i]
  label = label_column[i]
  features.append(extractFeatures(url, label))

In [95]:
# List to Dataframe
# Column names, in the same order as the values produced by extractFeatures.
feature_names = [
  'domain',
  'haveIp',
  'haveAtSign',
  'length',
  'depth',
  'redirection',
  'httpDomain',
  'tinyUrl',
  'prefixSuffix',
  'scheme',
  'port',
  'dots',
  'query',
  'stats',
  'label',
]

allFeatures = pd.DataFrame(data=features, columns=feature_names)

allFeatures.head(n=5)

Unnamed: 0,domain,haveIp,haveAtSign,length,depth,redirection,httpDomain,tinyUrl,prefixSuffix,scheme,port,dots,query,stats,label
0,intego3.info,0,0,0,2,0,0,0,0,1,0,0,0,1,1
1,mathopenref.com,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,computerhope.com,0,0,0,2,0,0,0,0,0,0,0,0,0,0
3,investopedia.com,0,0,0,3,0,0,0,0,0,0,0,0,0,0
4,jobs.emss.org.uk,0,0,0,1,0,0,0,0,0,0,1,0,1,0


In [96]:
# Saving Extracted Features in File
# Write the feature table to disk without the DataFrame index column.
allFeatures.to_csv(path_or_buf='allFeatures.csv', index=False)