# **2.B FEATURE EXTRACTION**
Legitimate (benign) URLs only

#### The objective of this notebook is to extract features from the collected URLs and save them as a CSV file.

* Lexical Features
* Whois Features
* Popularity Features

#### This project was developed in Jupyter Notebook

In [14]:
import pandas as pd
from urllib.parse import urlparse
import re
from bs4 import BeautifulSoup
import whois
import urllib.request
import time
import socket
from urllib.error import HTTPError
from datetime import datetime

In [15]:
# Import dataset: read only the 'url' column of the source CSV into a DataFrame.

# Previously used input files, kept for reference:
#/Users/jillkathleen/Downloads/github/masqueraderx-Phishing-Websites-Detection/ModelTrain/raw_datasets/1000-phishing.txt

#/Users/jillkathleen/Downloads/github/masqueraderx-Phishing-Websites-Detection/ModelTrain/raw_datasets/legitimate_urls.txt

# NOTE(review): no label column is read and no filtering is applied here, so every
# row of this file is treated as a legitimate URL downstream -- confirm that the CSV
# contains benign URLs only. The path is absolute and machine-specific.
legitimate = pd.read_csv("/Users/jillkathleen/Desktop/Phishing-Analysis-Detection/Back-End/Feature-Extraction-ntbk/FE-more-data/Webpages_Classification_test_data.csv", usecols=['url'])



In [16]:
# Show the loaded URL frame as the cell output.
legitimate

Unnamed: 0,url
0,http://www.dutchthewiz.com/freeware/
1,http://www.collectiblejewels.com
2,http://www.deadlinedata.com
3,http://www.mil.fi/maavoimat/kalustoesittely/00...
4,http://www.avclub.com/content/node/24539
...,...
308297,http://animeworld.com/reviews/blacklion.html
308298,http://www.luxnova.com/comp/
308299,http://www.drive55.org
308300,http://www.usatoday.com/life/cyber/tech/2002/0...


In [17]:

# Draw 1500 URLs at random; the fixed random_state makes the sample repeatable.
legit = legitimate.sample(n = 1500, random_state = 12).copy()

# Renumber the sampled rows from 0 so the extraction loop can address them by position.
legitimate_urls = legit.reset_index(drop=True)
legitimate_urls


Unnamed: 0,url
0,http://www.tripadvisor.com
1,http://www.neocomputers.com
2,http://www.stoneagerockgym.com/
3,http://www.ornl.gov/sci/ortep/topology/defs.txt
4,http://wc.rootsweb.ancestry.com/cgi-bin/igm.cg...
...,...
1495,http://www.mach3engineering.com/
1496,http://www.chefjobs.com/
1497,http://www.meijer.com/
1498,http://www.spacesurfer.com/wceleb/list/halle_b...


In [18]:
# Make sure the single column is named 'url' (only the 'url' column was loaded,
# so this assignment does not change the frame), then preview the first rows.
legitimate_urls.columns = ['url']
legitimate_urls.head()

Unnamed: 0,url
0,http://www.tripadvisor.com
1,http://www.neocomputers.com
2,http://www.stoneagerockgym.com/
3,http://www.ornl.gov/sci/ortep/topology/defs.txt
4,http://wc.rootsweb.ancestry.com/cgi-bin/igm.cg...


## 2.1 Lexical Features

* URL Length 
* URL Shortening Services “TinyURL”
* URL Presence of "@" Symbol
* URL Presence of special characters : _ ? = & etc
* URL Suspicious words (security sensitive words)
* URL Digit Count
* URL Protocol Count (http / https)
* URL Dot Count
* URL Hyphen Count
* Domain presence of IP Address
* Domain presence of hyphen / prefix or Suffix
* Sub Domain and Multi Sub Domains Count
* Redirecting "//" in URL (// position)
* URL presence of EXE


In [19]:
#class FeatureExtraction:
#    def __init__(url):
#        pass

# 1.Extracts domain from the given URL
def getDomain(url):
    """Return the network location of ``url`` without a leading ``www.``.

    Parameters
    ----------
    url : str
        Absolute URL, e.g. ``"http://www.example.com/path"``.

    Returns
    -------
    str
        The netloc part of the URL with one leading ``"www."`` removed.
        Empty string when the URL has no ``//`` authority part.
    """
    domain = urlparse(url).netloc
    # Remove only a literal "www." prefix. The previous regex used an unescaped
    # dot and str.replace() removed *every* "www." in the host, which mangled
    # hosts such as "www.awww.com".
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain
    
# 2.Checks for IP address in URL (Have_IP)
def ip_address(url):
    """Return 1 when the host of ``url`` is a literal IP address, else 0.

    The original version called ``ipaddress.ip_address`` without importing the
    module and passed the whole URL; the bare ``except`` hid the resulting
    error, so the feature was always 0.

    Parameters
    ----------
    url : str
        URL (or bare host) to inspect.

    Returns
    -------
    int
        1 if the host parses as an IPv4/IPv6 address, 0 otherwise (including
        for input that cannot be parsed at all).
    """
    # Local import: the notebook's import cell does not provide ``ipaddress``.
    import ipaddress

    try:
        # ``hostname`` drops userinfo, port and IPv6 brackets; fall back to the
        # raw string so a bare "10.0.0.1" is still recognised.
        host = urlparse(url).hostname or url
        ipaddress.ip_address(host)
    except (ValueError, TypeError, AttributeError):
        return 0
    return 1
    
# 3.Checks the presence of @ in URL (Have_At)
def have_at_symbol(url):
    """Return 1 if ``url`` contains an ``@`` character anywhere, otherwise 0."""
    return 1 if "@" in url else 0
    
# 4.Finding the length of URL and categorizing (URL_Length)
def long_url(url):
    """Categorise the URL by length: 0 for fewer than 54 characters, 1 otherwise."""
    is_long = len(url) >= 54
    return int(is_long)

# 5.Gives number of non-empty '/'-separated path segments in URL (URL_Depth)
def getDepth(url):
    """Return how many non-empty segments the path of ``url`` has.

    Empty segments (leading, trailing or doubled slashes) are not counted.
    """
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
        
# 6.Checking for redirection '//' in the url (Redirection)
def redirection(url):
    """Return 1 if the last ``//`` in ``url`` starts beyond index 7, else 0.

    The ``//`` of an ``http://`` or ``https://`` prefix starts at index 5 or 6,
    so only a later double slash sets the flag.
    """
    last_double_slash = url.rfind('//')
    return int(last_double_slash > 7)
    
# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
    """Return 1 when the domain part of ``url`` contains an http/https token.

    The original test ``'https://|http://' in domain`` looked for that literal
    17-character string (a regex alternation used as a plain substring), which
    can never occur in a netloc, so the feature was always 0.

    Parameters
    ----------
    url : str
        Absolute URL.

    Returns
    -------
    int
        1 if ``"http"`` (which also covers ``"https"``) appears in the netloc,
        compared case-insensitively; 0 otherwise. The URL scheme itself is not
        part of the netloc and therefore never triggers the flag.
    """
    domain = urlparse(url).netloc
    return 1 if 'http' in domain.lower() else 0

    
# 8. Checking for Shortening Services in URL (Tiny_URL)
# Hosts of known URL-shortening services (lower-case, without "www.").
_SHORTENING_HOSTS = frozenset({
    'bit.ly', 'goo.gl', 'shorte.st', 'go2l.ink', 'x.co', 'ow.ly', 't.co',
    'tinyurl.com', 'tr.im', 'is.gd', 'cli.gs', 'yfrog.com', 'migre.me',
    'ff.im', 'tiny.cc', 'url4.eu', 'twit.ac', 'su.pr', 'twurl.nl',
    'snipurl.com', 'short.to', 'budurl.com', 'ping.fm', 'post.ly', 'just.as',
    'bkite.com', 'snipr.com', 'fic.kr', 'loopt.us', 'doiop.com', 'short.ie',
    'kl.am', 'wp.me', 'rubyurl.com', 'om.ly', 'to.ly', 'bit.do', 'lnkd.in',
    'db.tt', 'qr.ae', 'adf.ly', 'bitly.com', 'cur.lv', 'ity.im', 'q.gs',
    'po.st', 'bc.vc', 'twitthis.com', 'u.to', 'j.mp', 'buzurl.com', 'cutt.us',
    'u.bb', 'yourls.org', 'prettylinkpro.com', 'scrnch.me', 'filoops.info',
    'vzturl.com', 'qr.net', '1url.com', 'tweez.me', 'v.gd', 'link.zip.net',
})


def shortening_service(url):
    """Return 1 when ``url`` is hosted on a known URL-shortening service.

    The previous implementation ran an unanchored regex over the whole URL, so
    short patterns matched inside unrelated hosts (``t.co`` inside
    ``microsoft.com``, ``x.co`` inside ``box.com``). The host is now compared
    against the shortener list as a whole domain or as a parent domain.

    Parameters
    ----------
    url : str
        URL to inspect; a missing scheme (``"bit.ly/abc"``) is tolerated.

    Returns
    -------
    int
        1 (phishing indicator) if the host equals a listed shortener or is a
        sub-domain of one, 0 (legitimate) otherwise or when the URL cannot be
        parsed.
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # No "//" authority: re-parse as a scheme-less URL.
            parsed = urlparse("//" + url)
        host = parsed.hostname or ""   # hostname is already lower-cased
    except (ValueError, TypeError, AttributeError):
        return 0

    # Test the full host and each parent domain: "www.t.co" -> "t.co" -> "co".
    labels = host.rstrip(".").split(".")
    for start in range(len(labels)):
        if ".".join(labels[start:]) in _SHORTENING_HOSTS:
            return 1               # phishing
    return 0                       # legitimate
    
    
    
    
# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefix_suffix_separation(url):
    """Return 1 (phishing) if the netloc of ``url`` contains a hyphen, else 0 (legitimate)."""
    netloc = urlparse(url).netloc
    return int("-" in netloc)
    
# 10. DNS Record 

    
# 11.Web traffic (Web_Traffic)
def web_traffic(url, timeout=10):
    """Look up the Alexa reach rank of ``url`` and turn it into a binary feature.

    Parameters
    ----------
    url : str
        URL whose rank is requested.
    timeout : float, optional
        Seconds to wait for the ranking service before giving up (default 10).

    Returns
    -------
    int
        1 when the rank is below 100000 *or* when no rank could be obtained
        (network failure, missing ``REACH`` element, non-numeric rank);
        0 when the rank is 100000 or higher.

    Notes
    -----
    NOTE(review): "no rank available" and "rank < 100000" both map to 1, which
    looks inconsistent -- confirm the intended polarity before training on
    this column.
    NOTE(review): the data.alexa.com endpoint appears to be out of service
    (the stored output shows 1 for every row) -- confirm and consider
    replacing the data source.
    """
    try:
        query = urllib.parse.quote(url)
        # Network failures (URLError/HTTPError/timeouts are all OSError
        # subclasses) must not abort a long extraction run over many URLs.
        with urllib.request.urlopen(
                "http://data.alexa.com/data?cli=10&dat=s&url=" + query,
                timeout=timeout) as response:
            payload = response.read()
    except OSError:
        return 1

    # Parser set-up errors (e.g. a missing XML parser) are deliberately not
    # swallowed, so a broken environment stays visible.
    reach = BeautifulSoup(payload, "xml").find("REACH")
    try:
        rank = int(reach['RANK'])
    except (TypeError, KeyError, ValueError):
        # No REACH element, no RANK attribute, or a non-numeric rank.
        return 1

    if rank <100000:
        return 1
    else:
        return 0
        
# 12.Survival time of domain: The difference between termination time and creation time (Domain_Age)
def domainAge(domain_name):
    """Binary feature from the registered lifetime of a domain.

    Parameters
    ----------
    domain_name : object
        WHOIS record exposing ``creation_date`` and ``expiration_date``
        attributes (``datetime``, ``'%Y-%m-%d'`` string, list or ``None``).

    Returns
    -------
    int
        0 when ``|expiration - creation|`` is at least six 30-day months;
        1 when it is shorter, or when either date is missing, is a list,
        cannot be parsed or cannot be subtracted.

    Notes
    -----
    NOTE(review): records whose dates come back as lists are scored 1, like
    unknown domains -- confirm this is intended.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date

    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        try:
            # Both values go through strptime; a non-string partner raises
            # TypeError and a badly formatted string raises ValueError.
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (TypeError, ValueError):
            return 1

    if expiration_date is None or creation_date is None:
        return 1
    if isinstance(expiration_date, list) or isinstance(creation_date, list):
        return 1

    def _drop_tz(value):
        # Aware and naive datetimes cannot be subtracted; a few hours of
        # offset are irrelevant against a six-month threshold.
        if isinstance(value, datetime) and value.tzinfo is not None:
            return value.replace(tzinfo=None)
        return value

    try:
        ageofdomain = abs((_drop_tz(expiration_date) - _drop_tz(creation_date)).days)
    except (TypeError, AttributeError):
        # Unsupported date types: treat like a missing record.
        return 1

    return 1 if (ageofdomain / 30) < 6 else 0

# 13.End time of domain: The difference between termination time and current time (Domain_End)
def domainEnd(domain_name):
    """Binary feature from the time between now and the domain's expiration.

    Parameters
    ----------
    domain_name : object
        WHOIS record exposing an ``expiration_date`` attribute
        (``datetime``, ``'%Y-%m-%d'`` string, list or ``None``).

    Returns
    -------
    int
        0 when ``|expiration - now|`` is shorter than six 30-day months;
        1 when it is longer, or when the date is missing, is a list, cannot
        be parsed or cannot be compared with the current time.

    Notes
    -----
    The absolute difference is used, so a domain that expired long ago is
    scored the same as one that expires far in the future.
    """
    expiration_date = domain_name.expiration_date

    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1

    if expiration_date is None or isinstance(expiration_date, list):
        return 1

    try:
        # Take "now" in the expiry's own timezone: subtracting a naive now()
        # from a timezone-aware expiry raises TypeError.
        today = datetime.now(getattr(expiration_date, "tzinfo", None))
        end = abs((expiration_date - today).days)
    except TypeError:
        # Unsupported date type: treat like a missing record.
        return 1

    return 0 if (end / 30) < 6 else 1

# 14. Dot count
def dot_count(url):
    """Flag URLs by their number of dots.

    Returns 0 (legitimate) for fewer than three dots. Exactly three
    (suspicious) and more than three (phishing) both return 1.
    """
    dots = url.count(".")
    return int(dots >= 3)
        
    
# 14. Special characters count
def specialcharCount(url):
    """Count the special characters in ``url``.

    Counted characters: ``; + = _ ? & [ ] / :``.

    The original list contained the two-character entry ``'+='``. The URL is
    scanned one character at a time, so that entry could never match and
    ``+`` was silently left out of the count.

    Parameters
    ----------
    url : str
        URL to scan.

    Returns
    -------
    int
        Number of characters of ``url`` that belong to the set above.
    """
    special_characters = {';', '+', '=', '_', '?', '&', '[', ']', '/', ':'}
    return sum(1 for each_letter in url if each_letter in special_characters)


# 15. Sub-domain flag (subdom_count)
def subdomCount(url):
    """Return 1 when the host of ``url`` has more than one dot, else 0.

    A leading ``"www."`` is ignored. The original code took the text after the
    *last* ``"//"`` as the domain, so a double slash inside the path made path
    text count as the host. The host is now taken from the parsed URL, which
    also leaves out userinfo and port.

    Parameters
    ----------
    url : str
        URL to inspect; a missing scheme (``"www.example.com/x"``) is tolerated.

    Returns
    -------
    int
        0 if the host (without ``www.``) contains at most one dot or cannot be
        determined, 1 otherwise.
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # No "//" authority: re-parse as a scheme-less URL.
            parsed = urlparse("//" + url)
        domain = parsed.hostname or ""
    except (ValueError, TypeError, AttributeError):
        return 0

    if domain.startswith("www."):
        domain = domain[len("www."):]

    return 0 if domain.count('.') <= 1 else 1

In [20]:
#Function to extract features
def featureExtraction(url,label):
    """Build one feature row for ``url``.

    Parameters
    ----------
    url : str
        URL to analyse.
    label : int
        Class label appended as the last element of the row.

    Returns
    -------
    list
        ``[domain, ip, at, length, depth, redirection, https_domain,
        short_url, prefix/suffix, dns_record, web_traffic, domain_age,
        domain_end, dot_count, specialchar_count, subdom_count, label]``.
        The order is positional: it has to match the column list used when
        the rows are turned into a DataFrame.
    """
    # URL-string features.
    features = [
        getDomain(url),
        ip_address(url),
        have_at_symbol(url),
        long_url(url),
        getDepth(url),
        redirection(url),
        httpDomain(url),
        shortening_service(url),
        prefix_suffix_separation(url),
    ]

    # WHOIS lookup: dns = 1 marks "no usable record". ``except Exception``
    # (instead of a bare except) keeps the batch running on lookup errors
    # while still letting Ctrl-C interrupt a long run.
    domain_name = None
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except Exception:
        dns = 1

    features.append(dns)
    features.append(web_traffic(url))
    # Without a WHOIS record both date features default to 1.
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))

    features.append(dot_count(url))
    features.append(specialcharCount(url))
    features.append(subdomCount(url))

    features.append(label)

    return features

In [21]:
# Column names for the extracted feature rows, in the order the values are produced.
feature_names = [
    'domain',
    'ip_present',
    'at_present',
    'url_length',
    'url_depth',
    'redirection',
    'https_domain',
    'short_url',
    'prefix/suffix',
    'dns_record',
    'web_traffic',
    'domain_age',
    'domain_end',
    'dot_count',
    'specialchar_count',
    'subdom_count',
    'label',
]

# Class label used for this notebook's rows.
label = 0

In [22]:
# Run the feature extraction over every sampled URL and keep the rows in a list.

# Wall-clock start, used for the runtime report at the end of the cell.
start_time = time.time()
print('\n')
print('Begin feature extraction for benign dataset.... \n')

##===================================##

label = 0                                  # label written into every row
rows = len(legitimate_urls['url'])         # number of URLs to process
legit_features = []

for i, url in enumerate(legitimate_urls['url']):
    # Progress trace: row number, then the URL being processed.
    print(i)
    print(url)
    legit_features.append(featureExtraction(url,label))

##===================================##

# Elapsed time formatted as HH:MM:SS.
elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
print('\n')
print(f"Runtime: Feature Extraction for legitimate dataset took:  {elapsed}")

print('\n\n\n\n')
print("***Legitimate Features")




Begin feature extraction for benign dataset.... 

0
http://www.tripadvisor.com
1
http://www.neocomputers.com
2
http://www.stoneagerockgym.com/
3
http://www.ornl.gov/sci/ortep/topology/defs.txt
4
http://wc.rootsweb.ancestry.com/cgi-bin/igm.cgi?db=:1529653
5
http://www.dps.state.la.us/lgcb/
6
http://www.moorabbingleneiraleader.com.au/
7
http://www.malivoirewineco.com/
8
http://www.animalactivism.org/resources/online/cruelty_free_shopping_list_29.php
9
http://www.imdb.com/name/nm0488057/
10
http://www.poemeopathy.com
11
http://www.gamespot.com/gamecube/rpg/animalcrossing/review.html
12
http://www-linguistics.stanford.edu/nwav/
13
http://mally.stanford.edu/mally.html
14
http://www.worldflex.net/
15
http://webring.com/hub?ring=heli
16
http://www.apra.com/about.cfm
17
http://pc.gamespy.com/pc/the-sims-2/
18
http://www.caffeappassionato.com/
19
http://www.glenoaks.cc.mi.us/
20
http://www.lifeformations.com/
21
http://www.osteohome.com
22
http://www.nightvis.com
23
http://www.vhcpa.com
24
ht

202
http://beehive.thisisbristol.com/default.asp?wci=sitehome&amp;id=10555
203
http://web.onetel.net.uk/~english4tomorrow/home_page_1x.html
204
http://members.tripod.com/j0nmark/main.html
205
http://www.gardenfreshbirdseed.com/
206
http://www.givoni.dk
207
http://www.geocities.com/foxxys_den/index.html
208
http://iss.jaxa.jp/kids/
209
http://www.myspace.com/friendsofkenband
210
http://www.pashnit.com/bikes/speedtriple-martins.htm
211
http://www.angelfire.com/music/tash/jarsofclay.html
212
http://academic.shu.edu/chesterton/
213
http://www.foresthomenaz.com/
214
http://www.webmates.ch/
215
http://members.tripod.com/spiffyshoes/zoinks.html
216
http://www.russian-translation.biz
Error trying to connect to socket: closing socket
217
http://www.rickard.karoo.net/articles/battles_stamford.html
218
http://www.nls.uk/info/readingrooms/askalibrarian.html
219
http://www.freetextlinkexchange.com/
220
http://www.curriculum.edu.au/accessasia/korea/kids.htm
221
http://www.sussex.ac.uk/chemistry/
222

395
http://www.umich.edu/~sigmachi/
396
http://www.oilworks.com
397
http://www.mlc-wels.edu/home/athletics/
398
http://www.radiofuture.com/
399
http://www.friends-classics.demon.co.uk/
400
http://www.eastwoodspres.org/
Error trying to connect to socket: closing socket
401
http://www.bobbyoresports.com
402
http://www.nanotechnologyinstitute.org/neitc.html
403
http://www.csbc.co.uk/
404
http://www.eboss.co.uk
405
http://www.coppercliff.com
406
http://www.slidecage.com
407
http://bytex64.net/vfs/
408
http://www.neuro.org.my
Error trying to connect to socket: closing socket
409
http://www.gardrolma.org
Error trying to connect to socket: closing socket
410
http://www.haro-online.com/movies/haunting.html
411
http://www.irandarroudi.com/
412
http://www.vibration-testing.com
413
http://ip.rsu.ru/
414
http://www.bbc.co.uk/history/historic_figures/edward_viii_king.shtml
415
http://www.juveniledefender.net/
416
http://www.chem.metu.edu.tr/
417
http://www.opc.ncep.noaa.gov/
418
http://www.filmsite

581
http://sports.yahoo.com/nhl/teams/stl/
582
http://www.12step.org
583
http://www.mobygames.com/game/sheet/gameid128
584
http://www.rod.beavon.clara.net/robert_hooke.htm
585
http://www.mesjunkets.com/
586
http://www.jda-cpa.com/
587
http://new-playboy.jp/
588
http://www.theyellowpencil.com/
589
http://www.smhct.org/
590
http://www.collectionscanada.gc.ca/2/4/h4-3100-e.html
591
http://www.plexisweb.com
592
http://www.wmdshred.com
593
http://www.plastyrobel.com/welcome_e.htm
594
http://www.pace.com/
595
http://www.kobe-jma.go.jp/knowledge/kids/kids.html
596
http://www.gallaudet.edu/
597
http://www.typhoonline.com/
598
http://www.coecreekfarm.com/
599
http://www.geocities.com/misccorson/
600
http://www.fchd.info/brierlht.htm
601
http://www.moordarts.co.uk
602
http://www.technicaredental.com/
603
http://www.cdnhomecare.ca/
604
http://www.aerobikick.com
605
http://pocket.ign.com/articles/367/367807p1.html
606
http://users.pipeline.com.au/~dyers/
607
http://www.sigchi.org/
608
http://www.c

769
http://www.sdcss.com
770
http://www.findagrave.com/cgi-bin/fg.cgi?page=gr&amp;grid=5218
771
http://www.napervillebikeclub.com/
772
http://www.bouteiller.com/
773
http://members.tripod.com/caroline_bowen/mbc.htm
774
http://www.asianart.com/mandalas/
775
http://www.ceremoniesandcelebrations.com
776
http://www.botany.hawaii.edu/faculty/carr/nyctagin.htm
777
http://www.alaskaindianarts.com
778
http://www.nottingham.ac.uk/biology/
779
http://www.newadvent.org/cathen/10660b.htm
780
http://www.descriptio.com
781
http://www.goodship.net/tuesdays/
Error trying to connect to socket: closing socket
782
http://www.therocketman.net
783
http://www.warlordsofearth.com
784
http://members.tripod.com/compton100/
785
http://www.geocities.com/nevermore4life/home.html
786
http://www.californiascientific.com/
Error trying to connect to socket: closing socket
787
http://www.altecdiagnostic.com/
788
http://members.tripod.com/photobite/
789
http://www.shetrust.org.uk
790
http://www.jccp.gr.jp/
791
http://w

970
http://music911.net/
971
http://www.2gnt.com/
972
http://www.freewebs.com/vaultie
973
http://www.tecla.com/authors/giulianimusic.htm
974
http://www.silverbackcomputers.co.uk
975
http://www.grandtraverselighthouse.com/
976
http://www.manizone.co.uk/
977
http://www.midsouthcon.org/
978
http://mysite.verizon.net/trinity_um/
979
http://www.britsofthehudson.org/
Error trying to connect to socket: closing socket
980
http://www.dcup.com
981
http://scoutsongs.com/
982
http://www.austin7.org/
983
http://s.webring.com/hub?ring=skemers
984
http://www.mwt.net/~stbon/
985
http://www.promotissue.eu/
986
http://members.core.com/~olearys3/
987
http://minerals.usgs.gov/minerals/pubs/commodity/copper/
988
http://www.safesoft.com
989
http://www.apepenpublishing.com/
990
http://www.bandname.com/
991
http://aappolicy.aappublications.org/cgi/content/full/pediatrics;99/4/639?fulltext=tobacco+smoke
992
http://members.iinet.com.au/~ianw/
993
http://news.bbc.co.uk/1/hi/business/1561832.stm
994
http://web.ti

1161
http://blaw.free.fr/poissons/sommaire.htm
1162
http://www.adhdchild.org
Error trying to connect to socket: closing socket
1163
http://www.dailyherald.com/
1164
http://www.geocities.com/harveststarfarm/
1165
http://www.nuance.com/pdfconverter/
1166
http://www.vickerstafflaw.com
1167
http://www.michaelstarr.co.uk
1168
http://www.plumeriabay.com/
1169
http://www.discodanthealienman.com/dd2movies.htm
1170
http://shakespeare.palomar.edu/lambtales/ltmnd.htm
1171
http://www.ecawar.org/home/index.php
1172
http://slackerz.8m.com
1173
http://groups.yahoo.com/group/theoriginalpaulbettanyclub/
1174
http://www.propertycentric.com
1175
http://www.hollywoodreporter.com/hr/film/reviews/article_display.jsp?&amp;rid=9906
1176
http://www.va-home-loansonline.com
1177
http://www.celebritygolf.com/celebrity-list.asp?search=joe+mantegna#
1178
http://article.gmane.org/gmane.comp.lib.gnustep.general/26371/
1179
http://www.johngodwin.net/1195/ax0007gr/1201.html
1180
http://www.brandequalsexperience.com
118

1331
http://nipponcinema.com
1332
http://www.wwf.org.au/tsn/
1333
http://spd-web.terma.com/projects/raise/
1334
http://www.traffico-aereo.it/
1335
http://www.asocon.org/main.htm
Error trying to connect to socket: closing socket
1336
http://undifinable.netfirms.com/
1337
http://www.shakygroundmusic.com
1338
http://www.happybooster.com/
1339
http://www.jonahhouse.org
Error trying to connect to socket: closing socket
1340
http://www.chrisdellavedova.com
1341
http://www.homeeducationpartnership.com/
1342
http://rna.rega.kuleuven.ac.be/masspec/
1343
http://online.anu.edu.au/forestry/teaching/t6/picea_a/picea_a.html
1344
http://www.lasthour.com/
1345
http://pc.ign.com/objects/011/011500.html
1346
http://www.sikh-heritage.co.uk/movements/radhasoamis/the%20radhasoamis.htm
1347
http://calstuff.blogsome.com/
1348
http://www.coolnurse.com/puberty.htm
1349
http://www.splendidspaces.com/
1350
http://www.tgeweb.com/ironworks/index.shtml
1351
http://www.electrocutas.com/
1352
http://stats.football.co

In [23]:
# Convert the list of feature rows into a DataFrame, one column per entry of feature_names.
# NOTE(review): this rebinds `legitimate`, which earlier held the raw URL frame;
# re-running the sampling cell after this one would sample from the feature table instead.

legitimate = pd.DataFrame(legit_features, columns= feature_names)
legitimate.head()

Unnamed: 0,domain,ip_present,at_present,url_length,url_depth,redirection,https_domain,short_url,prefix/suffix,dns_record,web_traffic,domain_age,domain_end,dot_count,specialchar_count,subdom_count,label
0,tripadvisor.com,0,0,0,0,0,0,0,0,0,1,1,1,0,3,0,0
1,neocomputers.com,0,0,0,0,0,0,0,0,0,1,1,1,0,3,0,0
2,stoneagerockgym.com,0,0,0,0,0,0,0,0,0,1,0,1,0,4,0,0
3,ornl.gov,0,0,0,4,0,0,0,0,0,1,1,1,1,7,0,0
4,wc.rootsweb.ancestry.com,0,0,1,2,0,0,0,0,0,1,1,1,1,8,1,0


In [24]:
# Store the extracted legitimate-URL features in a CSV file (row index not written).
# NOTE(review): absolute, machine-specific output path.

legitimate.to_csv('/Users/jillkathleen/Desktop/Phishing-Analysis-Detection/Back-End/Feature-Extraction-ntbk/FE-more-data/benign_SAMPLE.csv', index= False)

