# **2.B FEATURE EXTRACTION**

## Dataset 2 - Phishing - 30% - 4437 urls

#### The objective of this notebook is to collect data and save it as a CSV file for Feature Extraction.

* Lexical Features
* Whois Features
* Popularity Features

#### This project is worked on Jupyter Notebook 

In [1]:
import pandas as pd
from urllib.parse import urlparse
import re
from bs4 import BeautifulSoup
import whois
import urllib.request
import time
import socket
from urllib.error import HTTPError
from datetime import datetime

In [2]:
#import dataset


legitimate_urls = pd.read_csv("/Users/jillkathleen/Desktop/Phishing-Analysis-Detection/Back-End/Data-Collection/Dataset2/D2_legit_label30.csv")



In [3]:
legitimate_urls

Unnamed: 0,url,label
0,www.hackcraft.net/raii/,good
1,promo.yahoo.com/user_research/,good
2,www.cpm.z80.de/small_c.html,good
3,www.codeguru.com/Cpp/G-M/opengl/,good
4,www.adaptivepath.com/publications/essays/archi...,good
...,...,...
4432,www.ccs.neu.edu/research/demeter/,good
4433,www.w3.org/Robot/,good
4434,webclipart.about.com/od/animalclipartlinks/l/b...,good
4435,www.march-hare.com/library/html/ud6blurb.htm,good


## 2.1 Lexical Features

* URL Length 
* URL Shortening Services “TinyURL”
* URL Presence of "@" Symbol
* URL Presence of special characters : _ ? = & etc
* URL Suspicious words (security sensitive words)
* URL Digit Count
* URL Protocol Count (http / https)
* URL Dot Count
* URL Hyphen Count
* Domain presence of IP Address
* Domain presence of hyphen / prefix or Suffix
* Sub Domain and Multi Sub Domains Count
* Redirecting "//" in URL (// position)
* URL presence of EXE


In [4]:
#class FeatureExtraction:
#    def __init__(url):
#        pass

# 1.Extracts domain from the given URL
def getDomain(url):
    domain = urlparse(url).netloc
    if re.match(r"^www.",domain):
        domain = domain.replace("www.","")
    return domain
    
# 2.Checks for IP address in URL (Have_IP)
def ip_address(url):
    try:
        ipaddress.ip_address(url)
        ip = 1
    except:
        ip = 0
    return ip
    
# 3.Checks the presence of @ in URL (Have_At)
def have_at_symbol(url):
    if "@" in url:
        at = 1 
    else:
        at = 0   
    return at
    
# 4.Finding the length of URL and categorizing (URL_Length)
def long_url(url):
    if len(url) < 54:
        length = 0    
    else:
        length = 1    
    return length

# 5.Gives number of '/' in URL (URL_Depth)
def getDepth(url):
    s = urlparse(url).path.split('/')
    depth = 0
    for j in range(len(s)):
        if len(s[j]) != 0:
            depth = depth+1
    return depth
        
# 6.Checking for redirection '//' in the url (Redirection)
def redirection(url):
    pos = url.rfind('//')
    if pos > 6:
        if pos > 7:
            return 1
        else:
            return 0
    else:
        return 0
    
# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
    domain = urlparse(url).netloc
    if 'https://|http://' in domain:
        return 1
    else:
        return 0

    
# 8. Checking for Shortening Services in URL (Tiny_URL) 
def shortening_service(url):
    match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                    'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                    'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                    'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                    'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                    'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                    'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                    'tr\.im|link\.zip\.net', url)
    if match:
        return 1               # phishing
    else:
        return 0               # legitimate
    
    
    
    
# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)     
def prefix_suffix_separation(url):
    if "-" in urlparse(url).netloc:
        return 1            # phishing
    else:
        return 0            # legitimate
    
# 10. DNS Record 

    
# 11.Web traffic (Web_Traffic)
def web_traffic(url):
    try:
        url = urllib.parse.quote(url)
        rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
        "REACH")['RANK']
        rank = int(rank)
    except TypeError:
        return 1
    if rank <100000:
        return 1
    else:
        return 0
        
# 12.Survival time of domain: The difference between termination time and creation time (Domain_Age)  
def domainAge(domain_name):
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
        try:
            creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
        except:
            return 1
    if ((expiration_date is None) or (creation_date is None)):
        return 1
    elif ((type(expiration_date) is list) or (type(creation_date) is list)):
        return 1
    else:
        ageofdomain = abs((expiration_date - creation_date).days)
        if ((ageofdomain/30) < 6):
            age = 1
        else:
            age = 0
    return age

# 13.End time of domain: The difference between termination time and current time (Domain_End) 
def domainEnd(domain_name):
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date,str):
        try:
            expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
        except:
            return 1
    if (expiration_date is None):
        return 1
    elif (type(expiration_date) is list):
        return 1
    else:
        today = datetime.now()
        end = abs((expiration_date - today).days)
    if ((end/30) < 6):
        end = 0
    else:
        end = 1
    return end

# 14. Dot count
def dot_count(url):
    if url.count(".") < 3:
        return 0            # legitimate
    elif url.count(".") == 3:
        return 1            # suspicious
    else:
        return 1            # phishing
        
    
# 14. Special characters count
def specialcharCount(url):
    cnt = 0
    special_characters = [';','+=','_','?','=','&','[',']','/',':']
    for each_letter in url:
        if each_letter in special_characters:
            cnt = cnt + 1
    return cnt


# 15. 
def subdomCount(url):

    # separate protocol and domain then count the number of dots in domain
    
    domain = url.split("//")[-1].split("/")[0].split("www.")[-1]
    if(domain.count('.')<=1):
        return 0
    else:
        return 1

In [5]:
#Function to extract features
def featureExtraction(url,label):
    
    features = []
    
    features.append(getDomain(url))
    features.append(ip_address(url))
    features.append(have_at_symbol(url))
    features.append(long_url(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(shortening_service(url))
    features.append(prefix_suffix_separation(url))
  
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except:
        dns = 1
        
    features.append(dns)
    features.append(web_traffic(url))
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))
    
    features.append(dot_count(url))
    features.append(specialcharCount(url))
    features.append(subdomCount(url))
    

    features.append(label)
    
    
    return features

In [6]:
feature_names = ['domain', 'ip_present', 'at_present', 'url_length', 'url_depth','redirection', 
                      'https_domain', 'short_url', 'prefix/suffix', 'dns_record', 'web_traffic', 
                      'domain_age', 'domain_end', 'dot_count', 'specialchar_count','subdom_count', 'label']

label = 0

In [7]:
# Extracting the features & storing them in a list
# Lexical Features

# starting time
start_time = time.time()
print('\n')
print('Begin feature extraction for benign dataset.... \n')

##===================================##


#Extracting the feautres & storing them in a list
legit_features = []
rows = len(legitimate_urls['url'])
label = 0

for i in range(0, rows):
    url = legitimate_urls['url'][i]
    print(i), print(url)
    
    
    legit_features.append(featureExtraction(url,label))

    
##===================================##

elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
print('\n')
print(f"Runtime: Feature Extraction for legitimate dataset took:  {elapsed}")


print('\n\n\n\n')
print("***Legitimate Features")




Begin feature extraction for benign dataset.... 

0
www.hackcraft.net/raii/
1
promo.yahoo.com/user_research/
2
www.cpm.z80.de/small_c.html
3
www.codeguru.com/Cpp/G-M/opengl/
4
www.adaptivepath.com/publications/essays/archives/000328.php
5
www.osnews.com/story/5602
6
www.angelfire.com/trek/scottmaurer/
7
lyonsdenvideo.com/LDV.htm
8
www.nada.kth.se/~viggo/problemlist/compendium.html
9
en.wikibooks.org/wiki/Programming:Prolog
10
pourpre.com/chromograf/en/
11
www.cs.umd.edu/hcil/photolib/
12
www.digital-web.com/articles/introduction_to_xml/
13
tools.ietf.org/html/rfc1157
14
tools.ietf.org/html/rfc2468
15
www.ibm.com/developerworks/web/library/wa-seo2/
16
www.census.gov/foreign-trade
17
vmware0.tripod.com/index.htm
18
tools.ietf.org/html/rfc2699
19
www.zib.de/weiser/vtl/
20
www.global-home.org/convee/
21
www.cs.cmu.edu/~illah/
22
www.sanfords.net/Spots_free_graphics/Space_Background_Tutorial.htm
23
cs.anu.edu.au/wicapuc/
24
www.xml.com/resourceguide/
25
cernlib.web.cern.ch/cernlib/
26
www

206
www.algotech.dk/word-html-cleaner-input.htm
207
dwhome.dataway.ch/support/traceroute.aqua
208
home.earthlink.net/~joseph-ja/programs.html#Tcl-Tk
209
exophase.com/psp/basilisk-ii-for-psp-mac-os-emulator-6604.htm
210
sais.aisnet.org/sais2005/cfp.htm
211
tools.ietf.org/html/rfc2776
212
nikonimaging.com/global/products/scanner/index.htm
213
www.cs.brown.edu/research/plt/software/divascheme/
214
tools.ietf.org/html/rfc2769
215
www.mailutilities.com/adr/
216
www.hatz.ch/arkeroid/
217
slashdot.org/articles/older/980613118210_F.shtml
218
www.sankhya.com/info/varadhi.html
219
xml.coverpages.org/ace-overview.html
220
www.rajivshah.com/Case_Studies/Cookies/CookiesLinks.html
221
www.wingfoot.com/products.html
222
www.lysator.liu.se/~zap/ldb/
223
www.angelfire.com/games/UoLS/index.html
224
www.informatik.uni-kiel.de/~mh/FLP/
225
www.spec.org/gpc/opc.static/overview.html
226
www.sics.se/people/fredriko
227
www.examsguide.com/scjp/freesimulator.html
228
www.devx.com/DevX/Door/20763
229
www.bejo.c

412
tools.ietf.org/html/rfc1357
413
www.kaon.com/software/swmeson.html
414
www.webring.com/hub?ring=wiznetsitering
415
www.salesforce.com/platform/
416
www.musicent.com/index.html
417
www.codesandciphers.org.uk/lorenz/index.htm
418
tools.ietf.org/html/rfc3167
419
www.legacy.com/obituaries/news-leader/obituary-preview.aspx?n=heather-grootens-thompson&amp;pid=151715903
420
sourceforge.net/projects/ici/
421
support.microsoft.com/default.aspx?scid=support.microsoft.com:80/support/kb/articles/q169/1/73.asp&amp;NoWebContent=1
422
hitmill.com/html/index.html
423
www.cs.cmu.edu/~bam/
424
www.redflymarketing.com/blog/
425
vidar.gimp.org/gimp/
426
barbados.org/chess/
427
www.custard.org/\%7Eandrew/optimize.php
428
listserv.uga.edu/archives/sas-l.html
429
www.aiai.ed.ac.uk/~jeff/lisp/eulisp.html
430
www.sheepyvalley.com/sheep.htm
431
www.codeguru.com/Cpp
432
bricksandbeams.ecrater.com/rss-cat.php?cid=570291
433
www.angelfire.com/electronic/webdesigner/
434
tools.ietf.org/html/rfc2964
435
sites.go

606
fare.tunes.org/LispM.html
607
www.grcdi.nl/links.htm
608
www.gleamtech.com/products/wexlite.asp
609
tools.ietf.org/html/rfc3901
610
www.squid-cache.org/mail-archive/squid-users/hypermail.html
611
www.stonebroom.com/suppxp.htm
612
www.mhonarc.org/~ehood/MIME/MIME.html
613
tools.ietf.org/html/rfc2269
614
math.nist.gov/jazznet/
615
tools.ietf.org/html/rfc2573
616
www.mat.univie.ac.at/~neum/glopt.html
617
tools.ietf.org/html/rfc1644
618
tools.ietf.org/html/rfc2279
619
www.sweetsoftware.co.nz/lua_overview.html
620
www.automatedqa.com/products/aqnet.asp
621
elouai.com/doll-makers.php
622
www.moshesipper.com/pcm/
623
planeta.clix.pt/dominio/01-xuk.htm
624
tools.ietf.org/html/rfc3036
625
www.rpg.net/realm/cyber/
626
tools.ietf.org/html/rfc2071
627
www.dmoz.org/Bookmarks/H/herbridg/
628
news.bbc.co.uk/1/hi/technology/7561943.stm
629
sites.google.com/site/cottonlinters/
630
www.vitanuova.com/inferno/licence.html
631
dreamers.com/jspain/engmain.html
632
www.jguru.com/faq/JDBC/
633
whiteknuckl

803
homepage2.nifty.com/sakazuki/rde_en/
804
exodus.physics.ucla.edu/Fortran95/PSTIResearchLecSeries1.html
805
news.bbc.co.uk/2/hi/technology/2688619.stm
806
smithelec.theshoppe.com/welcmepage.htm
807
botik.ru/pub/local/scp/refal5/
808
projects.gnome.org/gtkglext/
809
www.badgers-in-foil.co.uk/projects/jactionscript/
810
www.gerd-tentler.de/tools/mimemail_perl/
811
developer.apple.com/java/javawebstart/
812
www.arsvcs.demon.co.uk/leisure/abuse/abuse.html
813
www.ecommercetimes.com/perl/story/13397.html
814
www.sis.pitt.edu/~mbsclass/hall_of_fame/
815
kuoi.com/~kamikaze/doc/minsky.html
816
www.ils-international.com/goldmine/cgmhome.htm
817
tools.ietf.org/html/rfc1077
818
www.angelfire.com/scifi/phaserfire/
819
www.leadershipchallenge.com/WileyCDA/
820
www.angelfire.com/country/bco/
821
www.win.tue.nl/~engels/go/index_en.html
822
www.ibm.com/software/awdtools/rmc/
823
www.hadrons.org/~guillem/debian/debtakeover/README
824
www.javacoffeebreak.com/books/extracts/jini/jini.html
825
dillygre

995
www.dopscripts.com/doc/description.html
996
devel.diplom.org/manorcon/
997
www.firstmonday.org/issues/issue4_9/odlyzko/
998
www.webucator.com/webdev/ruby.cfm
999
sourceforge.net/projects/foing/
1000
home.hccnet.nl/a.w.m.van.der.horst/
1001
www.hammerdownman.com/freesoftwaredownloads/
1002
www.google.com/postini/
1003
www.codestyle.org/css/
1004
www.cix.co.uk/~rrussell/
1005
www.yale.edu/chinesemac/index.html
1006
tools.ietf.org/html/rfc1973
1007
members.tripod.com/NieleHawaiianGirl/aloha.html
1008
www.zeitrafferfilme.de/english/index.html
1009
www.telehold.com/voice_prompts.html
1010
www.learnthenet.com/english/section/email.html
1011
en.wikibooks.org/wiki/Logo_Programming
1012
www.cypressfilms.com/index.htm
1013
www.grafxbylisa.com/web/index.html
1014
www.opticompo.com/index2_en.html
1015
caoua.org/midish/
1016
www.cs.utexas.edu/users/ml/neither.html
1017
www.americankeystone.com/intro.htm
1018
www.ehow.com/articles_3362-php-mysql-programming.html
1019
www.webring.com/hub?ring=car

1191
www.chessandpoker.com/checks.html
1192
home.netvigator.com/~hkkcid
1193
groups.yahoo.com/group/Yu-gi-oh_Trading_card_game/
1194
www.apple.com.au/xserve/support.html
1195
math.berkeley.edu/~strain/Codes/
1196
www.perlfect.com/freescripts/dailystats/
1197
alpha.fdu.edu/~levine/reuse_course/java/spider/index.html
1198
news.cnet.com/2100-1023-257077.html
1199
tools.ietf.org/html/rfc2215
1200
www.wideweb.com/phonetrips/
1201
www.people.fas.harvard.edu/~airoldi/
1202
www.stdutility.com/stduviewer.html
1203
education.zdnet.com/?p=1056
1204
www.zdnet.com/news/netscape-acquires-newhoo/101006
1205
www.helsinki.fi/~msiivola/sgl.html
1206
sourceforge.net/projects/perl-gps/
1207
fso.cpasitesolutions.com/dfishercpa/
1208
forums.delphiforums.com/promusic/
1209
pagesperso-orange.fr/stephane.fonlladosa/english_ac.htm
1210
www.ietf.org/html.charters/ippm-charter.html
1211
www.wired.com/science/discoveries/news/2001/03/42581
1212
sabrinarios.tripod.com/sabrinariosphotography/
1213
www.adncomm.com/pi

1386
www.perl.com/pub/a/1999/01/foy.html
1387
static.userland.com/userLandDiscussArchive/msg016342.html
1388
tools.ietf.org/html/rfc1346
1389
home.neopets.com/templates/homepage.phtml?pet_name=Rambi
1390
www.industriesjaro.com/english/index.htm
1391
users.dickinson.edu/~braught/dlife
1392
plan9.bell-labs.com/cm/cs/who/ken/
1393
article.gmane.org/gmane.comp.lib.gnustep.general/26371/
1394
tools.ietf.org/html/rfc513
1395
www.securityfocus.com/infocus/1761
1396
www.jproc.ca/crypto/
1397
mah.everybody.org/hacks/perl/Image-Grab/
1398
www.freewebs.com/duckzland/m200.html
1399
tools.ietf.org/html/rfc2598
1400
www.nongnu.org/avr-libc/
1401
dmoz.org/Regional/Europe/United_Kingdom/faq.html
1402
whatis.techtarget.com/definition/0,289893,sid9_gci213932,00.html
1403
www.eweek.com/cp/bio/Lawrence-Lessig/
1404
blogs.adobe.com/penguin.swf/atom.xml
1405
www.cs.sfu.ca/~kabanets/
1406
tools.ietf.org/html/rfc2609
1407
pagesperso-orange.fr/direct-traduction/
1408
tech.groups.yahoo.com/group/mozart_software

1576
www.express-soft.com/af.html
1577
repec.economics.utoronto.ca/files/UT-ECIPA-MUNRO-02-03.pdf
1578
www2.webmagic.com/abuse.com/
1579
www.enhanceie.com/IE/SearchBuilder.asp
1580
www.dsm.fordham.edu/~ftnchek/
1581
www.bsp-gmbh.com/hercules/index.shtml
1582
tools.ietf.org/html/rfc660
1583
www.jonelo.de/java/nc/
1584
www.netlib.org/pvm3/index.html
1585
www.opengamingfoundation.org/ogl.html
1586
www.wordware.com/computer/delphi.shtml
1587
www.wiley-vch.de/publish/en/journals/alphabeticIndex/2256/
1588
www.kellogg.northwestern.edu/academic/realestate/
1589
pagesperso-orange.fr/dobtrad/
1590
www.theregister.co.uk/2000/12/27/choc_giant_heavies_kinder_charity/
1591
www.fi.muni.cz/~xsvobod2/misc/lapack/
1592
prowl.org/programs/g0pscan/
1593
www.angelfire.com/in/PAdNoctum/
1594
harmful.cat-v.org/software/csh
1595
ibis.nott.ac.uk/guidelines/index.html
1596
members.tripod.com/blostein/
1597
www.sinohotelguide.com/cantonfair/
1598
www.dowcorning.com/content/power/
1599
developers.sun.com/rss/jav

1770
news.cnet.com/2100-1033-214185.html
1771
en.wikipedia.org/wiki/PostScript
1772
news.cnet.com/2100-1023-224454.html
1773
www.ssware.com/megapack.htm
1774
tools.ietf.org/html/rfc1924
1775
hogranch.com/mayer/resume.html
1776
tools.ietf.org/html/rfc2934
1777
www.dmulti.com/products.html
1778
www.crockford.com/javascript/
1779
www.javaworld.com/javaworld/jw-11-2001/jw-1116-dcl_p.html
1780
www.quest.com/litespeed-for-sql-server/
1781
tech.groups.yahoo.com/group/iejug/
1782
www.wizards.com/default.asp?x=ah/prod/acquire
1783
www.cse.yorku.ca/~oz/wily/
1784
tools.ietf.org/html/rfc2689
1785
www.linanwindow.com/qiye/liyao/index.htm
1786
www.pawluh.com/free.html
1787
www.angelfire.com/music3/donlodic/
1788
www.nocrew.org/software/httptunnel.html
1789
tools.ietf.org/html/rfc916
1790
www.asiafinest.com/exclusive/kailayu.htm
1791
www.dcs.gla.ac.uk/~simon/quantum/
1792
www.kawasaki.com/Products/Watercraft.aspx
1793
www.w3.org/Security/Faq/wwwsf4.html#Q36
1794
www.lanl.gov/Caesar/
1795
www.netdemo

1971
medleygoats.homestead.com/index.html
1972
www.wired.com/techbiz/media/news/2002/03/51218
1973
www.tu-chemnitz.de/informatik/RA/cluster2000/
1974
www.aquarianage.org/services/cards/pgs/index.html
1975
sanjose.bizjournals.com/sanjose/stories/1999/05/03/story5.html
1976
www.paganlink.org/downloads/astrology/kastrolog.html
1977
www.davidaugust.com/memory/
1978
www.codeproject.com/vcpp/stl/stlintroduction.asp
1979
shiflett.org/articles/sql-injection
1980
perspolis.usc.edu/Users/zimmerma/mitra/report_5.html
1981
home.neopets.com/templates/homepage.phtml?pet_name=Kiwikku
1982
angelfire.com/games3/battlegamers
1983
www.eg.bucknell.edu/~cs366/occam.pdf
1984
sharkysoft.com/software/java/lava3/printf/
1985
www.seologic.com/faq/meta-tags.php
1986
www.kiraly.at/index.php?lang=en
1987
tools.ietf.org/html/rfc348
1988
www.pythonware.com/products/pil/index.htm
1989
www.guardian.co.uk/media/2001/jan/19/newmedia.marketingandpr
1990
mind.sourceforge.net/prolog.html
1991
java.net/projects/jna
1992
www

2160
www.motobit.com/help/tcpip/
2161
xbox360.ign.com/objects/957/957919.html
2162
www.buyagro.com/justamere/
2163
tech.groups.yahoo.com/group/sandchi/
2164
tools.ietf.org/html/rfc266
2165
weblogs.macromedia.com/mxna/index.cfm?query=bySmartCategory&amp;smartCategoryId=1&amp;smartCategoryName=ColdFusion&amp;smartCategoryKey=D0382F3A-9D2B-69E8-C7BC317066FA1CC2
2166
tools.ietf.org/html/rfc2735
2167
www.giss.nasa.gov/tools/
2168
www.angelfire.com/movies/ashrae/
2169
pynetlibs.sourceforge.net/default.html
2170
www.oxygenxml.com/xquery_debugger.html
2171
journals.ecs.soton.ac.uk/xml4j/xlinkexperience.html
2172
www.watchfire.com/products/appscan/default.aspx
2173
www.delorie.com/gnu/docs/emacs/sc_toc.html
2174
www.bcl.hamilton.ie/~barak/oaklisp/
2175
www.atomenabled.org/atom.xml
2176
sourceforge.net/projects/orp/
2177
www.angelfire.com/de2/newconcepts/index.html
2178
www.nongnu.org/freetalk/
2179
sites.google.com/site/blogsindex/web-directory
2180
pagesperso-orange.fr/yves.candau/numlib.htm
2

2346
my.execpc.com/~gopalan/vjpp/vjpp.html
2347
www.codesynthesis.com/products/xsde/
2348
www.caliburn.nl/topposting.html
2349
www.informatik.uni-kiel.de/~ca/email/check.html
2350
www.itu.int/ITU-T/asn1/xml/
2351
www.pps.jussieu.fr/~jch/software/polipo/
2352
www.ueda.info.waseda.ac.jp/asian97/
2353
www.freewebs.com/myfriendsplace/
2354
www.faqs.org/faqs/software-eng/testing-faq/
2355
fedoraproject.org/wiki/Statistics
2356
www.angelfire.com/in/HSI/
2357
icu.sourceforge.net/docs/papers/cpp_report/an_introduction_to_garbage_collection_part_i.html
2358
tools.ietf.org/html/rfc476
2359
www.quality.co.uk/ecother.htm
2360
www.informatik.uni-trier.de/~ley/db/journals/jlp/
2361
www.skally.net/mead/
2362
www.angelfire.com/biz/consultwoode/
2363
coweb.cc.gatech.edu:8888/squeakbook/uploads/shafer-final.pdf
2364
www.penmachine.com/techie/emailtrouble_2003-07.html
2365
www.straw.com/sig/dyehist.html
2366
www.information-age.com/article/2006/december_2006/master_brewer
2367
tools.ietf.org/html/rfc800


2536
msdn.microsoft.com/en-gb/office/aa905474.aspx
2537
soapclient.com/soaptest.html
2538
internext.co.za/clients/nestor/vhs.htm
2539
java.sun.com/developer/Books/javaprogramming/jnlp/
2540
www.freewebs.com/pupblu92/
2541
www.iona.com/products/orbix/orbix-mainframe.htm
2542
www.ccm.ece.vt.edu/etextiles/
2543
tools.ietf.org/html/rfc1560
2544
idautomation.com/fonts/fontpackage/
2545
www.skrenta.com/tass/
2546
www.atomicsoftware.org.uk/ShopCn.htm
2547
www.ldsfilm.com/lds_cin.html
2548
www.nytimes.com/library/tech/00/03/circuits/articles/23sull.html
2549
www.websupergoo.com/imageglue-1.htm
2550
www.christopher-wolf.de/hfe/
2551
easygestures.mozdev.org/index.html
2552
www.koshka.net/cgiftp.html
2553
www.tristana.org/writer
2554
www.sysandorg.it/crm_eng/
2555
www.techbooksforfree.com/java.shtml
2556
www.cs.york.ac.uk/puml/
2557
sourceforge.net/projects/forth-script/
2558
www.emerson.emory.edu/services/latex/latex_toc.html
2559
www.digitalgothic.net/intro.htm
2560
www.acez.com/mp3.htm
2561
ww

2730
xml.coverpages.org/omobj-dtd.txt
2731
www.wizzy.com/wizzy/transputer.html
2732
www.freewebs.com/onislair/index.htm
2733
www.massena.com/darrin/pilot/pila.htm
2734
home.neopets.com/templates/homepage.phtml?pet_name=golden_hailo
2735
www.bctia.org/awards/
2736
tools.ietf.org/html/rfc739
2737
www.theatlantic.com/unbound/forum/copyright/intro.htm
2738
www.cse.iitd.ernet.in/~ssen
2739
www.datacaltraining.com/training
2740
www.gnu.org/software/tar/
2741
www.adobe.com/products/digitaleditions/
2742
tools.ietf.org/html/rfc1877
2743
www.uic.edu/htbin/cgiwrap/bin/ojs/index.php/fm/
2744
www.tctc.com/~amfuture
2745
www.itpapers.com/search.aspx?&amp;scid=261
2746
www.arclight.net/~pdb/nonfiction/uncanny-valley.html
2747
freesoftwareshop.org/forum/
2748
www.cs.toronto.edu/~revow/
2749
java.sun.com/products/personaljava/
2750
tools.ietf.org/html/rfc2424
2751
tools.ietf.org/html/rfc967
2752
www.abodia.com/photography
2753
www.sacomaine.org/community/history/factoryisland.shtml
2754
www.columbia.e

2929
ebookstore.sony.com/reader/
2930
www.angelfire.com/biz5/USDM/index.html
2931
members.tripod.com/~Paralogy/index.html
2932
www.cs.ucsb.edu/~rsg/
2933
www.photoman.artspb.com/natalia/
2934
www.digilife.be/quickreferences
2935
www.towersperrin.com/reinsurance
2936
www.scottnicholson.com/syracuseboardgamers/
2937
www.esselbach.com/page.php?id=5
2938
exodus.physics.ucla.edu/vizexhibit/Mainpage.html
2939
www.typematrix.com/dvorak/
2940
www.goldshell.com/flashforge/main.htm
2941
explore.live.com/windows-live-movie-maker/
2942
edharpphotos.homestead.com/ehpindex.html
2943
www.erlang.se/workshop/remond.ps
2944
www.kedwards.com/jini/
2945
www.scottnicholson.com/syracuseboardgamers/
2946
tech.groups.yahoo.com/group/delphi-winsock/
2947
tools.ietf.org/html/rfc1473
2948
www.ukexpert.co.uk/photopost/
2949
www2.hull.ac.uk/science/computer_science.aspx
2950
sourceforge.net/projects/enetwizard/
2951
johnbokma.com/perl/
2952
www.wiltsgames.co.uk/6nimmt.html
2953
www.alientrap.org/nexuiz/
2954
petpa

3125
www.bluetetra.com/xsddoc/index.htm
3126
simplythebest.net/scripts/categories.php?cid=2
3127
www.networkworld.com/careers/2004/0412man.html?page=1
3128
en.wikipedia.org/wiki/Marvin_Minsky
3129
tools.ietf.org/html/rfc227
3130
thor.info.uaic.ro/~ispdc/
3131
www.smokehamfarm.com/vVRPages/Sheep/SHETLAND.htm#Shetland
3132
tools.ietf.org/html/rfc2659
3133
www.iamjohnq.com/screensaver.htm
3134
www.fluidhosting.com/traceroute.php
3135
www.g7jjf.com/rpcemu.htm
3136
forums.sun.com/category.jspa?categoryID=132
3137
publibz.boulder.ibm.com/cgi-bin/bookmgr_OS390/BOOKS/E04A2A01/CCONTENTS
3138
codeazur.com.br/stuff/fc64_final/
3139
www.i360hosting.com/default.asp
3140
webdevelopersjournal.com/articles/intro_to_servlets.html
3141
tools.ietf.org/html/rfc2783
3142
www.moonlake.net/clinic/
3143
sourceforge.net/projects/sowibb
3144
www.ibm.com/developerworks/linux/library/l-metaprog2.html
3145
www.embedded.com/story/OEG20011016S0116
3146
openmap.bbn.com/~tomlinso/ray/firstemailframe.html
3147
tools.ie

3309
www.ourblessedhome.com/drgrphx/adoptable/
3310
www.webhostdir.com/guides/basichtml/
3311
www.freewebs.com/scraptown/
3312
code.google.com/p/bnt/
3313
www.automatedqa.com/products/aqtime/
3314
www.addedbytes.com/cheat-sheets/ruby-on-rails-cheat-sheet/
3315
en.wikipedia.org/wiki/Reversible_computing
3316
www.minidisc.org/aes_atrac.html
3317
www.jeffjournal.org/papers/Volume1/06Aug_SlidingFriction.pdf
3318
www.aa1car.com/carleyware/
3319
sourceforge.net/projects/vscm/
3320
www.roble.com/docs/fw1_or_pix.html
3321
www.gginc.biz/fort.html
3322
www.eecs.ucf.edu/~leavens/JML/
3323
www.javaworld.com/news-reviews/jw-nr-book-reviews.html
3324
www.geckotribe.com/rss/
3325
www.spine-health.com/clinical-trials
3326
www.contentpurity.com/scanintro.htm
3327
www.intersectalliance.com/projects/index.html
3328
home.clara.net/orac/os2/rexx.htm
3329
www.webreference.com/perl/tutorial/rss1/
3330
www.pcmag.com/category2/0,1738,7488,00.asp
3331
www.ccm.ece.vt.edu/papers/edmison_2003_NGWSH03_etextile.pdf


3501
www.tropical-music.com/indexgo.html
3502
www.tek-tips.com/threadminder.cfm?pid=195
3503
homepages.uc.edu/~hollisgf/nakedmud.html
3504
singapore_aberdeen.tripod.com/mackenzie.html
3505
artist4hire.net/Image.html
3506
www.debian.org/ports/m68k/
3507
news.bbc.co.uk/1/hi/entertainment/showbiz/2713133.stm
3508
clans.gameclubcentral.com/shoot/
3509
tools.ietf.org/html/rfc1595
3510
formale-sprachen.informatik.uni-oldenburg.de/persons/karl-heinz.pennemann/
3511
www.redbooks.ibm.com/abstracts/sg244986.html
3512
sourceforge.net/projects/rxvt
3513
www.angelfire.com/in2/wolfsong/index.html
3514
tools.ietf.org/html/rfc1206
3515
assignments.uspto.gov/assignments/q?db=pat&amp;pat=6058214
3516
www.iana.org/domains/root/db/
3517
pw.vsb.bc.ca/trek/
3518
jude.change-vision.com/jude-web/index.html
3519
www.cours.polymtl.ca/roboop/
3520
www.angelfire.com/fl5/cfarmsllc/
3521
www.jitbit.com/rssfeedcreator.aspx
3522
may.cs.ucla.edu/projects/maisie/
3523
tools.ietf.org/html/rfc1783
3524
www.dejavusoftware

3692
groups.yahoo.com/group/PCGF/
3693
www.zieglersoft.dk/public/zieglercollection.asp
3694
www3.sympatico.ca/craig.kadoke/true/
3695
www.storefront.com/photo_kiosk.html
3696
library.gnome.org/devel/gtk/
3697
www.freescripts.com/ASP/scripts.html
3698
tools.ietf.org/html/rfc338
3699
anfi.homeunix.net/sendmail/
3700
www.k5n.us/Ilib.php
3701
static.userland.com/userLandDiscussArchive/msg022178.html
3702
faculty.washington.edu/wcalvin/
3703
www.westwindcos.com/familyhistory/
3704
slashdot.org/developers/01/07/08/1955209.shtml
3705
www.oswego.edu/academics/colleges_and_departments/departments/computer_science/
3706
search.cpan.org/~nkh/
3707
www.pandia.com/sw-2001/44-zeal.html
3708
tools.ietf.org/html/rfc1777
3709
www.21stcentury.co.uk/robotics/
3710
home.neopets.com/templates/homepage.phtml?pet_name=Hidden_Dream
3711
www.bitsavers.org/pdf/ibm/360/pls/GC28-6794-0_PLSIIguideMay74.pdf
3712
www.research.att.com/~bs/bs_faq2.html
3713
www.hollandnumerics.demon.co.uk/ROYSTON.HTM
3714
hem.passagen

3889
en.wikipedia.org/wiki/Programming_style
3890
www.javaworld.com/javaworld/jw-02-1999/jw-02-toolbox_p.html
3891
www.developer.com/net/vb/article.php/1540311
3892
www.marinelistingservice.com/colorblindtest/
3893
www.aspupload.com/aspjpeg.html
3894
www.apacheweek.com/features/dynamicpages
3895
www.m-kagaku.co.jp/index_en.htm
3896
www.pearsonhighered.com/educator/academic/product/0,,0201136880,00\%2Ben-USS_01DBC.html
3897
www.zdnet.com/news/netscape-acquires-newhoo/101006
3898
www.csse.monash.edu.au/~debbiep/billabong/rexpress.html
3899
tools.ietf.org/html/rfc3069
3900
www.nanotech-now.com/nanotechnology-glossary-A-C.htm
3901
www.cchem.berkeley.edu/jsngrp/fortran.html
3902
www.webterrace.com/graphics/index.htm
3903
tools.ietf.org/html/rfc2998
3904
atrey.karlin.mff.cuni.cz/~vojtech/joystick/
3905
www.ics.uci.edu/~franz/
3906
kernel.org/pub/linux/kernel/people/paulmck/perfbook/perfbook.html
3907
www.shimonsandler.com/web-directories-interview-with-greg-hartnett/
3908
www.apromotionguide

4074
www.cs.rpi.edu/~musser/stl-bigger-example.html
4075
www.angelfire.com/oh4/810/index.html
4076
grc.com/su-firewalls.htm
4077
www.sybase.com/support/techdocs/
4078
www.pctools.com/forum/
4079
www.pearsonhighered.com/educator/academic/product/0,,0130260363,00\%2Ben-USS_01DBC.html
4080
www.serpik.com/alaunch/
4081
www.renesas.com/products/mpumcu/superh/superh_landing.jsp
4082
tech.groups.yahoo.com/group/wsdl/
4083
tools.ietf.org/html/rfc652
4084
ps2.gamespy.com/playstation-2/reservoir-dogs/
4085
iptables-tutorial.frozentux.net/iptables-tutorial.html
4086
people.cs.kuleuven.be/~dirk.craeynest/ada-belgium/success/success.html
4087
www.royal.gov.uk/LatestNewsandDiary/Pressreleases/2002/50factsaboutTheDukeofEdinburgh.aspx
4088
web.me.com/hiangle/
4089
tools.ietf.org/html/rfc1778
4090
www.kirchgessner.net/gimp.html
4091
www.alhem.net/Sockets/
4092
damiendebin.net/archives/zzip/
4093
tools.ietf.org/html/rfc1915
4094
www.lrv.fri.uni-lj.si/facedb.html
4095
www.magellass.com/prod-wb.html
4096


4265
www.gcn.com/online/vol1_no1/22933-1.html?topic=daily-updates
4266
www.kinesis-ergo.com/advantage.htm
4267
members.tripod.com/fbaryol/homepage.html
4268
www.redbrick.dcu.ie/~cortex/
4269
tools.ietf.org/html/rfc1873
4270
www.powerquest.com/partitionmagic/index.html
4271
tools.ietf.org/html/rfc2685
4272
pub.cozmixng.org/~the-rwiki/
4273
www.ddj.com/architect/184415220/
4274
www.opencrypt.com/?ab01
4275
budgetvideo.homestead.com/mainpage.html
4276
www.eltima.com/products/showkit2/
4277
tools.ietf.org/html/rfc2001
4278
web.singnet.com.sg/~cbcpl/
4279
www.oreilly.com/catalog/9780596527310/
4280
www.caliban.org/ruby/ruby-google.shtml
4281
tools.ietf.org/html/rfc2057
4282
my.execpc.com/~absinthe/design.htm
4283
members.tripod.com/gsraj/misc/ejbmts/ejbmtscomp.html
4284
www.users.on.net/~geosurveys/
4285
www.helsinki.fi/iehc2006/papers2/Thomson.pdf
4286
www.cse.unsw.edu.au/~lxue/
4287
www.ssi-developer.net/main/templates/
4288
merlot.stat.uconn.edu/~nalini/programs.html
4289
tools.ietf.org/

In [10]:
#Converting the list to dataframe

legitimate = pd.DataFrame(legit_features, columns= feature_names)
legitimate.head()

Unnamed: 0,domain,ip_present,at_present,url_length,url_depth,redirection,https_domain,short_url,prefix/suffix,dns_record,web_traffic,domain_age,domain_end,dot_count,specialchar_count,subdom_count,label
0,,0,0,0,2,0,0,0,0,0,1,1,1,0,2,0,0
1,,0,0,0,2,0,0,0,0,0,1,1,1,0,3,1,0
2,,0,0,0,2,0,0,0,0,0,1,1,1,1,2,1,0
3,,0,0,0,4,0,0,0,0,0,1,1,1,0,4,0,0
4,,0,0,1,5,0,0,0,0,0,0,1,1,1,4,0,0


In [13]:
# Storing the extracted legitimate URLs fatures to csv file

legitimate.to_csv('/Users/jillkathleen/Desktop/Phishing-Analysis-Detection/Back-End/Extracted CSVs/from FeatureExtraction/Data-2/Data-2-benign-extracted.csv', index= False)

