# Detecting Phishing URLs

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer # create sparse matrix of words using regexptokenizes  
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report #gives whole info about matrices
from sklearn.metrics import confusion_matrix #gives diff between actual and predict
from sklearn.pipeline import make_pipeline # use for combining all prerocessors techniuqes and algos
from nltk.tokenize import RegexpTokenizer # regexp tokenizers use to split words from text  
from nltk.stem.snowball import SnowballStemmer # stemmes words
import warnings 
warnings.filterwarnings('ignore')

In [47]:
#loading dataset
phishing_urls = pd.read_csv('data/phishing_site_urls.csv')
phishing_urls.sample(5)

Unnamed: 0,URL,Label
492691,93.183.155.22/limto1.tar,bad
466587,worthpoint.com/worthopedia/harold-lloyd-bebe-d...,good
434882,spiderbytes.com/ambientrance/deahs-gw.htm,good
231609,profiles.lawyersdb.com/florida-venice/1099032-...,good
211329,markspoelstra.net/bio.html,good


In [48]:
phishing_urls.shape

(549346, 2)

In [49]:
phishing_urls.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549346 entries, 0 to 549345
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   URL     549346 non-null  object
 1   Label   549346 non-null  object
dtypes: object(2)
memory usage: 8.4+ MB


In [50]:
label_counts = pd.DataFrame(phishing_urls.Label.value_counts())
label_counts

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
good,392924
bad,156422


In [51]:
# RegexpTokenizer
#It is used in NLP for dividing paragraph into sentences and sentences to words

tokenizer = RegexpTokenizer(r'[A-Za-z]+')
phishing_urls['URL'][0]

'nobell.it/70ffb52d079109dca5664cce6f317373782/login.SkyPe.com/en/cgi-bin/verification/login/70ffb52d079109dca5664cce6f317373/index.php?cmd=_profile-ach&outdated_page_tmpl=p/gen/failed-to-load&nav=0.5.1&login_access=1322408526'

In [52]:
# this will be pull letter which matches to expression
tokenizer.tokenize(phishing_urls['URL'][0])

['nobell',
 'it',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'login',
 'SkyPe',
 'com',
 'en',
 'cgi',
 'bin',
 'verification',
 'login',
 'ffb',
 'd',
 'dca',
 'cce',
 'f',
 'index',
 'php',
 'cmd',
 'profile',
 'ach',
 'outdated',
 'page',
 'tmpl',
 'p',
 'gen',
 'failed',
 'to',
 'load',
 'nav',
 'login',
 'access']

In [53]:
phishing_urls['text_tokenized']=phishing_urls.URL.map(lambda t: tokenizer.tokenize(t))

In [54]:
phishing_urls.sample(5)

Unnamed: 0,URL,Label,text_tokenized
353948,hometownusa.com/ut/Riverdale.html,good,"[hometownusa, com, ut, Riverdale, html]"
537065,bestsourcecode.com/fm9wn7,bad,"[bestsourcecode, com, fm, wn]"
494097,interface.xyzs.com/,bad,"[interface, xyzs, com]"
210216,m.espn.go.com/nfl/story?storyId=7269318&e=RAD,good,"[m, espn, go, com, nfl, story, storyId, e, RAD]"
109631,portatilandaimes.com.br/chuky/inbox.htm,bad,"[portatilandaimes, com, br, chuky, inbox, htm]"


In [55]:
# Snowball is a small string processing language, gives root words
# root words means ex. University,universal,universestar here universe is root word
# and also removes of,and,is,was,are,I
#snow ball is package for give root word for multiple language
stemmer = SnowballStemmer("english") 

In [56]:
phishing_urls['text_stemmed'] = phishing_urls['text_tokenized'].map(lambda l: [stemmer.stem(word) for word in l])

In [57]:
phishing_urls.sample(5)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed
46359,habbofontesangrenta.rel7.com/,bad,"[habbofontesangrenta, rel, com]","[habbofontesangrenta, rel, com]"
214267,mobiletechnology.dowjones.com/,good,"[mobiletechnology, dowjones, com]","[mobiletechnolog, dowjon, com]"
47177,www.hongtongsoft.com.cn/images/index.htm?us.ba...,bad,"[www, hongtongsoft, com, cn, images, index, ht...","[www, hongtongsoft, com, cn, imag, index, htm,..."
426164,rubiconpress.org/books,good,"[rubiconpress, org, books]","[rubiconpress, org, book]"
403563,newadvent.org/cathen/11128a.htm,good,"[newadvent, org, cathen, a, htm]","[newadv, org, cathen, a, htm]"


In [58]:
#joining stemmed words
phishing_urls['text_from_url'] = phishing_urls['text_stemmed'].map(lambda i:' '.join(i))

In [59]:
phishing_urls.sample(5)

Unnamed: 0,URL,Label,text_tokenized,text_stemmed,text_from_url
319744,education.com/schoolfinder/us/virginia/staffor...,good,"[education, com, schoolfinder, us, virginia, s...","[educ, com, schoolfind, us, virginia, stafford...",educ com schoolfind us virginia stafford grace...
351017,heraldscotland.com/sport/spl/aberdeen/glasgow-...,good,"[heraldscotland, com, sport, spl, aberdeen, gl...","[heraldscotland, com, sport, spl, aberdeen, gl...",heraldscotland com sport spl aberdeen glasgow ...
137121,slidcorabia.ro/administrator/includes/trulia/i...,bad,"[slidcorabia, ro, administrator, includes, tru...","[slidcorabia, ro, administr, includ, trulia, i...",slidcorabia ro administr includ trulia index html
477173,youtube.com/watch?v=mZk3fOzKpQ4,good,"[youtube, com, watch, v, mZk, fOzKpQ]","[youtub, com, watch, v, mzk, fozkpq]",youtub com watch v mzk fozkpq
212408,meds-zone.com/medic/drug/SUPER_C,good,"[meds, zone, com, medic, drug, SUPER, C]","[med, zone, com, medic, drug, super, c]",med zone com medic drug super c


In [60]:
#here our words are mostly texts we have to convert it into numbers for that we have to convert it using
# count vectorizer - Convert a collection of text documents to a matrix of token counts
cv = CountVectorizer()

In [61]:
feature = cv.fit_transform(phishing_urls.text_from_url)
feature

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 3676066 stored elements and shape (549346, 350837)>

In [62]:
feature[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [63]:
#training model
X_train,X_test,y_train,y_test = train_test_split(feature,phishing_urls.Label)

In [64]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

In [65]:
report = classification_report(lr.predict(X_test), y_test,
                            target_names =['Bad','Good'])
print(report)

              precision    recall  f1-score   support

         Bad       0.91      0.97      0.94     36474
        Good       0.99      0.96      0.98    100863

    accuracy                           0.96    137337
   macro avg       0.95      0.97      0.96    137337
weighted avg       0.97      0.96      0.96    137337

