# Phishing URL Detection


## Modules and Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  
import time 

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from nltk.stem.snowball import SnowballStemmer

from PIL import Image
import joblib
import pickle

In [None]:
# Load the dataset; later cells read its 'URL' and 'Label' columns.
data = pd.read_csv("phishing_site_urls.csv")

## Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Show the (rows, columns) dimensions of the DataFrame.
data.shape

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Print pandas' summary of the DataFrame.
data.info()

In [None]:
def polaritas(teks):
    """Encode a textual label as an integer class.

    Args:
        teks: The label value to encode.

    Returns:
        1 when ``teks`` equals ``'good'``, 0 when it equals ``'bad'``,
        and ``None`` for any other value.
    """
    if teks == 'good':
        return 1
    if teks == 'bad':
        return 0
    # Anything else is left unencoded.
    return None

In [None]:
# Add an integer-encoded copy of the 'Label' column (see polaritas).
data['label_enc'] = data['Label'].apply(polaritas)

In [None]:
# Report the dataset size and the number of rows in each class.
# polaritas() encodes 'good' as 1 and 'bad' as 0, so the filters must use
# exactly those values for the printed counts to match their captions.
print('Total Jumlah url:', data.shape[0], 'data\n')
print('terdiri dari (label):')
print('-- [1] Good\t\t:', data[data.label_enc == 1].shape[0], 'data')
print('-- [0] Bad\t\t:', data[data.label_enc == 0].shape[0], 'data\n')

In [None]:
# Plot how many rows carry each value of 'Label'.
sns.countplot(x='Label',data=data)

## Tokenisasi

In [None]:
from nltk.tokenize import RegexpTokenizer
# The pattern matches runs of ASCII letters, so digits and punctuation are
# not part of any token.
tokenizer = RegexpTokenizer(r'[A-Za-z]+')

In [None]:
tokenizer.tokenize(data.URL[0]) # tokenize the first URL in the dataset

In [None]:
# Tokenize every URL in the dataset and report the elapsed time.
print('Getting words tokenized ...')
t0 = time.perf_counter()
# The tokenizer's bound method is handed to map(), which calls it per URL.
data['text_tokenized'] = data['URL'].map(tokenizer.tokenize)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Show five randomly chosen rows to inspect the new column.
data.sample(5)

## Stemming

In [None]:
# Snowball stemmer configured for English.
stemmer = SnowballStemmer("english") # choose a language

In [None]:
# Stem every token of every URL and report the elapsed time.
print('Getting words stemmed ...')
t0 = time.perf_counter()
data['text_stemmed'] = data['text_tokenized'].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Show five randomly chosen rows to inspect the stemmed tokens.
data.sample(5)

In [None]:
# Join each row's stemmed tokens into one space-separated string
# and report the elapsed time.
print('Get joiningwords ...')
t0 = time.perf_counter()
data['text_sent'] = data['text_stemmed'].map(' '.join)
t1 = time.perf_counter() - t0
print('Time taken', t1, 'sec')

In [None]:
# Separate the rows labelled 'bad' from the rows labelled 'good'.
bad_sites = data.loc[data['Label'] == 'bad']
good_sites = data.loc[data['Label'] == 'good']

In [None]:
# Preview the first rows labelled 'bad'.
bad_sites.head()

In [None]:
# Preview the first rows labelled 'good'.
good_sites.head()

In [None]:
# Preview the dataset with the tokenized, stemmed and joined columns added.
data.head()

## Ekstraksi fitur menjadi sebuah matrix

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Vectorizer created with scikit-learn's default settings.
cv = CountVectorizer()

In [None]:
# Fit the vectorizer on the joined, stemmed text and transform it in one step.
feature = cv.fit_transform(data['text_sent'])

In [None]:
# Only the first five rows are converted, just to display them.
feature[:5].toarray() # convert sparse matrix into array to print transformed features

In [None]:
# Persist the extracted feature matrix to disk.
with open('feature.pickle', 'wb') as output:
  pickle.dump(feature, output)

In [None]:
from sklearn.model_selection import train_test_split

# Split the feature matrix and the encoded labels into train and test parts.
# No random_state is passed, so the split is not seeded here.
trainX, testX, trainY, testY = train_test_split(feature, data['label_enc'])

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Create the logistic-regression estimator with its default settings
lr = LogisticRegression()

# Train the model on the training split
lr.fit(trainX,trainY)

In [None]:
# Score the logistic-regression model on the held-out test split.
lr.score(testX,testY)

In [None]:
# Test scores keyed by model name, rounded to two decimals.
Scores_ml = {'Logistic Regression': np.round(lr.score(testX, testY), 2)}

In [None]:
# Evaluate the logistic-regression model: scores, per-class report and
# confusion matrix.
print('Training Accuracy :',lr.score(trainX,trainY))
print('Testing Accuracy :',lr.score(testX,testY))

# Predict once and reuse the result for both the matrix and the report.
lr_test_pred = lr.predict(testX)

# scikit-learn expects (y_true, y_pred): rows are then the actual classes and
# columns the predicted ones, which is what the index/columns labels claim.
con_mat = pd.DataFrame(confusion_matrix(testY, lr_test_pred),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
# Same (y_true, y_pred) order, otherwise precision and recall are swapped.
print(classification_report(testY, lr_test_pred,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")

## Multinomial NB


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Create the multinomial naive Bayes estimator with its default settings
mnb = MultinomialNB()

# Fit/train the mnb model on the training split
mnb.fit(trainX,trainY)

In [None]:
# Score the naive Bayes model on the held-out test split.
mnb.score(testX,testY)

In [None]:
# Record the naive Bayes test score, rounded to two decimals.
Scores_ml['MultinomialNB'] = np.round(mnb.score(testX,testY),2)

In [None]:
# Evaluate the multinomial naive Bayes model: scores, per-class report and
# confusion matrix.
print('Training Accuracy :',mnb.score(trainX,trainY))
print('Testing Accuracy :',mnb.score(testX,testY))

# Predict once and reuse the result for both the matrix and the report.
mnb_test_pred = mnb.predict(testX)

# scikit-learn expects (y_true, y_pred): rows are then the actual classes and
# columns the predicted ones, which is what the index/columns labels claim.
con_mat = pd.DataFrame(confusion_matrix(testY, mnb_test_pred),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
# Same (y_true, y_pred) order, otherwise precision and recall are swapped.
print(classification_report(testY, mnb_test_pred,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")

### Logistic Regression is the best-fitting model, so we now build an sklearn pipeline with it

In [None]:
# make_pipeline must be imported here; nothing else in the notebook does it.
from sklearn.pipeline import make_pipeline

# Chain vectorization and classification so the model accepts raw URL strings.
# The vectorizer tokenizes on runs of ASCII letters and is given the built-in
# English stop-word list.
pipeline_ls = make_pipeline(
    CountVectorizer(tokenizer=RegexpTokenizer(r'[A-Za-z]+').tokenize, stop_words='english'),
    LogisticRegression(),
)

In [None]:
# Re-split on the raw inputs: the split names are rebound to the 'URL' strings
# and the textual 'Label' values instead of the vectorized features.
trainX, testX, trainY, testY = train_test_split(data['URL'], data['Label'])

In [None]:
# Fit the whole pipeline on the training split.
pipeline_ls.fit(trainX,trainY)

In [None]:
# Score the fitted pipeline on the held-out test split.
pipeline_ls.score(testX,testY)

In [None]:
# Evaluate the pipeline: scores, per-class report and confusion matrix.
print('Training Accuracy :',pipeline_ls.score(trainX,trainY))
print('Testing Accuracy :',pipeline_ls.score(testX,testY))

# Predict once and reuse the result for both the matrix and the report.
pipeline_test_pred = pipeline_ls.predict(testX)

# scikit-learn expects (y_true, y_pred): rows are then the actual classes and
# columns the predicted ones, which is what the index/columns labels claim.
con_mat = pd.DataFrame(confusion_matrix(testY, pipeline_test_pred),
            columns = ['Predicted:Bad', 'Predicted:Good'],
            index = ['Actual:Bad', 'Actual:Good'])


print('\nCLASSIFICATION REPORT\n')
# Same (y_true, y_pred) order, otherwise precision and recall are swapped.
print(classification_report(testY, pipeline_test_pred,
                            target_names =['Bad','Good']))

print('\nCONFUSION MATRIX')
plt.figure(figsize= (6,4))
sns.heatmap(con_mat, annot = True,fmt='d',cmap="YlGnBu")

In [None]:
# Persist the fitted pipeline to disk.
joblib.dump(pipeline_ls, "model_phishing_lr.model")

In [None]:
# Reload the persisted pipeline and score it on the current test split.
# joblib is already imported at the top of the notebook, so joblib.load is
# used rather than a bare `load` that is not in scope at this point.
loaded_model = joblib.load('model_phishing_lr.model')
result = loaded_model.score(testX,testY)
print(result)

In [None]:
# Save the vectorizer and the model to separate files.
# NOTE(review): the names this cell used before (`X_transformed`, `lr_model`)
# are not defined anywhere in the notebook; the objects below are the closest
# fitted equivalents - confirm they are the intended ones.

# make_pipeline names each step after its lowercased class name.
fitted_vectorizer = pipeline_ls.named_steps['countvectorizer']
with open('vectorizer.pickle', 'wb') as vectorizer_file:
    pickle.dump(fitted_vectorizer, vectorizer_file)

# The prediction cells load this path and call predict() on raw URL strings,
# so the file has to hold the full pipeline rather than a bare classifier.
joblib.dump(pipeline_ls, 'model_phishing_lr.model')

## Try to predict

In [None]:
# Sample URLs for a quick check of the saved model.
predict_bad = [
    'yeniik.com.tr/wp-admin/js/login.alibaba.com/login.jsp.php',
    'fazan-pacir.rs/temp/libraries/ipad',
    'tubemoviez.exe',
    'svision-online.de/mgfi/administrator/components/com_babackup/classes/fx29id1.txt',
]
predict_good = [
    'youtube.com/',
    'www.kaggle.com/code/ashishkumarbehera/phishing-site-prediction',
    'retailhellunderground.com/',
    'restorevisioncenters.com/html/technology.html',
]
loaded_model = joblib.load('model_phishing_lr.model')

# The pipeline saved under this path vectorizes its own input, so the raw URL
# strings are passed directly; no separate preprocessing step is needed.
result = loaded_model.predict(predict_bad)
result2 = loaded_model.predict(predict_good)

print(result)
print("*"*30)
print(result2)

In [None]:
from joblib import load
loaded_model = load('model_phishing_lr.model')

# Reset to empty lists; neither is used by the loop below.
predict_bad = []
predict_good = []


# Classify URLs typed by the user until an empty line is entered.
while True:
    # Surrounding whitespace is dropped so a blank-looking line also exits.
    url = input("Masukkan URL (atau tekan Enter untuk keluar): ").strip()
    if not url:
        break

    # predict() expects a sequence of documents, hence the one-element list.
    result = loaded_model.predict([url])
    print(result)