Phishing URL Detector🚨

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt

In [3]:
data= pd.read_csv("phishing_website_dataset.csv")

data.info()

# None of the feature is having null values😭 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188637 entries, 0 to 188636
Data columns (total 56 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   FILENAME                    188637 non-null  object 
 1   URL                         188637 non-null  object 
 2   URLLength                   188637 non-null  int64  
 3   Domain                      188637 non-null  object 
 4   DomainLength                188637 non-null  int64  
 5   IsDomainIP                  188637 non-null  int64  
 6   TLD                         188637 non-null  object 
 7   URLSimilarityIndex          188637 non-null  float64
 8   CharContinuationRate        188637 non-null  float64
 9   TLDLegitimateProb           188637 non-null  float64
 10  URLCharProb                 188637 non-null  float64
 11  TLDLength                   188637 non-null  int64  
 12  NoOfSubDomain               188637 non-null  int64  
 13  HasObfuscation

In [4]:
data.head()

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,...,1,0,1,244,15,34,72,1,85,1


In [5]:
data=data.dropna() 

#remove data if null exists
 
data = data.select_dtypes(include=[np.number])

data.info() 
# Drop non-numeric columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188637 entries, 0 to 188636
Data columns (total 51 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   URLLength                   188637 non-null  int64  
 1   DomainLength                188637 non-null  int64  
 2   IsDomainIP                  188637 non-null  int64  
 3   URLSimilarityIndex          188637 non-null  float64
 4   CharContinuationRate        188637 non-null  float64
 5   TLDLegitimateProb           188637 non-null  float64
 6   URLCharProb                 188637 non-null  float64
 7   TLDLength                   188637 non-null  int64  
 8   NoOfSubDomain               188637 non-null  int64  
 9   HasObfuscation              188637 non-null  int64  
 10  NoOfObfuscatedChar          188637 non-null  int64  
 11  ObfuscationRatio            188637 non-null  float64
 12  NoOfLettersInURL            188637 non-null  int64  
 13  LetterRatioInU

In [6]:
data["IsDomainIP"].value_counts()

#now we need to train the model using stratified train test split as we dont want the model to train 
#only on zeroes of the IsDomainIP feature🗿

IsDomainIP
0    188168
1       469
Name: count, dtype: int64

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit
Ssplit = StratifiedShuffleSplit(n_splits=1,random_state=42,test_size=0.2)
for trainIndex,testIndex in Ssplit.split(data,data["IsDomainIP"]):
    train=data.loc[trainIndex]
    test=data.loc[testIndex]

train["IsDomainIP"].value_counts()

#Now everything is fine😎


IsDomainIP
0    150534
1       375
Name: count, dtype: int64

In [8]:
matrix=data.corr()
matrix["label"].sort_values()
#correlation of each feature wrt labels💀

#|value|>0.5 is considered that the label depends highly on that feature 

SpacialCharRatioInURL        -0.534179
DegitRatioInURL              -0.436415
NoOfOtherSpecialCharsInURL   -0.377710
LetterRatioInURL             -0.367976
DomainLength                 -0.285115
NoOfLettersInURL             -0.258596
URLLength                    -0.238626
NoOfDegitsInURL              -0.188991
NoOfQMarkInURL               -0.177495
NoOfEqualsInURL              -0.081870
TLDLength                    -0.078300
NoOfSelfRedirect             -0.075621
IsDomainIP                   -0.059873
HasObfuscation               -0.053596
NoOfURLRedirect              -0.045521
ObfuscationRatio             -0.042643
LargestLineLength            -0.041393
NoOfAmpersandInURL           -0.035685
NoOfObfuscatedChar           -0.016223
NoOfSubDomain                -0.004339
NoOfPopup                     0.046579
TLDLegitimateProb             0.096377
Crypto                        0.096804
NoOfEmptyRef                  0.103988
HasPasswordField              0.135333
HasExternalFormSubmit    

In [9]:
X_train=train.iloc[:,:-1].values
Y_train=train.iloc[:,-1:].values

X_test=test.iloc[:,:-1].values
Y_test=test.iloc[:,-1:].values

#Splitting X and Y


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model=LogisticRegression()
model.fit(X_train,Y_train)

y_pred=model.predict(X_test)

#training the model and testing it 

accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

#Imagine getting accuracy and F1 score 1🗿

  y = column_or_1d(y, warn=True)


Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1 Score: 1.00


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
