## Effect of Model Size

In [None]:
# PHISHING DETECTION USING MACHINE LEARNING
# By Ifediora Okolo

# Import the libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from tensorflow import keras
from keras.models import Sequential
from keras import Input
from keras.layers import Dense
import numpy as np # linear algebra
import pandas as pd # data processing

# Read the data

In [None]:
# Load the URL dataset from the working directory.
df = pd.read_csv("phishing_site_urls.csv")
# Print (rows, columns), then show the first rows as the cell output.
print(df.shape)
df.head()

In [None]:
# Class balance of the full dataset: number of rows per Label value.
df["Label"].value_counts()

In [None]:
# Bar chart of the number of rows per Label value.
sns.countplot(x="Label", data=df)
plt.show()

# Shuffle the data

In [None]:
# Shuffle the rows with a fixed seed (42) so the resulting order is the same on every run.
df_shuffled = shuffle(df, random_state=42)

### Choose 10,000 samples

In [None]:
# Number of shuffled rows kept for the experiments.
data_size = 10_000

In [None]:
# Take the first `data_size` shuffled rows as an independent copy, so later
# edits do not write back into df_shuffled.
df_used = df_shuffled.iloc[:data_size].copy()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the working subset.
df_used.info()

# Replace the labels

In [None]:
# Encode the target as integers: 'good' -> 0, 'bad' -> 1.
# Only the Label column is mapped, so a URL whose text happens to equal
# 'good' or 'bad' is never rewritten. The explicit cast guarantees an integer
# dtype instead of relying on replace() silently downcasting an object column
# (deprecated in pandas 2.2). 0 and 1 map to themselves so re-running the cell
# is harmless; any other label becomes NaN and makes the cast fail loudly.
df_used['Label'] = df_used['Label'].map({'good': 0, 'bad': 1, 0: 0, 1: 1}).astype('int64')

In [None]:
# Class balance of the working subset after label encoding.
df_used["Label"].value_counts()

# Divide the data into features and labels

In [None]:
# X stays a one-column DataFrame (prepare_data adds columns to it);
# y is the encoded label Series. Both are copies, independent of df_used.
X = df_used.loc[:, ['URL']].copy()
y = df_used['Label'].copy()

# Initialize the tokenizer, stemmer, and vectorizer

In [None]:
# Shared text-processing objects used by prepare_data():
# - tokenizer keeps only runs of ASCII letters (digits and punctuation act as separators),
# - stemmer is the English Snowball stemmer,
# - cv turns the stemmed text into token-count features.
tokenizer = RegexpTokenizer(pattern=r'[A-Za-z]+')
stemmer = SnowballStemmer(language="english")
cv = CountVectorizer()

# Prepare the data

In [None]:
def prepare_data(X):
    """Tokenize, stem and vectorize the ``URL`` column of *X*.

    Three columns are added to *X* in place: ``text_tokenized`` (list of
    alphabetic tokens), ``text_stemmed`` (list of stemmed tokens) and
    ``text_sent`` (stemmed tokens joined by single spaces). The module-level
    ``cv`` vectorizer is then fitted on ``text_sent``.

    Returns the (mutated) frame and the feature matrix produced by
    ``cv.fit_transform``.
    """
    def _stem_all(tokens):
        return [stemmer.stem(token) for token in tokens]

    X['text_tokenized'] = X.URL.map(tokenizer.tokenize)
    X['text_stemmed'] = X.text_tokenized.map(_stem_all)
    X['text_sent'] = X.text_stemmed.map(' '.join)
    return X, cv.fit_transform(X.text_sent)

In [None]:
# Adds the intermediate text columns to X and builds the token-count feature
# matrix; this also fits the shared `cv` vectorizer on the subset.
X, features = prepare_data(X)

# Import and initialize the models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# One estimator per model family, otherwise with library-default settings.
# The two randomized ensembles get the same fixed seed (42) already used for
# the shuffle and the train/test split, so repeated runs of the notebook
# report the same metrics.
logreg = LogisticRegression()
knn = KNeighborsClassifier()
ad_c = AdaBoostClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
svc = SVC()

# Train and test the models using different training percentages

In [None]:
def train_test_model(model, X, y, training_percentage):
    """Fit *model* on a stratified training split and score it on the remainder.

    Parameters:
        model: estimator exposing ``fit`` and ``predict``; it is refitted on
            every call.
        X: feature matrix accepted by ``train_test_split``.
        y: labels; also used for stratification. Label ``1`` is the positive
            class for precision and recall.
        training_percentage: fraction of the samples, strictly between 0 and 1,
            used for training.

    Returns:
        Tuple ``(accuracy, precision, recall)`` measured on the held-out part.
    """
    # Hand the fraction over as train_size instead of test_size = 1 - fraction:
    # the subtraction is inexact in floating point (1 - 0.7 == 0.30000000000000004)
    # and the test size is rounded up, so the split could be off by one sample.
    trainX, testX, trainY, testY = train_test_split(
        X, y, train_size=training_percentage, stratify=y, random_state=42
    )
    model.fit(trainX, trainY)
    predY = model.predict(testX)
    accuracy = accuracy_score(testY, predY)
    precision = precision_score(testY, predY, pos_label=1)
    recall = recall_score(testY, predY, pos_label=1)
    return accuracy, precision, recall

In [None]:
# Training fractions to evaluate: 0.1, 0.2, ..., 0.9.
training_sizes = np.arange(1, 10) / 10

In [None]:
def model_results(model):
    """Evaluate *model* at every fraction in ``training_sizes``.

    Uses the module-level ``features`` and ``y``. Returns a DataFrame with one
    row per training fraction (in ``training_sizes`` order) and the columns
    ``Accuracy``, ``Precision`` and ``Recall``.
    """
    scores = [
        train_test_model(model, features, y, fraction)
        for fraction in training_sizes
    ]
    return pd.DataFrame(scores, columns=['Accuracy', 'Precision', 'Recall'])

In [None]:
# Evaluate each estimator at every training fraction; every call refits the
# same estimator instance once per fraction and returns one metrics table.
logreg_results = model_results(logreg)
knn_results = model_results(knn)
ad_c_results = model_results(ad_c)
rfc_results = model_results(rfc)
svc_results = model_results(svc)

In [None]:
models = ['Logistic Regression', 'KNN', 'ADA Boost', 'Random Forest', 'SVM']
model_results = [logreg_results, knn_results, ad_c_results, rfc_results, svc_results]

In [None]:
accuracies = []
precisions = []
recalls = []
for model in model_results :
    accuracies.append(model.Accuracy.values)
    precisions.append(model.Precision.values)
    recalls.append(model.Recall.values)

In [None]:
# Turn each list of per-model score arrays into a table with one row per
# training-set size (in percent) and one column per model.
accuracies = pd.DataFrame(np.column_stack(accuracies), index=training_sizes * 100, columns=models)
precisions = pd.DataFrame(np.column_stack(precisions), index=training_sizes * 100, columns=models)
recalls = pd.DataFrame(np.column_stack(recalls), index=training_sizes * 100, columns=models)

In [None]:
# Accuracy per model (columns) for each training-set percentage (rows).
accuracies

In [None]:
# Precision per model (columns) for each training-set percentage (rows).
precisions

In [None]:
# Recall per model (columns) for each training-set percentage (rows).
recalls