## Naive Bayes

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import random

In [2]:
# Load the phishing-URL dataset; the 'URL' column holds the raw URLs and the
# 'Label' column holds their string class labels.
df = pd.read_csv('phishing_site_urls.csv')
urls = df['URL']
labels = df['Label']

# Encode the string labels as integers: 'good' -> 1, 'bad' -> 0.
label_mapping = {'good': 1, 'bad': 0}
encoded_labels = labels.map(label_mapping)

# Series.map leaves NaN for every value that is not a key of the mapping.
# Fail here, naming the offending values, instead of handing NaN targets to
# the classifier further down.
unmapped_mask = encoded_labels.isna()
if unmapped_mask.any():
    unknown_labels = labels[unmapped_mask].unique().tolist()
    raise ValueError(
        f"Unexpected values in 'Label' column (expected one of "
        f"{sorted(label_mapping)}): {unknown_labels}"
    )

y = encoded_labels.values

In [3]:
# Preprocess: break each URL into runs of word characters (the r'\w+'
# pattern) and re-join the pieces with single spaces, giving one
# whitespace-separated string per URL.
tokenizer = RegexpTokenizer(r'\w+')
url_tokens = [' '.join(pieces) for pieces in map(tokenizer.tokenize, urls)]

# Turn the token strings into a token-count matrix using CountVectorizer's
# default settings.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(url_tokens)



In [6]:
# Train and evaluate a Multinomial Naive Bayes classifier on repeated random
# train/test splits, printing each accuracy and saving all of them to CSV.
#
# NOTE(review): X was vectorized on the full dataset before splitting, so the
# vocabulary also contains tokens that occur only in test rows.
N_SPLITS = 15
TEST_SIZE = 0.2

# Draw the split seeds from a fixed-seed generator so re-running this cell
# evaluates exactly the same splits; sample() also guarantees that no seed
# (and therefore no split) is repeated.
seed_rng = random.Random(0)
split_seeds = seed_rng.sample(range(1001), N_SPLITS)

accuracies = []
for split_seed in split_seeds:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=split_seed
    )

    clf = MultinomialNB()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    print(f"NB Accuracy: {accuracy:.2f}")

# One row per split, in evaluation order, under the column 'nb'.
# NOTE: this rebinds `df`, which until now held the raw dataset.
df = pd.DataFrame({'nb': accuracies})
df.to_csv('NB_results.csv', index=False)


NB Accuracy: 0.97
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.97
NB Accuracy: 0.96
NB Accuracy: 0.96
NB Accuracy: 0.96
