# Logistic Regression

### Importing libraries and dataset

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import random

# Load the labelled URL dataset, then pull out the raw URL strings (features)
# and their class labels (targets) as separate Series.
df = pd.read_csv('phishing_site_urls.csv')
urls, labels = df['URL'], df['Label']


### Preprocessing the data

In [9]:

# Map labels to binary values: 'good' becomes class 1, 'bad' becomes class 0
label_mapping = {'good': 1, 'bad': 0}
y_mapped = labels.map(label_mapping)

# Series.map yields NaN for any label that is not a key of the mapping.
# Fail here with a clear message rather than letting NaN targets reach
# the model and surface as an unrelated error at fit time.
unmapped = y_mapped.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected label values {labels[unmapped].unique().tolist()}; "
        f"expected one of {list(label_mapping)}"
    )
y = y_mapped.values

# Tokenize and vectorize URLs: each URL is cut into runs of word characters
# (dropping separators such as '/', '.', '-'), and the runs are re-joined
# with single spaces so the vectorizer receives one plain-text document per URL.
tokenizer = RegexpTokenizer(r'\w+')
url_tokens = [' '.join(tokenizer.tokenize(url)) for url in urls]

# Build the token-count feature matrix, one row per URL
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(url_tokens)

### Splitting, training, & testing

In [10]:

# Experiment configuration. A fixed master seed makes every split, and hence
# every metric below, reproducible under Restart Kernel -> Run All.
SEED = 42
N_RUNS = 15

# Draw one distinct seed per run from a private, seeded generator: no two runs
# can repeat the same split, and the global `random` state is left untouched.
# Seeds stay in the 0..1000 range.
run_seeds = random.Random(SEED).sample(range(1001), N_RUNS)

# Lists to store metrics (one entry per run, in run order)
accuracies = []
roc_auc_scores = []
f1_scores = []

# Train and evaluate N_RUNS logistic regression models, each on its own split
for i, run_seed in enumerate(run_seeds):
    # Split data into training (80%) and testing (20%) sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=run_seed
    )

    # Initialize and train logistic regression model.
    # scikit-learn documents random_state as used only by the 'sag', 'saga'
    # and 'liblinear' solvers; it is pinned anyway so that switching solver
    # later keeps the experiment reproducible.
    logreg = LogisticRegression(max_iter=1000, random_state=run_seed)
    logreg.fit(X_train, y_train)

    # Predict on the test set: hard labels for accuracy/F1, and the
    # probability of the label-1 class (column 1) for ROC AUC
    y_pred = logreg.predict(X_test)
    y_pred_proba = logreg.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)

    # Append metrics to lists
    accuracies.append(accuracy)
    roc_auc_scores.append(roc_auc)
    f1_scores.append(f1)

    print(f"Iteration {i+1}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc:.4f}, F1 = {f1:.4f}")


Iteration 1: Accuracy = 0.9656, ROC AUC = 0.9939, F1 = 0.9763
Iteration 2: Accuracy = 0.9668, ROC AUC = 0.9941, F1 = 0.9771
Iteration 3: Accuracy = 0.9662, ROC AUC = 0.9942, F1 = 0.9767
Iteration 4: Accuracy = 0.9670, ROC AUC = 0.9943, F1 = 0.9773
Iteration 5: Accuracy = 0.9655, ROC AUC = 0.9939, F1 = 0.9763
Iteration 6: Accuracy = 0.9656, ROC AUC = 0.9943, F1 = 0.9762
Iteration 7: Accuracy = 0.9660, ROC AUC = 0.9939, F1 = 0.9765
Iteration 8: Accuracy = 0.9663, ROC AUC = 0.9940, F1 = 0.9768
Iteration 9: Accuracy = 0.9660, ROC AUC = 0.9941, F1 = 0.9766
Iteration 10: Accuracy = 0.9658, ROC AUC = 0.9939, F1 = 0.9764
Iteration 11: Accuracy = 0.9656, ROC AUC = 0.9940, F1 = 0.9763
Iteration 12: Accuracy = 0.9657, ROC AUC = 0.9940, F1 = 0.9764
Iteration 13: Accuracy = 0.9662, ROC AUC = 0.9942, F1 = 0.9767
Iteration 14: Accuracy = 0.9657, ROC AUC = 0.9934, F1 = 0.9764
Iteration 15: Accuracy = 0.9653, ROC AUC = 0.9939, F1 = 0.9760


### Saving Metrics

In [11]:

# Persist each metric series to its own single-column CSV (no index column).
# Each entry is (output file, column header, per-run values).
metric_outputs = [
    ('LogReg_accuracies.csv', 'accuracy', accuracies),
    ('LogReg_roc_auc.csv', 'roc_auc', roc_auc_scores),
    ('LogReg_f1_scores.csv', 'f1_score', f1_scores),
]
for csv_path, column_name, values in metric_outputs:
    pd.DataFrame({column_name: values}).to_csv(csv_path, index=False)

print("\nMetrics saved to 'LogReg_accuracies.csv', 'LogReg_roc_auc.csv', and 'LogReg_f1_scores.csv'")



Metrics saved to 'LogReg_accuracies.csv', 'LogReg_roc_auc.csv', and 'LogReg_f1_scores.csv'
