# Logistic Regression

### Importing libraries and dataset

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import random

# Read dataset
df = pd.read_csv('phishing_site_urls.csv')
urls = df['URL']
labels = df['Label']


### Preprocessing the data

In [21]:
# Map labels to binary values
label_mapping = {'good': 0, 'bad': 1}
y = labels.map(label_mapping).values

# Feature extraction
def extract_features(url):
    features = {}
    # Basic features
    features['https'] = 1 if 'https' in url else 0
    features['url_length'] = len(url)
    features['special_char_length'] = sum(1 for char in url if char in '!@#$%^&*()-=')
    # Special character counts
    features['num_$'] = url.count('$')
    features['num_!'] = url.count('!')
    features['num_@'] = url.count('@')
    features['num_#'] = url.count('#')
    features['num_%'] = url.count('%')
    features['num_^'] = url.count('^')
    features['num_&'] = url.count('&')
    features['num_*'] = url.count('*')
    features['num_('] = url.count('(')
    features['num_)'] = url.count(')')
    features['num_-'] = url.count('-')
    features['num_='] = url.count('=')
    features['num_~'] = url.count('~')
    features['num_.'] = url.count('.')
    features['num_/'] = url.count('/')
    # Domain type
    features['is_com'] = 1 if '.com' in url else 0
    features['is_org'] = 1 if '.org' in url else 0
    features['is_net'] = 1 if '.net' in url else 0
    features['is_me'] = 1 if '.me' in url else 0
    features['is_gov'] = 1 if '.gov' in url else 0
    features['is_other_domain'] = 1 if all(domain not in url for domain in ['.com', '.org', '.net', '.me', '.gov']) else 0
    return features

# Apply feature extraction to all URLs
X = pd.DataFrame([extract_features(url) for url in urls])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) 


### Splitting, training, & testing

In [22]:

# Initialize a dictionary to store results
results = {
    'logreg_accuracy': [],
    'logreg_roc_auc': [],
    'logreg_f1_score': []
}

# Train and evaluate 15 logistic regression models
for i in range(15):
    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=random.randint(0, 1000)
    )
    
    # Train logistic regression model
    logreg = LogisticRegression(max_iter=1000, solver='saga', random_state=random.randint(0, 1000))
    logreg.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = logreg.predict(X_test)
    y_pred_proba = logreg.predict_proba(X_test)[:, 1]  # Probability of the positive class
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)
    
    # Append metrics to results
    results['logreg_accuracy'].append(accuracy)
    results['logreg_roc_auc'].append(roc_auc)
    results['logreg_f1_score'].append(f1)
    
    print(f"Iteration {i+1}: Accuracy = {accuracy:.4f}, ROC AUC = {roc_auc:.4f}, F1 = {f1:.4f}")



Iteration 1: Accuracy = 0.7804, ROC AUC = 0.7874, F1 = 0.5268
Iteration 2: Accuracy = 0.7813, ROC AUC = 0.7862, F1 = 0.5260
Iteration 3: Accuracy = 0.7797, ROC AUC = 0.7893, F1 = 0.5244


### Saving metrics

In [None]:
# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save results to a CSV file
results_df.to_csv('../results.csv', index=False)

print("\nMetrics saved to 'results.csv'")
