In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
file_path = "C:/Users/Christiana/Downloads/phiusiil+phishing+url+dataset/PhiUSIIL_Phishing_URL_Dataset.csv"
df = pd.read_csv(file_path)


# Sigmoid function
def sigmoid(z):
    z = np.clip(z, -500, 500)  # Avoid overflow
    return 1 / (1 + np.exp(-z))



def train_logistic_regression(X, y, lr=0.01, epochs=10, batch_size=1024, lambda_=0.1):
    m, n = X.shape
    w = np.zeros(n)  # Initialize weights
    b = 0            # Initialize bias

    for epoch in range(epochs):
        indices = np.arange(m)
        np.random.shuffle(indices)
        X = X[indices]
        y = y[indices]

        for i in range(0, m, batch_size):
            X_batch = X[i:i + batch_size]
            y_batch = y[i:i + batch_size]

            z = np.dot(X_batch, w) + b
            predictions = sigmoid(z)

            # Gradients with L2 regularization
            dw = (1 / len(X_batch)) * np.dot(X_batch.T, (predictions - y_batch)) + (lambda_ / m) * w
            db = (1 / len(X_batch)) * np.sum(predictions - y_batch)

            # Update weights and bias
            w -= lr * dw
            b -= lr * db

    return w, b

def predict(X, w, b, threshold=0.5):
    m, n = X.shape   
    p = np.zeros(m)
   
    # YOUR CODE HERE
     ### BEGIN SOLUTION
    
    F_wb = sigmoid(np.dot(X,w) + b)
    p = (F_wb>=0.5)
        
    ### END SOLUTION
    
    return p
# Select features and label
LABEL = df.iloc[:, -1:].columns[0]
cols = [
    'LineOfCode', 'NoOfExternalRef', 'LargestLineLength', 'URLLength', 'NoOfImage', 'NoOfJS', 'NoOfSelfRef', 'NoOfCSS',
    'URLCharProb', 'CharContinuationRate', 'LetterRatioInURL', 'IsHTTPS', 'SpacialCharRatioInURL', 'NoOfEmptyRef',
    'NoOfOtherSpecialCharsInURL', 'HasDescription', 'HasSocialNet', 'DomainLength', 'DegitRatioInURL', 'NoOfDegitsInURL',
    'HasCopyrightInfo', 'NoOfLettersInURL', 'TLDLegitimateProb', 'DomainTitleMatchScore', 'IsResponsive',
    'HasHiddenFields', 'HasSubmitButton', 'NoOfSubDomain', 'HasFavicon', 'HasTitle'
]

X = df[cols].values  # Features
y = df[LABEL].values  # Target label

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42)

# Train logistic regression

weights, bias = train_logistic_regression(X_train, y_train, lr=0.001, epochs=10, batch_size=1024, lambda_=0.1)

# Make predictions
y_pred = predict(X_test, weights, bias)


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6717
Precision: 0.6354
Recall: 1.0000
F1 Score: 0.7771
