# In [8]:
import numpy as np
import pandas as pd

# Sigmoid function (numerically stable)
def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` element-wise.

    The naive formula overflows in ``exp`` for large negative ``z``. This
    version only exponentiates non-positive values, so it cannot overflow.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        A NumPy float scalar for scalar input, otherwise an array with the
        same shape as ``z`` holding values in ``[0, 1]``.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # always in (0, 1], never overflows
    # z >= 0: 1 / (1 + e^-z); z < 0: the algebraically equal e^z / (1 + e^z).
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result into a scalar and leaves arrays as-is.
    return out[()]

# Mini-batch gradient descent logistic regression
def train_logistic_regression(X, y, lr=0.01, epochs=1000, batch_size=1024,
                              random_state=None):
    """Fit a logistic-regression model with mini-batch gradient descent.

    Rows are reshuffled at the start of every epoch and then consumed in
    consecutive batches of ``batch_size`` rows (the last batch may be smaller).
    Each batch contributes one update using the mean gradient of the log-loss.

    Args:
        X: 2-D array-like of shape (m, n) with one row per sample.
        y: Array-like with m binary targets (0/1); flattened to 1-D so that a
            column vector cannot broadcast against the predictions.
        lr: Learning rate applied to every update.
        epochs: Number of passes over the data.
        batch_size: Number of rows per update; must be at least 1.
        random_state: Optional integer seed for a private shuffle generator.
            With ``None`` (the default) the global ``np.random`` state is used.

    Returns:
        Tuple ``(w, b)``: the weight vector of length n and the bias.

    Raises:
        ValueError: If ``X`` is not 2-D, ``y`` does not have one entry per
            row of ``X``, or ``batch_size`` is smaller than 1.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (m, n)")
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError("X and y must contain the same number of samples")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    w = np.zeros(n)  # Initialize weights
    b = 0.0          # Initialize bias

    # Track the row order by index instead of copying X and y every epoch.
    # Composing the permutations keeps the visiting order the same as
    # repeatedly reshuffling the already-shuffled data.
    order = np.arange(m)
    for _ in range(epochs):
        shuffle = np.arange(m)
        rng.shuffle(shuffle)
        order = order[shuffle]

        for start in range(0, m, batch_size):
            rows = order[start:start + batch_size]
            X_batch = X[rows]
            y_batch = y[rows]

            predictions = sigmoid(np.dot(X_batch, w) + b)
            residual = predictions - y_batch

            # Mean gradient of the log-loss over the batch.
            dw = (1 / len(X_batch)) * np.dot(X_batch.T, residual)
            db = (1 / len(X_batch)) * np.sum(residual)

            w -= lr * dw
            b -= lr * db

    return w, b

# Predict function
def predict(X, w, b, threshold=0.5):
    """Classify samples with a fitted logistic-regression model.

    Args:
        X: Feature matrix with one row per sample.
        w: Weight vector.
        b: Bias term.
        threshold: Probability at or above which a sample is labelled 1.

    Returns:
        Integer array of 0/1 labels, one per row of ``X``.
    """
    logits = np.dot(X, w) + b
    is_positive = sigmoid(logits) >= threshold
    return is_positive.astype(int)

# Accuracy, precision, recall metrics
def evaluate_model(y_true, y_pred):
    """Compute accuracy, precision and recall for binary 0/1 labels.

    Inputs are converted to flat NumPy arrays first. Without that, plain
    Python lists compare as a single ``False`` and every count is zero, and a
    column vector compared with a flat vector broadcasts to an (m, m) grid.

    Args:
        y_true: Array-like of ground-truth labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1), same length.

    Returns:
        Tuple ``(accuracy, precision, recall)``. Precision and recall are 0.0
        when their denominator is zero; all three are 0.0 for empty input.

    Raises:
        ValueError: If the two inputs do not contain the same number of labels.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.size != y_pred.size:
        raise ValueError("y_true and y_pred must have the same number of labels")

    total = y_true.size
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0, 0.0

    tp = np.sum((y_true == 1) & (y_pred == 1))  # True positives
    tn = np.sum((y_true == 0) & (y_pred == 0))  # True negatives
    fp = np.sum((y_true == 0) & (y_pred == 1))  # False positives
    fn = np.sum((y_true == 1) & (y_pred == 0))  # False negatives

    accuracy = (tp + tn) / total
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    return accuracy, precision, recall

# Manual train-test split
def train_test_split(X, y, test_size=0.2, random_state=None):
    """Randomly split samples into a training part and a test part.

    Args:
        X: Array-like of samples, indexed along the first axis.
        y: Array-like of targets with one entry per sample in ``X``.
        test_size: Fraction of samples (between 0 and 1) placed in the test
            part; the count is ``int(len(X) * test_size)``.
        random_state: Optional integer seed for a private shuffle generator.
            With ``None`` (the default) the global ``np.random`` state is used.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]`` or ``X`` and ``y``
            differ in length.
    """
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be a fraction between 0 and 1")

    X = np.asarray(X)
    y = np.asarray(y)
    m = len(X)
    if len(y) != m:
        raise ValueError("X and y must contain the same number of samples")

    n_test = int(m * test_size)
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = np.arange(m)
    rng.shuffle(indices)

    # The first n_test shuffled positions form the test part.
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Import data
file_path = "dataset/PhishingDataset.csv"
data = pd.read_csv(file_path)

# Define features and label column
feature_columns = [
    'LineOfCode', 'NoOfExternalRef', 'LargestLineLength', 'URLLength', 'NoOfImage', 'NoOfJS', 'NoOfSelfRef', 'NoOfCSS',
    'URLCharProb', 'CharContinuationRate', 'LetterRatioInURL', 'IsHTTPS', 'SpacialCharRatioInURL', 'NoOfEmptyRef',
    'NoOfOtherSpecialCharsInURL', 'HasDescription', 'HasSocialNet', 'DomainLength', 'DegitRatioInURL', 'NoOfDegitsInURL',
    'HasCopyrightInfo', 'NoOfLettersInURL', 'TLDLegitimateProb', 'DomainTitleMatchScore', 'IsResponsive',
    'HasHiddenFields', 'HasSubmitButton', 'NoOfSubDomain', 'HasFavicon', 'HasTitle'
]
label_column = 'label'

# Extract features and target
X = data[feature_columns].values
y = data[label_column].values

# Precompute mean and std for normalization
# NOTE(review): these statistics are computed over every row, including rows
# that later land in the test set. Consider fitting them on the training
# split only - confirm that nothing else relies on the current order first.
mean_X = np.mean(X, axis=0)
std_X = np.std(X, axis=0)

# A constant column has zero std; dividing by it would give 0/0 = NaN and the
# NaNs would propagate into the weights. Use 1 so such columns become all 0.
std_X = np.where(std_X == 0, 1.0, std_X)

# Normalize features using precomputed mean and std
X = (X - mean_X) / std_X

# Split into training and test sets (1% of the rows are held out)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01)

# Train the model
weights, bias = train_logistic_regression(X_train, y_train, lr=0.1, epochs=1000, batch_size=1024)

# Predict on the test set
y_pred = predict(X_test, weights, bias)

# Evaluate the model
accuracy, precision, recall = evaluate_model(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Sensitivity (Recall): {recall:.4f}")


# Output:
# Accuracy: 0.9992
# Precision: 1.0000
# Sensitivity (Recall): 0.9985
