In [20]:
import numpy as np
import pandas as pd

# Sigmoid function
def sigmoid(z):
    """
    Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Evaluated in a numerically stable form: the exponential is only ever
    taken of a non-positive number, so large-magnitude inputs cannot overflow.

    :param z: Scalar or array-like of real values
    :return: Value(s) in [0, 1] with the same shape as z (a NumPy scalar for scalar input)
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # always in [0, 1], never overflows
    # For z >= 0 use 1 / (1 + e^-z); for z < 0 use the equivalent e^z / (1 + e^z).
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    return result[()]  # unwrap a 0-d result back to a scalar; arrays pass through

# Mini-batch gradient descent logistic regression
def train_logistic_regression(X, y, lr=0.01, epochs=1000, batch_size=1024):
    """
    Train logistic regression using mini-batch gradient descent.

    Every epoch reshuffles the samples (using the global NumPy random state)
    and then takes one gradient step per consecutive slice of ``batch_size``
    rows; the last slice of an epoch may be smaller.

    :param X: Feature matrix of shape (m, n); converted to a float array
    :param y: Target labels, one per row of X; flattened to shape (m,)
    :param lr: Learning rate
    :param epochs: Number of passes over the data
    :param batch_size: Maximum number of samples per gradient step (must be >= 1)
    :return: Weights (w) of shape (n,) and bias (b)
    :raises ValueError: If X is not 2-D, y does not hold one label per row,
        or batch_size is smaller than 1
    """
    # Coerce inputs so pandas objects and column-vector labels behave like
    # plain arrays; a (m, 1) label array would otherwise broadcast the
    # residual into a (batch, batch) matrix.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()

    if X.ndim != 2:
        raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(f"X has {m} rows but y has {y.shape[0]} labels")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    w = np.zeros(n)  # Initialize weights
    b = 0.0          # Initialize bias

    for epoch in range(epochs):
        # Shuffle data for each epoch (fancy indexing copies, so the caller's
        # arrays are never reordered)
        indices = np.arange(m)
        np.random.shuffle(indices)
        X = X[indices]
        y = y[indices]

        for i in range(0, m, batch_size):
            # Extract batch
            X_batch = X[i:i + batch_size]
            y_batch = y[i:i + batch_size]

            # Forward pass
            z = np.dot(X_batch, w) + b
            predictions = sigmoid(z)

            # Compute gradients, averaged over the samples in this batch
            residual = predictions - y_batch
            scale = 1 / len(X_batch)
            dw = scale * np.dot(X_batch.T, residual)
            db = scale * np.sum(residual)

            # Update weights and bias
            w -= lr * dw
            b -= lr * db

    return w, b

# Predict function
def predict(X, w, b, threshold=0.5):
    """
    Classify samples with a trained logistic regression model.

    :param X: Feature matrix
    :param w: Weights
    :param b: Bias
    :param threshold: Probability at or above which a sample is labelled 1
    :return: Integer array of 0/1 predictions
    """
    scores = sigmoid(b + np.dot(X, w))
    is_positive = np.greater_equal(scores, threshold)
    return is_positive.astype(int)

# Import data
data = pd.read_csv("dataset/PhishingDataset.csv")

# Define features and label column
feature_columns = [
    'LineOfCode', 'NoOfExternalRef', 'LargestLineLength', 'URLLength', 'NoOfImage', 'NoOfJS', 'NoOfSelfRef', 'NoOfCSS',
    'URLCharProb', 'CharContinuationRate', 'LetterRatioInURL', 'IsHTTPS', 'SpacialCharRatioInURL', 'NoOfEmptyRef',
    'NoOfOtherSpecialCharsInURL', 'HasDescription', 'HasSocialNet', 'DomainLength', 'DegitRatioInURL', 'NoOfDegitsInURL',
    'HasCopyrightInfo', 'NoOfLettersInURL', 'TLDLegitimateProb', 'DomainTitleMatchScore', 'IsResponsive',
    'HasHiddenFields', 'HasSubmitButton', 'NoOfSubDomain', 'HasFavicon', 'HasTitle'
]
label_column = 'label'

# Extract features and target
X = data[feature_columns].values
y = data[label_column].values

# Precompute mean and std for normalization
mean_X = np.mean(X, axis=0)
std_X = np.std(X, axis=0)
# A constant column has zero std; dividing by it would turn that column into
# NaN and, through the gradient, every weight as well. Leave such columns
# centred but unscaled instead.
std_X = np.where(std_X == 0, 1.0, std_X)

# Normalize features using precomputed mean and std
X = (X - mean_X) / std_X

# Train the model
weights, bias = train_logistic_regression(X, y, lr=0.1, epochs=1000, batch_size=1024)

# Test the model with a sample (values are listed in feature_columns order)
test_sample = np.array([[200, 5, 150, 120, 10, 2, 3, 4, 0.5, 0.6, 0.8, 1, 0.3, 2, 1, 0, 1, 25, 0.7, 15, 0, 20, 0.9, 0.8, 1, 0, 1, 3, 1, 1]])
test_sample_normalized = (test_sample - mean_X) / std_X  # Normalize using training stats
test_prediction = predict(test_sample_normalized, weights, bias)
# NOTE(review): this assumes label 1 means "scam" in the CSV. Some public
# phishing datasets with these column names use 1 for legitimate pages --
# confirm against the dataset's documentation.
print(f"Prediction for test sample: {'Scam' if test_prediction[0] == 1 else 'Not scam'}")


Prediction for test sample: Not scam
