In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Step 1: Load and Preprocess Dataset

### Load Dataset

In [41]:
df = pd.read_csv('/content/drive/MyDrive/PhiUSIIL_Phishing_URL_Dataset.csv').sample(10000, random_state=42)

### Select features and target

In [42]:
# Select features and target
target = 'label'
features = df.select_dtypes(include=[np.number])

### Split into train and test sets

In [43]:
X_train, X_test, y_train, y_test = train_test_split(features, df[target], test_size=0.2, random_state=42, stratify=df[target])

### Standardize features

In [44]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Convert to PyTorch tensors

In [45]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Step 2: Create a PyTorch Dataset

In [46]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

# Create DataLoader
train_dataset = CustomDataset(X_train_tensor, y_train_tensor)
test_dataset = CustomDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 3: Define the Bidirectional RNN Model

In [47]:
class BiRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes, pooling):
        super(BiRNN, self).__init__()
        self.rnn = nn.RNN(
            input_size, hidden_size, num_layers, batch_first=True, bidirectional=True
        )
        self.pooling = pooling
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add sequence dimension
        out, _ = self.rnn(x)
        if self.pooling == "max":
            out, _ = torch.max(out, dim=1)
        elif self.pooling == "avg":
            out = torch.mean(out, dim=1)
        out = self.fc(out)
        return out

# Step 4: Training and Evaluation Function

In [48]:
def train_and_evaluate(hidden_size, pooling, optimizer_name, epochs, results):
    input_size = X_train_tensor.shape[1]
    num_classes = len(y_train.unique())

    model = BiRNN(input_size, hidden_size, num_layers=1, num_classes=num_classes, pooling=pooling)
    criterion = nn.CrossEntropyLoss()

    if optimizer_name == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=0.01)
    elif optimizer_name == "RMSProp":
        optimizer = optim.RMSprop(model.parameters(), lr=0.01)
    elif optimizer_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=0.01)

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    # Early stopping parameters
    best_loss = float('inf')
    patience = 5
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        scheduler.step()

        # Evaluate on test set
        model.eval()
        test_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                test_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        test_loss /= len(test_loader)
        accuracy = 100 * correct / total

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {running_loss:.4f}, Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.2f}%")

        # Early stopping
        if test_loss < best_loss:
            best_loss = test_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    # Append results
    results.append({
        "hidden_size": hidden_size,
        "pooling": pooling,
        "optimizer": optimizer_name,
        "epochs": epochs,
        "test_loss": test_loss,
        "accuracy": accuracy
    })

# Collect results
results = []

# Step 5: Experimentation by Parameter

### Experiment 1: Hidden Size

In [49]:
# Experiment 1: Varying Hidden Size
hidden_sizes = [32, 64, 128]
for hidden_size in hidden_sizes:
    print(f"\nExperiment with Hidden Size: {hidden_size}")
    train_and_evaluate(hidden_size=hidden_size, pooling="max", optimizer_name="Adam", epochs=50, results=results)


Experiment with Hidden Size: 32
Epoch 1/50, Loss: 2.2669, Test Loss: 0.0001, Accuracy: 100.00%
Epoch 2/50, Loss: 0.0118, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 3/50, Loss: 0.0054, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 4/50, Loss: 0.0032, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 5/50, Loss: 0.0022, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 6/50, Loss: 0.0015, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 7/50, Loss: 0.0012, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 8/50, Loss: 0.0009, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 9/50, Loss: 0.0007, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 10/50, Loss: 0.0006, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 11/50, Loss: 0.0005, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 12/50, Loss: 0.0005, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 13/50, Loss: 0.0005, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 14/50, Loss: 0.0005, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 15/50, Loss: 0.0005, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 16/50, Los

### Experiment 2: Pooling Methods

In [50]:
pooling_methods = ["max", "avg"]
for pooling in pooling_methods:
    print(f"\nExperiment with Pooling Method: {pooling}")
    train_and_evaluate(hidden_size=64, pooling=pooling, optimizer_name="Adam", epochs=50, results=results)


Experiment with Pooling Method: max
Epoch 1/50, Loss: 1.2667, Test Loss: 0.0019, Accuracy: 99.95%
Epoch 2/50, Loss: 0.0021, Test Loss: 0.0016, Accuracy: 99.95%
Epoch 3/50, Loss: 0.0010, Test Loss: 0.0015, Accuracy: 99.95%
Epoch 4/50, Loss: 0.0007, Test Loss: 0.0013, Accuracy: 99.95%
Epoch 5/50, Loss: 0.0005, Test Loss: 0.0012, Accuracy: 99.95%
Epoch 6/50, Loss: 0.0004, Test Loss: 0.0012, Accuracy: 99.95%
Epoch 7/50, Loss: 0.0003, Test Loss: 0.0011, Accuracy: 99.95%
Epoch 8/50, Loss: 0.0003, Test Loss: 0.0010, Accuracy: 99.95%
Epoch 9/50, Loss: 0.0002, Test Loss: 0.0009, Accuracy: 99.95%
Epoch 10/50, Loss: 0.0002, Test Loss: 0.0009, Accuracy: 99.95%
Epoch 11/50, Loss: 0.0002, Test Loss: 0.0009, Accuracy: 99.95%
Epoch 12/50, Loss: 0.0002, Test Loss: 0.0009, Accuracy: 99.95%
Epoch 13/50, Loss: 0.0002, Test Loss: 0.0009, Accuracy: 99.95%
Epoch 14/50, Loss: 0.0002, Test Loss: 0.0009, Accuracy: 99.95%
Epoch 15/50, Loss: 0.0001, Test Loss: 0.0009, Accuracy: 99.95%
Epoch 16/50, Loss: 0.0001, 

### Experiment 3: Number of Epochs

In [51]:
epochs_list = [5, 50, 100, 250, 350]
for epochs in epochs_list:
    print(f"\nExperiment with Epochs: {epochs}")
    train_and_evaluate(hidden_size=64, pooling="max", optimizer_name="Adam", epochs=epochs, results=results)


Experiment with Epochs: 5
Epoch 1/5, Loss: 1.9791, Test Loss: 0.0020, Accuracy: 99.95%
Epoch 2/5, Loss: 0.0235, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 3/5, Loss: 0.0006, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 4/5, Loss: 0.0004, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 5/5, Loss: 0.0004, Test Loss: 0.0000, Accuracy: 100.00%

Experiment with Epochs: 50
Epoch 1/50, Loss: 1.7314, Test Loss: 0.0001, Accuracy: 100.00%
Epoch 2/50, Loss: 0.0023, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 3/50, Loss: 0.0010, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 4/50, Loss: 0.0007, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 5/50, Loss: 0.0005, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 6/50, Loss: 0.0004, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 7/50, Loss: 0.0003, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 8/50, Loss: 0.0003, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 9/50, Loss: 0.0002, Test Loss: 0.0000, Accuracy: 100.00%
Epoch 10/50, Loss: 0.0002, Test Loss: 0.0000, Accuracy: 100.00%
Epoch

### Experiment 4: Optimizers

In [52]:
optimizers = ["SGD", "RMSProp", "Adam"]
for optimizer_name in optimizers:
    print(f"\nExperiment with Optimizer: {optimizer_name}")
    train_and_evaluate(hidden_size=64, pooling="max", optimizer_name=optimizer_name, epochs=50, results=results)


Experiment with Optimizer: SGD
Epoch 1/50, Loss: 34.7239, Test Loss: 0.0412, Accuracy: 99.95%
Epoch 2/50, Loss: 7.5346, Test Loss: 0.0193, Accuracy: 100.00%
Epoch 3/50, Loss: 4.2596, Test Loss: 0.0122, Accuracy: 100.00%
Epoch 4/50, Loss: 2.9281, Test Loss: 0.0087, Accuracy: 100.00%
Epoch 5/50, Loss: 2.2087, Test Loss: 0.0067, Accuracy: 100.00%
Epoch 6/50, Loss: 1.7612, Test Loss: 0.0054, Accuracy: 100.00%
Epoch 7/50, Loss: 1.4569, Test Loss: 0.0045, Accuracy: 100.00%
Epoch 8/50, Loss: 1.2375, Test Loss: 0.0039, Accuracy: 100.00%
Epoch 9/50, Loss: 1.0723, Test Loss: 0.0034, Accuracy: 100.00%
Epoch 10/50, Loss: 0.9436, Test Loss: 0.0030, Accuracy: 100.00%
Epoch 11/50, Loss: 0.8821, Test Loss: 0.0029, Accuracy: 100.00%
Epoch 12/50, Loss: 0.8721, Test Loss: 0.0029, Accuracy: 100.00%
Epoch 13/50, Loss: 0.8623, Test Loss: 0.0029, Accuracy: 100.00%
Epoch 14/50, Loss: 0.8528, Test Loss: 0.0028, Accuracy: 100.00%
Epoch 15/50, Loss: 0.8434, Test Loss: 0.0028, Accuracy: 100.00%
Epoch 16/50, Loss

# Step 6: Display Results

In [53]:
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df)


Final Results:
    hidden_size pooling optimizer  epochs     test_loss  accuracy
0            32     max      Adam      50  2.272466e-06    100.00
1            64     max      Adam      50  2.358092e-06    100.00
2           128     max      Adam      50  1.396050e-07    100.00
3            64     max      Adam      50  7.528792e-04     99.95
4            64     avg      Adam      50  2.597874e-06    100.00
5            64     max      Adam       5  3.062082e-06    100.00
6            64     max      Adam      50  1.386664e-06    100.00
7            64     max      Adam     100  1.020905e-05    100.00
8            64     max      Adam     250  9.020944e-07    100.00
9            64     max      Adam     350  1.181789e-06    100.00
10           64     max       SGD      50  2.633669e-03    100.00
11           64     max   RMSProp      50  4.020933e-09    100.00
12           64     max      Adam      50  1.273837e-06    100.00
