In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

In [2]:
# Read the normalized log extract and discard every row that has a missing value.
data_path = "../log-extractor/extracted_data_normalized.csv"
data = pd.read_csv(data_path).dropna()

# Convenience references to the individual columns of the cleaned frame.
time = data['Time']
ip = data['IP']
url = data['URL']
normalized_url = data['Normalized_URL']

In [3]:
# Label encode URLs.
# Each column gets its own encoder: fitting a LabelEncoder a second time
# replaces its classes_, so a shared encoder could no longer decode the
# codes stored in 'URL_encoded'.
raw_url_encoder = LabelEncoder()
data['URL_encoded'] = raw_url_encoder.fit_transform(data['URL'])

# `url_encoder` stays the encoder for 'Normalized_URL'; later cells read
# url_encoder.classes_ for the class count and call inverse_transform on it
# to decode predictions.
url_encoder = LabelEncoder()
data['Normalized_URL_encoded'] = url_encoder.fit_transform(data['Normalized_URL'])

In [4]:
# Build Multi-Order Markov transition probabilities.
# multi_order_transitions maps a tuple of `order` consecutive encoded URLs to
# a counter of the encoded URL that followed it.
multi_order_transitions = defaultdict(lambda: defaultdict(int))
order = 4

# Pull the encoded column out of pandas once; plain-list slicing inside the
# loop avoids a pandas .iloc lookup on every iteration.
url_codes = data['Normalized_URL_encoded'].tolist()

# NOTE(review): windows are taken over all rows in file order, without grouping
# by IP, and include the windows later held out by train_test_split - confirm
# that both are intended, since the second makes evaluation optimistic.
for i in range(len(url_codes) - order):
    current_sequence = tuple(url_codes[i:i + order])
    next_url = url_codes[i + order]
    multi_order_transitions[current_sequence][next_url] += 1

# Normalize counts to probabilities; the per-context total is computed once
# rather than once per successor entry.
multi_order_probs = {}
for current, next_dict in multi_order_transitions.items():
    transition_total = sum(next_dict.values())
    multi_order_probs[current] = {
        next_url: count / transition_total
        for next_url, count in next_dict.items()
    }

In [5]:
# Prepare sequential data for Neural Network.
# Row i of `sequences` holds encoded URLs i .. i + sequence_length - 1 and
# next_urls[i] is the encoded URL that immediately follows that window.
sequence_length = order

# Extract the column once and build every window with a single NumPy indexing
# step instead of one pandas .iloc slice per window.
encoded_column = data['Normalized_URL_encoded'].to_numpy()
num_windows = max(len(encoded_column) - sequence_length, 0)

# window_index[i, j] == i + j, so indexing with it yields all sliding windows.
# With no complete window the result is an empty (0, sequence_length) array.
window_index = np.arange(num_windows)[:, None] + np.arange(sequence_length)[None, :]
sequences = encoded_column[window_index]
next_urls = encoded_column[sequence_length:sequence_length + num_windows]

In [6]:
# Hold out 20% of the (window, next URL) pairs for testing; the seed is fixed
# so the split is reproducible.
# NOTE(review): the windows overlap (stride 1) and the split shuffles them, so
# neighbouring train and test windows share most of their elements - confirm
# this is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    sequences,
    next_urls,
    test_size=0.2,
    random_state=42,
)

# Define batch processing generator with on-the-fly one-hot encoding
def batch_generator(X, y, batch_size, num_classes):
    """Endlessly yield ``(X_batch, y_batch)`` pairs, cycling over the data in order.

    Labels are one-hot encoded per batch, so the full one-hot label matrix is
    never materialised. The last batch of each pass may be shorter than
    ``batch_size``.

    Args:
        X: Sliceable collection of input sequences.
        y: Sliceable collection of integer class labels aligned with ``X``.
        batch_size: Maximum number of samples per yielded batch (>= 1).
        num_classes: Width of the one-hot encoding.

    Yields:
        Tuples ``(X_batch, y_batch)`` where ``y_batch`` is the one-hot
        encoding of the matching slice of ``y``.

    Raises:
        ValueError: On first iteration, if ``X`` is empty or ``batch_size``
            is smaller than 1.
    """
    num_samples = len(X)
    # Without these guards the inner loop would have nothing to iterate over
    # and the outer `while True` would spin forever without yielding.
    if num_samples == 0:
        raise ValueError("batch_generator needs at least one sample")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    while True:
        for offset in range(0, num_samples, batch_size):
            # Slicing clamps at the end of the data, so the final batch is
            # simply shorter.
            batch_slice = slice(offset, offset + batch_size)
            X_batch = X[batch_slice]
            y_batch = tf.keras.utils.to_categorical(y[batch_slice], num_classes=num_classes)
            yield X_batch, y_batch

# Get number of classes.
# classes_ reflects the most recent fit of url_encoder, which is on the
# 'Normalized_URL' column - the same column the sequences and labels are
# built from. The value sizes the one-hot labels, the embedding input and the
# softmax output.
num_classes = len(url_encoder.classes_)

In [7]:
# Train model with batch processing
batch_size = 32
train_gen = batch_generator(X_train, y_train, batch_size, num_classes)
test_gen = batch_generator(X_test, y_test, batch_size, num_classes)

# Ceiling division: batch_generator also yields the final, shorter batch of
# each pass, so one epoch must take ceil(n / batch_size) steps to line up with
# one pass over the data. Floor division would drop the remainder, let epochs
# drift relative to passes, and give 0 validation steps whenever the test set
# is smaller than one batch.
steps_per_epoch = (len(X_train) + batch_size - 1) // batch_size
validation_steps = (len(X_test) + batch_size - 1) // batch_size

In [None]:
# Assemble the next-URL classifier layer by layer:
# embedding -> LSTM -> dense hidden layer -> softmax over all URL classes.
# NOTE(review): `input_length` is reported as deprecated by newer Keras
# releases - confirm against the installed TensorFlow version.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=num_classes, output_dim=50, input_length=sequence_length))
model.add(tf.keras.layers.LSTM(64, return_sequences=False))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

# Categorical cross-entropy matches the one-hot targets produced by
# batch_generator.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Both generators are endless, so the step counts define where an epoch and a
# validation pass end.
model.fit(
    train_gen,
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    validation_data=test_gen,
    validation_steps=validation_steps,
)

In [None]:
# Predict function combining Multi-Order Markov and Neural Network
def predict_next_url(sequence):
    """Return the decoded URL label with the highest combined score.

    The score of each class is the neural network's predicted probability
    plus, when the sequence has an entry in ``multi_order_probs``, the Markov
    transition probability of that class.

    Args:
        sequence: Encoded URL ids forming the context window; it is used as a
            tuple key into ``multi_order_probs`` and as a one-row batch for
            ``model.predict``.

    Returns:
        The label obtained by passing the best-scoring class index through
        ``url_encoder.inverse_transform``.
    """
    # Markov lookup first, then the network, so output order is unchanged.
    context_key = tuple(sequence)
    markov_predictions = multi_order_probs.get(context_key, {})
    network_probs = model.predict(np.array([sequence]))[0]

    # Sequences without a Markov entry are scored by the network alone.
    if len(markov_predictions) == 0:
        print("Warning: Sequence not found in training data. Using only Neural Network predictions.")

    # Start from the network's distribution and add the Markov mass on top.
    scores = np.zeros(num_classes)
    scores += network_probs
    for class_index, probability in markov_predictions.items():
        scores[class_index] += probability

    best_index = np.argmax(scores)
    decoded = url_encoder.inverse_transform([best_index])
    return decoded[0]

In [None]:
# Evaluation metrics
def evaluate_model(eval_batch_size=1024):
    """Print top-1, top-3 and top-5 accuracy of the combined predictor on the test split.

    Every sequence in ``X_test`` is scored as the neural network's class
    probabilities plus the Markov transition probabilities stored in
    ``multi_order_probs`` for that sequence, if any. The network is run on
    chunks of ``eval_batch_size`` sequences rather than once per sequence, and
    sequences without a Markov entry are reported as one summary line.

    NOTE(review): ``multi_order_probs`` is built from all rows of ``data``,
    which appears to include the windows held out in ``X_test``; if so, the
    Markov component has already seen the test transitions - confirm.

    Args:
        eval_batch_size: Number of test sequences passed to ``model.predict``
            per call; bounds the size of the probability matrix held in
            memory at once.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: If ``eval_batch_size`` is smaller than 1.
    """
    if eval_batch_size < 1:
        raise ValueError("eval_batch_size must be a positive integer")

    total = len(X_test)
    if total == 0:
        # Averaging over zero samples would divide by zero.
        print("Warning: Test set is empty. Nothing to evaluate.")
        return

    hits = {1: 0, 3: 0, 5: 0}  # k -> number of samples whose label is in the top k
    unseen_sequences = 0

    for start in range(0, total, eval_batch_size):
        batch_sequences = np.asarray(X_test[start:start + eval_batch_size])
        batch_labels = np.asarray(y_test[start:start + eval_batch_size])

        # One forward pass per chunk; copied to float64 so the Markov
        # probabilities can be accumulated in place.
        combined_probs = np.array(model.predict(batch_sequences, verbose=0), dtype=np.float64)

        for row, sequence in enumerate(batch_sequences):
            markov_predictions = multi_order_probs.get(tuple(sequence), {})
            if not markov_predictions:
                unseen_sequences += 1
            for url, prob in markov_predictions.items():
                combined_probs[row, url] += prob

        # Five best class indices per row, best first.
        top_5 = np.argsort(combined_probs, axis=1)[:, -5:][:, ::-1]
        matches = top_5 == batch_labels[:, None]
        for k in hits:
            hits[k] += int(matches[:, :k].any(axis=1).sum())

    if unseen_sequences:
        print(f"Warning: {unseen_sequences} of {total} test sequences not found in training data. "
              "Using only Neural Network predictions for those.")

    accuracy = hits[1] / total
    top_3_accuracy = hits[3] / total
    top_5_accuracy = hits[5] / total

    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(f"Top-3 Accuracy: {top_3_accuracy * 100:.2f}%")
    print(f"Top-5 Accuracy: {top_5_accuracy * 100:.2f}%")

In [None]:
# Example prediction: run the combined Markov + neural-network predictor on
# the first held-out sequence and show the decoded URL it returns.
example_sequence = X_test[0]
predicted_url = predict_next_url(example_sequence)
print("Predicted URL:", predicted_url)

# Evaluate model: prints top-1, top-3 and top-5 accuracy over all of X_test.
evaluate_model()