In [1]:
import re
import pandas as pd
from collections import defaultdict

In [2]:
# 1. Load the normalized CSV export
file_path = "../log-extractor/extracted_data_normalized.csv"  # output of the normalization step
df = pd.read_csv(file_path)

# Fail fast when any column used by the later cells is absent
if any(column not in df.columns for column in ('Time', 'IP', 'Normalized_URL')):
    raise ValueError("Kolom 'Time', 'IP', atau 'Normalized_URL' tidak ditemukan dalam dataset.")

In [3]:
# 2. Parse the timestamp column so rows can be ordered and split chronologically
df['Time'] = pd.to_datetime(df['Time'], format='%d/%b/%Y:%H:%M:%S %z')

# Sort by time with a stable algorithm. The timestamp format above has
# one-second resolution, so several rows can share the same value; the default
# quicksort gives no ordering guarantee for such ties, whereas mergesort keeps
# tied rows in their original file order. The per-IP URL sequences built in
# the following cells depend on that row order.
df = df.sort_values(by='Time', kind='mergesort')

In [4]:
# 3. Split the rows 80:20 by position: the first 80% become training data,
#    the remainder becomes testing data
split_index = int(0.8 * len(df))
# Explicit copies so neither split is a view onto df
training_data, testing_data = (
    part.copy() for part in (df.iloc[:split_index], df.iloc[split_index:])
)

In [5]:
# 4. Fit the higher-order Markov model on the training split
# A state is the pair formed by the two most recent URLs
transitions = defaultdict(lambda: defaultdict(int))

# Group rows by IP so each client's URL sequence is processed separately
grouped = training_data.groupby('IP')

for _, session in grouped:
    urls = session['Normalized_URL'].tolist()
    # Slide a window of three URLs: (previous, current) -> next
    for prev_url, curr_url, next_url in zip(urls, urls[1:], urls[2:]):
        transitions[(prev_url, curr_url)][next_url] += 1

# Turn the raw counts into transition probabilities per state
markov_model_higher = {}
for pair, counts in transitions.items():
    denominator = sum(counts.values())
    markov_model_higher[pair] = {target: hits / denominator for target, hits in counts.items()}

In [6]:
# 5. Prediction helper for the next URL
def predict_next_url_higher_order(current_state, model, top_n=1):
    """Return the most probable next URLs for a given state.

    Args:
        current_state: Key to look up in ``model`` (the callers in this
            notebook pass a tuple of the two most recent URLs).
        model: Mapping from state to a mapping of ``next_url -> probability``.
        top_n: Maximum number of URLs to return.

    Returns:
        ``None`` when ``current_state`` is not a key of ``model``; otherwise a
        list of at most ``top_n`` URLs ordered by descending probability.
        URLs with equal probability keep the iteration order of the inner
        mapping, because the sort is stable.
    """
    if current_state in model:
        distribution = model[current_state]
        # Rank candidate URLs from most to least probable
        ranked = sorted(distribution, key=lambda url: distribution[url], reverse=True)
        return ranked[:top_n]

    # Unseen state: no prediction available
    return None

In [7]:
# 6. Top-1 evaluation on the testing split
correct_predictions = 0
total_predictions = 0

grouped_testing = testing_data.groupby('IP')

for _, session in grouped_testing:
    urls = session['Normalized_URL'].tolist()
    # Each window of three URLs yields one test case: (previous, current) -> actual next
    for prev_url, curr_url, actual_next_url in zip(urls, urls[1:], urls[2:]):
        predicted_urls = predict_next_url_higher_order(
            (prev_url, curr_url), markov_model_higher, top_n=1
        )

        # Every window counts as an attempt; states missing from the model are misses
        total_predictions += 1
        if predicted_urls and actual_next_url in predicted_urls:
            correct_predictions += 1

# Accuracy is the share of correct predictions (0 when nothing was evaluated)
if total_predictions > 0:
    accuracy = correct_predictions / total_predictions
else:
    accuracy = 0
print(f"Akurasi prediksi URL berikutnya (Higher-Order Markov Model): {accuracy:.2%}")

Akurasi prediksi URL berikutnya (Higher-Order Markov Model): 39.67%


In [8]:
# Top-k accuracy evaluation
def evaluate_top_k_accuracy_higher_order(testing_data, model, top_k=3):
    """Compute top-k next-URL accuracy of the model on ``testing_data``.

    Rows are grouped by the 'IP' column and the 'Normalized_URL' values of
    each group are read in row order. Every run of three consecutive URLs is
    one test case: the first two form the state and the third is the expected
    answer. A case is a hit when the expected URL is among the ``top_k`` URLs
    returned by ``predict_next_url_higher_order``; states for which no
    prediction is returned count as misses.

    Args:
        testing_data: DataFrame with 'IP' and 'Normalized_URL' columns.
        model: Transition model passed through to
            ``predict_next_url_higher_order``.
        top_k: Number of candidate URLs requested per prediction.

    Returns:
        Hits divided by evaluated cases, or 0 when there were no cases.
    """
    hits = 0
    attempts = 0

    for _, session in testing_data.groupby('IP'):
        urls = session['Normalized_URL'].tolist()
        for prev_url, curr_url, actual_next_url in zip(urls, urls[1:], urls[2:]):
            candidates = predict_next_url_higher_order((prev_url, curr_url), model, top_n=top_k)
            attempts += 1
            if candidates and actual_next_url in candidates:
                hits += 1

    if attempts == 0:
        return 0
    return hits / attempts

# Top-3 accuracy: a prediction is a hit when the actual next URL is among the three candidates
top_k_accuracy = evaluate_top_k_accuracy_higher_order(
    testing_data,
    markov_model_higher,
    top_k=3,
)
print(f"Top-3 Accuracy (Higher-Order Markov Model): {top_k_accuracy:.2%}")

Top-3 Accuracy (Higher-Order Markov Model): 61.53%
