In [12]:
import pandas as pd
from collections import defaultdict
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import pickle

In [None]:
# Baca file dataset
file_path = "../log-extractor/extracted_data_normalized_filtered_part1.csv"
df = pd.read_csv(file_path)

# Pastikan kolom yang diperlukan tersedia
if not {'IP', 'Normalized_URL', 'Time'}.issubset(df.columns):
    raise ValueError("Kolom yang diperlukan tidak ditemukan dalam dataset.")

# Konversi waktu dan urutkan berdasarkan waktu
df['Time'] = pd.to_datetime(df['Time'], format='%d/%b/%Y:%H:%M:%S %z')
df = df.sort_values(by='Time')

# Hitung jumlah URL unik
url_counts = df['Normalized_URL'].value_counts()

# Hanya pertahankan URL dengan jumlah data >= 2
valid_urls = url_counts[url_counts >= 2].index
df = df[df['Normalized_URL'].isin(valid_urls)]

print(f"Dataset setelah menghapus kelas dengan jumlah data < 2: {len(df)} baris")

# Stratified sampling berdasarkan kolom 'Normalized_URL'
training_data, testing_data = train_test_split(
    df,
    test_size=0.2,  # 20% untuk testing
    stratify=df['Normalized_URL'],  # Stratify berdasarkan URL
    random_state=42  # Seed untuk hasil sampling konsisten
)

print(f"Jumlah data training: {len(training_data)}")
print(f"Jumlah data testing: {len(testing_data)}")

# Simpan hasil pembagian ke file (opsional)
training_data.to_csv("../log-extractor/training_data.csv", index=False)
testing_data.to_csv("../log-extractor/testing_data.csv", index=False)
print("Data training dan testing disimpan ke file.")

Dataset setelah menghapus kelas dengan jumlah data < 2: 870656 baris
Jumlah data training: 696524
Jumlah data testing: 174132
Data training dan testing disimpan ke file.


In [3]:
# Fungsi untuk membangun Markov Chain
def build_markov_chain(training_data, order=1):
    transitions = defaultdict(lambda: defaultdict(int))
    grouped = training_data.groupby('IP')

    for ip, group in grouped:
        urls = group['Normalized_URL'].tolist()
        for i in range(len(urls) - order):
            state = tuple(urls[i:i+order])  # State terdiri dari 'order' URL terakhir
            next_url = urls[i+order]
            transitions[state][next_url] += 1

    # Konversi ke probabilitas
    markov_model = {}
    for state, next_urls in transitions.items():
        total_transitions = sum(next_urls.values())
        markov_model[state] = {url: count / total_transitions for url, count in next_urls.items()}
    
    return markov_model

In [4]:
# Bangun Markov Chains Multi-Orde
markov_models = {}
max_order = 4  # Anda dapat menyesuaikan max_order
for order in range(1, max_order + 1):
    markov_models[order] = build_markov_chain(training_data, order=order)

print(f"Dibangun Markov chains untuk orde 1 hingga {max_order}")

Dibangun Markov chains untuk orde 1 hingga 4


In [5]:
def prepare_nn_dataset(testing_data, markov_models, max_order=4):
    """
    Siapkan dataset untuk Neural Network dari data testing dan Markov Chains.
    
    Args:
        testing_data (DataFrame): Data testing dengan kolom 'IP' dan 'Normalized_URL'.
        markov_models (dict): Multi-Orde Markov Chains yang sudah dibangun.
        max_order (int): Orde maksimum dari Markov Chains yang digunakan.
    
    Returns:
        tuple: (X, y), di mana:
               - X adalah array input untuk Neural Network.
               - y adalah array target (URL berikutnya).
    """
    X = []
    y = []

    grouped = testing_data.groupby('IP')

    for ip, group in grouped:
        urls = group['Normalized_URL'].tolist()
        for i in range(max_order, len(urls)):  # Pastikan cukup data untuk max_order
            # Siapkan input dari probabilitas prediksi setiap orde
            input_vector = []
            for order in range(1, max_order + 1):
                state = tuple(urls[i-order:i])  # State untuk orde tertentu
                if state in markov_models[order]:
                    input_vector.append(markov_models[order][state].get(urls[i], 0))  # Probabilitas URL berikutnya
                else:
                    input_vector.append(0)  # Jika state tidak ditemukan
            
            # Target adalah URL berikutnya
            target_url = urls[i]
            
            # Masukkan ke dataset
            X.append(input_vector)
            y.append(target_url)
    
    return np.array(X), np.array(y)

In [6]:
# Siapkan dataset Neural Network
X_train, y_train = prepare_nn_dataset(training_data, markov_models, max_order=max_order)
X_test, y_test = prepare_nn_dataset(testing_data, markov_models, max_order=max_order)

print(f"Dataset Neural Network selesai disiapkan:")
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

Dataset Neural Network selesai disiapkan:
X_train shape: (693061, 4), y_train shape: (693061,)
X_test shape: (170860, 4), y_test shape: (170860,)


In [7]:
# Encode target (y_train dan y_test)
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

print(f"Jumlah kelas unik (URL): {len(encoder.classes_)}")

Jumlah kelas unik (URL): 1874


In [14]:
# Bangun neural network dengan 2 hidden layers
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),  # Ukuran hidden layer
    max_iter=100,                # Maksimal iterasi
    random_state=42,             # Seed untuk reproduksibilitas
    verbose=True,                # Tampilkan log pelatihan
    learning_rate_init=0.001     # Learning rate awal
)

# Latih neural network
print("Melatih Neural Network...")
mlp.fit(X_train, y_train_encoded)
print("Pelatihan selesai.")

Melatih Neural Network...
Iteration 1, loss = 4.27305226
Iteration 2, loss = 3.90225977
Iteration 3, loss = 3.82137598
Iteration 4, loss = 3.77939442
Iteration 5, loss = 3.74928017
Iteration 6, loss = 3.72577778
Iteration 7, loss = 3.70497983
Iteration 8, loss = 3.68570729
Iteration 9, loss = 3.66745998
Iteration 10, loss = 3.65128779
Iteration 11, loss = 3.63719737
Iteration 12, loss = 3.62541096
Iteration 13, loss = 3.61332842
Iteration 14, loss = 3.60338641
Iteration 15, loss = 3.59369980
Iteration 16, loss = 3.58484773
Iteration 17, loss = 3.57730791
Iteration 18, loss = 3.57073077
Iteration 19, loss = 3.56436059
Iteration 20, loss = 3.55861620
Iteration 21, loss = 3.55422101
Iteration 22, loss = 3.54870075
Iteration 23, loss = 3.54434812
Iteration 24, loss = 3.54010050
Iteration 25, loss = 3.53632412
Iteration 26, loss = 3.53232876
Iteration 27, loss = 3.52864685
Iteration 28, loss = 3.52457915
Iteration 29, loss = 3.52133351
Iteration 30, loss = 3.51775015
Iteration 31, loss = 3.



In [15]:
# Evaluasi model
train_accuracy = mlp.score(X_train, y_train_encoded)
test_accuracy = mlp.score(X_test, y_test_encoded)

print(f"Train Accuracy: {train_accuracy:.2%}")
print(f"Test Accuracy: {test_accuracy:.2%}")

Train Accuracy: 33.48%
Test Accuracy: 27.36%


In [16]:
# Simpan model
with open("neural_network_model.pkl", "wb") as model_file:
    pickle.dump(mlp, model_file)

# Simpan encoder
with open("label_encoder.pkl", "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

print("Model dan encoder telah disimpan.")

Model dan encoder telah disimpan.


In [19]:
# Mendapatkan Output Neural Network
# Prediksi probabilitas menggunakan Neural Network
nn_predictions = mlp.predict_proba(X_test)

# Cek dimensi hasil prediksi
print(f"Dimensi prediksi Neural Network: {nn_predictions.shape}")

Dimensi prediksi Neural Network: (170860, 1874)


In [20]:
# Integrasi Output Neural Network ke Multi-Order Markov Chains
def hybrid_predict(current_state, markov_models, nn_prediction, max_order=4):
    """
    Membuat prediksi hybrid berdasarkan Multi-Order Markov Chains dan Neural Network.

    Args:
        current_state (tuple): State saat ini untuk Markov Chains.
        markov_models (dict): Dictionary Multi-Order Markov Chains.
        nn_prediction (array): Prediksi probabilitas Neural Network untuk URL berikutnya.
        max_order (int): Orde maksimum dari Markov Chains.

    Returns:
        list: URL yang diprediksi (berdasarkan probabilitas tertinggi).
    """
    hybrid_scores = defaultdict(float)

    # Gabungkan probabilitas dari setiap Markov Chain dengan bobot dari Neural Network
    for order in range(1, max_order + 1):
        state = tuple(current_state[-order:])  # State untuk orde tertentu
        if state in markov_models[order]:
            for next_url, prob in markov_models[order][state].items():
                nn_index = encoder.transform([next_url])[0]  # Cari indeks URL di encoder
                hybrid_scores[next_url] += prob * nn_prediction[nn_index]

    # Urutkan berdasarkan skor hybrid
    sorted_predictions = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
    return [url for url, score in sorted_predictions]

In [21]:
# Evaluasi Hybrid Prediction
correct_predictions = 0
total_predictions = 0

for i in range(len(X_test)):
    current_state = X_test[i]  # State saat ini
    actual_next_url = y_test[i]  # URL berikutnya yang sebenarnya

    # Prediksi hybrid
    nn_prediction = nn_predictions[i]
    predicted_urls = hybrid_predict(current_state, markov_models, nn_prediction, max_order=4)

    if predicted_urls and actual_next_url in predicted_urls[:1]:  # Top-1 prediction
        correct_predictions += 1
    total_predictions += 1

# Hitung akurasi
hybrid_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
print(f"Hybrid Prediction Accuracy (Top-1): {hybrid_accuracy:.2%}")


Hybrid Prediction Accuracy (Top-1): 0.00%
