<a href="https://colab.research.google.com/github/irwardhana/DEVSAT_ENV/blob/main/Network_Fault_Prediction_BAB_IV_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Network Fault Prediction – BAB IV Generator (Google Colab)
Notebook ini membantu Anda **membangun BAB IV** secara otomatis dari dataset jaringan Anda.

**Alur kerja:**  
1) Instal dependensi → 2) Upload `Sample_Network_Dataset_Enhanced.xlsx` →  
3) Jalankan pemodelan (Random Forest & XGBoost) →  
4) Hasilkan dokumen Word: **`BAB_IV_HASIL_DAN_ANALISIS_MODEL_PREDIKSI_GANGGUAN_JARINGAN.docx`** →  
5) Unduh dokumen.

> *This notebook builds Chapter IV (Results & Analysis) from your dataset with RF/XGBoost, generates ROC & Feature Importance plots, and exports a Word (.docx) report.*


## 1) Instalasi Dependensi

In [1]:
!pip install -q pandas numpy matplotlib seaborn scikit-learn xgboost imbalanced-learn joblib python-docx openpyxl

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m225.3/253.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

## 2) Upload Dataset
Upload file **`Sample_Network_Dataset_Enhanced.xlsx`** (harus persis nama ini agar skrip berjalan otomatis).


In [2]:
from google.colab import files

# The analysis cell reads the workbook by this exact file name.
EXPECTED_FILE = "Sample_Network_Dataset_Enhanced.xlsx"

uploaded = files.upload()  # choose Sample_Network_Dataset_Enhanced.xlsx
print('Uploaded:', list(uploaded.keys()))

# Warn right away instead of letting the analysis cell fail later (or silently
# reuse an older copy) when the expected name is not among the uploaded files.
if EXPECTED_FILE not in uploaded:
    print(f"PERINGATAN: file '{EXPECTED_FILE}' tidak ada di antara file yang diunggah; "
          "sel analisis membutuhkan nama file tersebut.")

Saving Sample_Network_Dataset_Enhanced.xlsx to Sample_Network_Dataset_Enhanced.xlsx
Uploaded: ['Sample_Network_Dataset_Enhanced.xlsx']


## 3) Proses Analisis, Pelatihan, & Pembuatan Dokumen BAB IV
Langkah ini akan:
- Membaca 3 sheet: `SNMP_Data`, `Syslog_Data`, `Ticket_Data`
- Rekayasa fitur & penggabungan per timestamp/perangkat
- Train **Random Forest** & **XGBoost**
- Menghasilkan grafik **ROC Curve** (biru–oranye) dan **Feature Importance**
- Menyusun dokumen Word dengan format tesis: **Tahoma 11 pt, spasi 1.5, margin 3–4–3–3 cm, portrait, nomor halaman kanan bawah**.


In [7]:
import os, sys, warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_auc_score, roc_curve, accuracy_score)
from imblearn.over_sampling import SMOTE
import xgboost as xgb

from docx import Document
from docx.shared import Pt, Cm
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.oxml import OxmlElement
from docx.oxml.ns import qn

# Input workbook, generated report and folder for the exported figures.
DATA_FILE = "Sample_Network_Dataset_Enhanced.xlsx"
OUTPUT_DOCX = "BAB_IV_HASIL_DAN_ANALISIS_MODEL_PREDIKSI_GANGGUAN_JARINGAN.docx"
PLOT_DIR = "plots_bab_iv"
os.makedirs(PLOT_DIR, exist_ok=True)

# Global plotting defaults: seaborn grid theme, on-screen/saved resolution, font size.
sns.set(style="whitegrid")
plt.rcParams["figure.dpi"] = 150
plt.rcParams["savefig.dpi"] = 300
plt.rcParams["font.size"] = 10

def save_plot(fig, filename, width_cm=14):
    """Resize, save and close a figure.

    The figure is rescaled to ``width_cm`` centimetres wide while keeping its
    current height/width ratio (0.6 is used when the current width is zero),
    written at 300 dpi to ``PLOT_DIR/filename`` and then closed.

    Returns the path of the written image.
    """
    current_w, current_h = fig.get_size_inches()
    if current_w:
        ratio = current_h / current_w
    else:
        ratio = 0.6

    target_w = width_cm / 2.54  # centimetres -> inches
    fig.set_size_inches(target_w, target_w * ratio)
    fig.tight_layout()

    destination = os.path.join(PLOT_DIR, filename)
    fig.savefig(destination, dpi=300)
    plt.close(fig)
    return destination

# --- Load data ---
# Read the three sheets inside a context manager so the workbook handle is
# closed as soon as the frames are in memory.
with pd.ExcelFile(DATA_FILE, engine="openpyxl") as xls:
    snmp = pd.read_excel(xls, sheet_name="SNMP_Data")
    syslog = pd.read_excel(xls, sheet_name="Syslog_Data")
    tickets = pd.read_excel(xls, sheet_name="Ticket_Data")

# Parse the time columns; `frame` (not `df`) is used as the loop variable so it
# does not shadow the merged frame that is built further down.
for frame in (snmp, syslog):
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
for col in ['start_time','end_time']:
    if col in tickets.columns:
        tickets[col] = pd.to_datetime(tickets[col])

# --- Preprocess & merge ---
# Order both sources per device and time so the rolling windows below run over
# consecutive samples of the same device.
snmp = snmp.sort_values(['device_id','timestamp']).reset_index(drop=True)
syslog = syslog.sort_values(['device_id','timestamp']).reset_index(drop=True)

# NOTE(review): the rolling windows are 3 consecutive rows per device, not a
# time span; they equal "5 min"/"10 min" only for a matching sampling interval
# -- confirm against the dataset.
if 'avg_latency_5min' not in snmp.columns:
    snmp['avg_latency_5min_calc'] = snmp.groupby('device_id')['latency (ms)'].transform(lambda s: s.rolling(window=3, min_periods=1).mean())
else:
    snmp['avg_latency_5min_calc'] = snmp['avg_latency_5min']

if 'max_cpu_10min' not in snmp.columns:
    snmp['max_cpu_10min_calc'] = snmp.groupby('device_id')['cpu_usage (%)'].transform(lambda s: s.rolling(window=3, min_periods=1).max())
else:
    snmp['max_cpu_10min_calc'] = snmp['max_cpu_10min']

# Error rate: use the provided column, else derive it from the error count,
# else fall back to zero.
if 'error_rate_5min' in syslog.columns:
    syslog['error_rate_5min_calc'] = syslog['error_rate_5min']
elif 'error_count_last_5min' in syslog.columns:
    syslog['error_rate_5min_calc'] = syslog.groupby('device_id')['error_count_last_5min'].transform(lambda s: s.rolling(window=3, min_periods=1).mean())
else:
    syslog['error_rate_5min_calc'] = 0.0

# The column selection below needs 'error_count_last_5min'; without this guard a
# workbook lacking that column raised KeyError even though the branches above
# explicitly support it. 0 matches the default used for devices without syslog rows.
if 'error_count_last_5min' not in syslog.columns:
    syslog['error_count_last_5min'] = 0

syslog_small = syslog[['timestamp','device_id','error_rate_5min_calc','error_count_last_5min']].copy()

from pandas import merge_asof

# For every device, attach to each SNMP sample the latest syslog record at or
# before its timestamp; devices without syslog rows get zero error features.
syslog_fields = ['timestamp','error_rate_5min_calc','error_count_last_5min']
merged_list = []
for dev in snmp['device_id'].unique():
    snmp_dev = snmp.loc[snmp['device_id'] == dev].sort_values('timestamp').reset_index(drop=True)
    syslog_dev = syslog_small.loc[syslog_small['device_id'] == dev].sort_values('timestamp').reset_index(drop=True)
    if not syslog_dev.empty:
        joined = merge_asof(snmp_dev, syslog_dev[syslog_fields], on='timestamp', direction='backward')
        joined = joined.rename(columns={'error_rate_5min_calc':'error_rate_5min'})
        # SNMP samples that precede the first syslog record have no match.
        joined['error_rate_5min'] = joined['error_rate_5min'].fillna(0)
        joined['error_count_last_5min'] = joined['error_count_last_5min'].fillna(0)
        merged_list.append(joined)
    else:
        snmp_dev['error_rate_5min'] = 0.0
        snmp_dev['error_count_last_5min'] = 0
        merged_list.append(snmp_dev)
df = pd.concat(merged_list).reset_index(drop=True)

# Keep only the first occurrence of any column name that appears more than once
# (e.g. a duplicated 'error_rate_5min').
duplicated_cols = df.columns.duplicated()
if duplicated_cols.any():
    df = df.loc[:, ~duplicated_cols]


# Labelling with a 30-minute prediction window: a sample is positive (1) when
# its timestamp lies in [predicted_window_start, start_time] of any ticket of
# the same device, otherwise 0.
PREDICTION_WINDOW_MIN = 30
if 'predicted_window_start' not in tickets.columns:
    tickets['predicted_window_start'] = tickets['start_time'] - pd.Timedelta(minutes=PREDICTION_WINDOW_MIN)

window_start = pd.to_datetime(tickets['predicted_window_start'])
window_end = pd.to_datetime(tickets['start_time'])

# Vectorised per device instead of iterrows(): each device's timestamps are
# compared against all of that device's ticket windows in one broadcast.
# Missing (NaT) bounds simply never match.
labels = pd.Series(0, index=df.index, dtype=int)
for dev, dev_timestamps in df.groupby('device_id')['timestamp']:
    dev_tickets = tickets['device_id'] == dev
    if not dev_tickets.any():
        continue
    starts = window_start[dev_tickets].to_numpy()
    ends = window_end[dev_tickets].to_numpy()
    ts = dev_timestamps.to_numpy()[:, None]          # shape (samples, 1)
    in_window = ((ts >= starts) & (ts <= ends)).any(axis=1)
    labels.loc[dev_timestamps.index] = in_window.astype(int)
df['label'] = labels

# --- Features & split ---
feature_cols = [
    'cpu_usage (%)','memory_usage (%)','latency (ms)','packet_loss (%)',
    'bandwidth_utilization (%)','avg_latency_5min_calc','max_cpu_10min_calc',
    'error_rate_5min','error_count_last_5min'
]
# Any feature missing from the merged frame is filled with zeros.
for c in feature_cols:
    if c not in df.columns:
        df[c] = 0.0

# Sort chronologically BEFORE deriving X and y. Previously X/y were built from
# the device-ordered frame and then indexed with positions taken from the
# re-indexed, time-sorted frame, so the "time-based" 80/20 split actually cut
# the data by device order.
df = df.sort_values('timestamp').reset_index(drop=True)
X = df[feature_cols].fillna(0).astype(float)
y = df['label'].astype(int)

# First 80% of the timeline for training, last 20% for testing.
split_idx = int(len(df)*0.8)
train_idx = df.index[:split_idx]
test_idx = df.index[split_idx:]

X_train = X.loc[train_idx].reset_index(drop=True)
y_train = y.loc[train_idx].reset_index(drop=True)
X_test = X.loc[test_idx].reset_index(drop=True)
y_test = y.loc[test_idx].reset_index(drop=True)

# --- SMOTE & scaling ---
# Scale first, fitting the scaler on the real training rows only: SMOTE picks
# neighbours by distance, so unscaled features with large ranges would dominate
# it, and fitting the scaler on synthetic rows would skew its statistics.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE needs more minority samples than k_neighbors (default 5) and at least
# two classes; shrink k for small minorities and skip resampling when it is
# impossible instead of crashing.
class_counts = y_train.value_counts()
minority_count = int(class_counts.min()) if len(class_counts) else 0
if len(class_counts) > 1 and minority_count >= 2:
    sm = SMOTE(random_state=42, k_neighbors=min(5, minority_count - 1))
    X_train_scaled, y_train_res = sm.fit_resample(X_train_std, y_train)
else:
    sm = None
    X_train_scaled, y_train_res = X_train_std, y_train

# Kept so the name remains available; it now refers to the scaled, resampled matrix.
X_train_res = X_train_scaled

# --- Train models ---
rf = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42, n_jobs=-1)
rf.fit(X_train_scaled, y_train_res)
y_pred_rf = rf.predict(X_test_scaled)
y_prob_rf = rf.predict_proba(X_test_scaled)[:,1]

# Derive the positive-class weight from the labels the model is actually
# trained on. Using the pre-resampling counts together with SMOTE-balanced data
# compensated the imbalance twice and biased XGBoost towards the positive class.
neg, pos = (y_train_res==0).sum(), (y_train_res==1).sum()
scale_pos_weight = neg / max(1, pos)
# `use_label_encoder` is omitted: recent XGBoost releases no longer use it and
# only warn when it is passed.
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc',
                            random_state=42, scale_pos_weight=scale_pos_weight, n_estimators=200, n_jobs=-1)
xgb_clf.fit(X_train_scaled, y_train_res, verbose=False)
y_pred_xgb = xgb_clf.predict(X_test_scaled)
y_prob_xgb = xgb_clf.predict_proba(X_test_scaled)[:,1]

# --- Metrics ---
rf_report = classification_report(y_test, y_pred_rf, output_dict=True)
xgb_report = classification_report(y_test, y_pred_xgb, output_dict=True)

# ROC AUC is undefined when the test labels contain a single class.
has_both_classes = len(np.unique(y_test)) > 1
rf_auc = roc_auc_score(y_test, y_prob_rf) if has_both_classes else float("nan")
xgb_auc = roc_auc_score(y_test, y_prob_xgb) if has_both_classes else float("nan")

# Fix the label set so the matrices are always 2x2; without it a single-class
# test set yields a 1x1 matrix and the [1, 1] lookups in the report tables fail.
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])
cm_xgb = confusion_matrix(y_test, y_pred_xgb, labels=[0, 1])

# --- ROC plot ---
# Curves are only computed when the test labels contain both classes;
# otherwise empty placeholders keep the plotting code below uniform.
if len(np.unique(y_test)) > 1:
    fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
    fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_prob_xgb)
else:
    fpr_rf, tpr_rf, _ = [], [], []
    fpr_xgb, tpr_xgb, _ = [], [], []

fig_roc, ax_roc = plt.subplots(figsize=(6,4))
if len(fpr_rf) > 0:
    ax_roc.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC={rf_auc:.3f})', color='tab:blue')
if len(fpr_xgb) > 0:
    ax_roc.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC={xgb_auc:.3f})', color='tab:orange')
# Diagonal reference line of a random classifier.
ax_roc.plot([0,1],[0,1],'k--', linewidth=0.6)
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc='lower right')
roc_path = save_plot(fig_roc, 'roc_curve_bab_iv.png', width_cm=14)

# --- Feature importance plots ---
# Importances are labelled with the feature names and ranked from most to
# least important for each model.
feature_cols_order = feature_cols  # for index alignment
rf_imp = pd.Series(rf.feature_importances_, index=feature_cols_order).sort_values(ascending=False)
xgb_imp = pd.Series(xgb_clf.feature_importances_, index=feature_cols_order).sort_values(ascending=False)

fig_rf, ax_rf = plt.subplots(figsize=(6,4))
rf_imp.plot(kind='bar', color='tab:blue', ax=ax_rf)
ax_rf.set_title('Feature Importance - Random Forest')
rf_feat_path = save_plot(fig_rf, 'feature_importance_rf_bab_iv.png', width_cm=14)

fig_xgb, ax_xgb = plt.subplots(figsize=(6,4))
xgb_imp.plot(kind='bar', color='tab:orange', ax=ax_xgb)
ax_xgb.set_title('Feature Importance - XGBoost')
xgb_feat_path = save_plot(fig_xgb, 'feature_importance_xgb_bab_iv.png', width_cm=14)

# --- Build Word doc ---
doc = Document()

# Thesis body format: Tahoma 11 pt with 1.5 line spacing. The spacing was
# promised by the notebook description but never applied.
style = doc.styles['Normal']
style.font.name = 'Tahoma'
style.font.size = Pt(11)
style.paragraph_format.line_spacing = 1.5

# Page margins: top 3 cm, left 4 cm, right 3 cm, bottom 3 cm.
sec = doc.sections[0]
sec.top_margin = Cm(3)
sec.left_margin = Cm(4)
sec.right_margin = Cm(3)
sec.bottom_margin = Cm(3)

# Page number at the bottom right: a PAGE field appended to the footer paragraph.
footer = sec.footer
p = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph()
p.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
fld = OxmlElement('w:fldSimple')
fld.set(qn('w:instr'), 'PAGE')
p._p.append(fld)

# Chapter title: two left-aligned, bold, 12 pt lines.
for heading_text in ("BAB IV", "HASIL DAN ANALISIS"):
    heading_par = doc.add_paragraph()
    heading_par.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
    heading_run = heading_par.add_run(heading_text)
    heading_run.bold = True
    heading_run.font.size = Pt(12)

# Opening paragraph, section 4.1 and the heading of section 4.2, each written
# as its own paragraph; empty strings are blank spacer paragraphs.
intro_paragraphs = [
    "Bab ini menyajikan hasil dan analisis dari penerapan model prediksi gangguan jaringan menggunakan dua pendekatan Machine Learning (Random Forest dan XGBoost). Data bersumber dari hasil simulasi SNMP, Syslog, dan Ticket untuk periode Januari–Juni 2025. Hasil evaluasi disajikan melalui metrik klasifikasi, Confusion Matrix, ROC Curve, serta Feature Importance.",
    "",
    "4.1 GAMBARAN UMUM EKSPERIMEN",
    "Konfigurasi eksperimen meliputi pembagian data berbasis waktu (80/20), penanganan ketidakseimbangan kelas menggunakan SMOTE, standarisasi fitur, serta pelatihan model Random Forest dan XGBoost.",
    "",
    "4.2 EVALUASI MODEL",
]
for paragraph_text in intro_paragraphs:
    doc.add_paragraph(paragraph_text)
def get_metric(report_dict, metric='f1-score'):
    """Return ``metric`` for class '1' from a classification-report dict.

    When the report has no '1' entry the 'macro avg' entry is used instead;
    a missing entry or missing metric yields 0.0.
    """
    if '1' in report_dict:
        scores = report_dict['1']
    else:
        scores = report_dict.get('macro avg', {})
    return scores.get(metric, 0.0)

# Headline scores per model: accuracy plus precision / recall / F1 taken
# through get_metric.
rf_acc = accuracy_score(y_test, y_pred_rf)
xgb_acc = accuracy_score(y_test, y_pred_xgb)
rf_prec, rf_rec, rf_f1 = (get_metric(rf_report, name) for name in ('precision', 'recall', 'f1-score'))
xgb_prec, xgb_rec, xgb_f1 = (get_metric(xgb_report, name) for name in ('precision', 'recall', 'f1-score'))

# Summary table: one header row followed by one row per model, values to 4 decimals.
tbl = doc.add_table(rows=1, cols=6)
tbl.style = 'Table Grid'
for header_cell, caption in zip(tbl.rows[0].cells,
                                ('Model', 'Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC')):
    header_cell.text = caption

model_scores = (
    ('Random Forest', (rf_acc, rf_prec, rf_rec, rf_f1, rf_auc)),
    ('XGBoost', (xgb_acc, xgb_prec, xgb_rec, xgb_f1, xgb_auc)),
)
for model_name, scores in model_scores:
    cells = tbl.add_row().cells
    cells[0].text = model_name
    for position, score in enumerate(scores, start=1):
        cells[position].text = f"{score:.4f}"
doc.add_paragraph("Sumber: Hasil olahan penulis, 2025")

# Compact confusion-matrix tables, one per model.
def _add_confusion_table(document, caption, cm):
    """Append a blank line, ``caption`` and a 3x3 grid table showing the 2x2
    confusion matrix ``cm`` (rows = actual, columns = predicted)."""
    document.add_paragraph("")
    document.add_paragraph(caption)
    table = document.add_table(rows=3, cols=3)
    table.style = 'Table Grid'

    header = table.rows[0].cells
    header[0].text = ""
    header[1].text = "Pred: Normal"
    header[2].text = "Pred: Gangguan"

    for class_idx, row_caption in enumerate(("Actual: Normal", "Actual: Gangguan")):
        cells = table.rows[class_idx + 1].cells
        cells[0].text = row_caption
        cells[1].text = str(cm[class_idx, 0])
        cells[2].text = str(cm[class_idx, 1])
    return table

cm_tbl = _add_confusion_table(doc, "Confusion Matrix - Random Forest", cm_rf)
cm_tbl2 = _add_confusion_table(doc, "Confusion Matrix - XGBoost", cm_xgb)

# ROC & Feature Importance images
def _add_figure(document, caption, image_path):
    """Append a blank line, the caption paragraph and the image at 14 cm width."""
    document.add_paragraph("")
    document.add_paragraph(caption)
    picture_run = document.add_paragraph().add_run()
    picture_run.add_picture(image_path, width=Cm(14))

_add_figure(doc, "Gambar 4.1 ROC Curve (Random Forest vs XGBoost)", roc_path)

# Section 4.3: ranking table of the most important features of both models.
doc.add_paragraph("")
doc.add_paragraph("4.3 PENTINGNYA FITUR (FEATURE IMPORTANCE)")
tbl2 = doc.add_table(rows=1, cols=3)
tbl2.style = 'Table Grid'
for header_cell, caption in zip(tbl2.rows[0].cells,
                                ("Peringkat", "Random Forest (Feature)", "XGBoost (Feature)")):
    header_cell.text = caption

# At most eight features are listed per model.
top_n = min(8, len(feature_cols))
rf_top = rf_imp.index[:top_n].tolist()
xgb_top = xgb_imp.index[:top_n].tolist()
for rank, (rf_name, xgb_name) in enumerate(zip(rf_top, xgb_top), start=1):
    cells = tbl2.add_row().cells
    cells[0].text = str(rank)
    cells[1].text = f"{rf_name} ({rf_imp[rf_name]:.3f})"
    cells[2].text = f"{xgb_name} ({xgb_imp[xgb_name]:.3f})"
doc.add_paragraph("Sumber: Hasil olahan penulis, 2025")

_add_figure(doc, "Gambar 4.2 Feature Importance - Random Forest", rf_feat_path)
_add_figure(doc, "Gambar 4.3 Feature Importance - XGBoost", xgb_feat_path)

# 4.4 & 4.5
doc.add_paragraph("")
doc.add_paragraph("4.4 ANALISIS KESELURUHAN")

# Build the conclusion from the computed metrics. The previous text always
# declared XGBoost the better model (and mixed English into the sentence),
# which could contradict the numbers in the table above.
analysis_intro = (
    f"Random Forest memperoleh ROC AUC {rf_auc:.4f} dan recall {rf_rec:.4f}, "
    f"sedangkan XGBoost memperoleh ROC AUC {xgb_auc:.4f} dan recall {xgb_rec:.4f}. "
)
xgb_not_worse = (xgb_auc >= rf_auc) and (xgb_rec >= rf_rec)
rf_not_worse = (rf_auc >= xgb_auc) and (rf_rec >= xgb_rec)
if np.isnan(rf_auc) or np.isnan(xgb_auc):
    # AUC is NaN when the test labels contain a single class.
    analysis_verdict = ("ROC AUC tidak dapat dihitung karena data uji hanya memuat satu kelas, "
                        "sehingga perbandingan model belum dapat disimpulkan.")
elif xgb_not_worse and rf_not_worse:
    analysis_verdict = ("Kedua model menunjukkan recall dan AUC yang setara, sehingga pemilihan model utama "
                        "dapat mempertimbangkan faktor lain seperti kemudahan interpretasi.")
elif xgb_not_worse:
    analysis_verdict = ("XGBoost lebih unggul pada recall dan AUC sehingga cocok sebagai model utama untuk "
                        "early-warning, sementara Random Forest tetap berguna untuk interpretasi dan analisis "
                        "kontribusi fitur.")
elif rf_not_worse:
    analysis_verdict = ("Random Forest lebih unggul pada recall dan AUC sehingga cocok sebagai model utama untuk "
                        "early-warning, sementara XGBoost dapat digunakan sebagai model pembanding.")
else:
    analysis_verdict = ("Tidak ada model yang unggul pada recall dan AUC sekaligus, sehingga pemilihan model utama "
                        "perlu disesuaikan dengan prioritas antara recall dan AUC.")
doc.add_paragraph(analysis_intro + analysis_verdict)

doc.add_paragraph("")
doc.add_paragraph("4.5 REKOMENDASI")
doc.add_paragraph("- Lakukan retraining periodik untuk menangani concept drift.\n- Integrasikan model ke sistem monitoring untuk notifikasi real-time.\n- Pertimbangkan SHAP untuk explainability per-insiden.")

# Save the document
doc.save(OUTPUT_DOCX)
print('Selesai. Dokumen disimpan sebagai:', OUTPUT_DOCX)

Selesai. Dokumen disimpan sebagai: BAB_IV_HASIL_DAN_ANALISIS_MODEL_PREDIKSI_GANGGUAN_JARINGAN.docx


## 4) Unduh Dokumen Word

In [8]:
from google.colab import files

# Download the file written by the report cell; reusing OUTPUT_DOCX keeps the
# two cells in sync if the output name is ever changed.
files.download(OUTPUT_DOCX)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>