In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import kagglehub

In [None]:

# imbalanced-learn is an optional dependency: start from "not available" and
# flip the flag only when every sampler import succeeds.
IMBALANCED_LEARN_AVAILABLE = False
try:
    from imblearn.over_sampling import SMOTE, RandomOverSampler
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.combine import SMOTETomek
except ImportError:
    print("Warning: imbalanced-learn not available. Install with: pip install imbalanced-learn")
else:
    IMBALANCED_LEARN_AVAILABLE = True



# Download the pinned dataset version. kagglehub returns the local directory
# holding the files, so that value is used directly instead of a hard-coded,
# machine-specific cache path.
path = kagglehub.dataset_download("agungpambudi/network-malware-detection-connection-analysis/versions/1")
print("Path to dataset files:", path)
dataset_path = path

# List files in that directory (sorted so the CSV selection is deterministic)
files = sorted(os.listdir(dataset_path))
print("Files in dataset folder:", files)

# Use the first CSV found; fail with a clear message instead of passing
# None to os.path.join when the folder holds no CSV at all.
csv_file = next((f for f in files if f.endswith(".csv")), None)
if csv_file is None:
    raise FileNotFoundError(f"No .csv file found in {dataset_path}")

# Read the CSV (pipe-separated, as in CTU datasets)
data = pd.read_csv(os.path.join(dataset_path, csv_file), sep='|', low_memory=False)
# Columns excluded from modelling; errors='ignore' tolerates any that are absent.
drop_cols = ['uid', 'history', 'tunnel_parents', 'detailed-label', 'local_resp', 'local_orig', 'missed_bytes']
# Rows missing any of these columns cannot be used for training.
required_cols = ['label', 'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes']
data = data.drop(columns=drop_cols, errors='ignore').dropna(subset=required_cols)

# Convert the numeric columns. errors='coerce' turns every non-numeric entry
# (including the '-' placeholder) into NaN, which is then replaced with 0.
# The result is assigned back to the column: calling fillna(inplace=True) on
# data[col] operates on a temporary Series and is not guaranteed to update
# `data` (it is a no-op under pandas Copy-on-Write).
for col in ['duration', 'orig_bytes', 'resp_bytes']:
    data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)

# Encode the categorical columns as integer category codes.
for col in ['proto', 'service']:
    data[col] = data[col].astype('category').cat.codes

# Binary target: 0 for the exact label 'Benign', 1 for everything else.
data['label'] = (data['label'] != 'Benign').astype('int64')

features = ['proto', 'service', 'duration', 'orig_bytes', 'resp_bytes']
X = data[features]
y = data['label']


print("=== DATA BALANCE ANALYSIS ===")
class_distribution = Counter(y)
print(f"Class distribution: {class_distribution}")

# Report the count and percentage share of each class.
total_rows = len(y)
for class_code, class_name in ((0, "Benign"), (1, "Malicious")):
    class_count = class_distribution[class_code]
    print(f"Class {class_code} ({class_name}): {class_count} ({class_count / total_rows * 100:.2f}%)")

# Imbalance ratio = size of the largest class / size of the smallest class
ordered_counts = sorted(class_distribution.values())
minority_class, majority_class = ordered_counts[0], ordered_counts[-1]
imbalance_ratio = majority_class / minority_class
print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")

# The dataset counts as imbalanced when the ratio exceeds 2:1.
is_imbalanced = imbalance_ratio > 2.0
balance_state = 'imbalanced' if is_imbalanced else 'balanced'
print(f"Dataset is {balance_state}")


# Default: no resampling, the "balanced" data is a copy of the original.
X_balanced, y_balanced = X.copy(), y.copy()
balancing_method = None  # set to the method name once resampling has been applied

if is_imbalanced and IMBALANCED_LEARN_AVAILABLE:
    print("\n=== APPLYING DATA BALANCING ===")

    smote = SMOTE(random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)
    balancing_method = "SMOTE"

    print("Balancing method used: SMOTE")
    balanced_distribution = Counter(y_balanced)
    print(f"Balanced class distribution: {balanced_distribution}")
    print(f"New dataset size: {len(y_balanced)} (original: {len(y)})")

elif is_imbalanced and not IMBALANCED_LEARN_AVAILABLE:
    print("\n=== MANUAL BALANCING (Simple Random Oversampling) ===")
    # Simple manual balancing if imbalanced-learn is not available
    minority_class_label = 0 if class_distribution[0] < class_distribution[1] else 1
    majority_class_label = 1 - minority_class_label

    # Row POSITIONS (not index labels) of each class. The index of X / y has
    # gaps after rows were dropped, so labels must not be fed to .iloc.
    y_values = y.to_numpy()
    minority_indices = np.flatnonzero(y_values == minority_class_label)
    majority_indices = np.flatnonzero(y_values == majority_class_label)

    # Oversample the minority class with replacement until both classes have
    # the same size; a seeded generator keeps the draw reproducible.
    n_samples_needed = len(majority_indices) - len(minority_indices)
    rng = np.random.default_rng(42)
    oversampled_indices = rng.choice(minority_indices, size=n_samples_needed, replace=True)

    # Combine all positions
    balanced_indices = np.concatenate([majority_indices, minority_indices, oversampled_indices])

    # Create balanced dataset
    X_balanced = X.iloc[balanced_indices].reset_index(drop=True)
    y_balanced = y.iloc[balanced_indices].reset_index(drop=True)
    balancing_method = "Random Oversampling"

    print("Balancing method used: Random Oversampling")
    balanced_distribution = Counter(y_balanced)
    print(f"Balanced class distribution: {balanced_distribution}")

# Plot the class counts before and after balancing, side by side. Nothing is
# drawn when no balancing was applied, so no empty figure is shown.
if balancing_method is not None:
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(10, 4))
    panels = (
        (ax_before, class_distribution, 'Original Class Distribution'),
        (ax_after, balanced_distribution, f'Balanced Class Distribution ({balancing_method})'),
    )
    for ax, distribution, title in panels:
        class_labels = sorted(distribution)
        ax.bar([str(label) for label in class_labels], [distribution[label] for label in class_labels])
        ax.set_title(title)
        ax.set_xlabel('Class')
        ax.set_ylabel('Count')
    fig.tight_layout()
    plt.show()



In [None]:

from sklearn.model_selection import cross_validate

# Split the ORIGINAL (un-resampled) data so the hold-out set contains only
# real rows at the real class ratio. Resampling before splitting lets
# synthetic/duplicated samples derived from test rows end up in the training
# set, which inflates every reported metric.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

forest_params = dict(n_estimators=100, random_state=42, n_jobs=-1)
if is_imbalanced and IMBALANCED_LEARN_AVAILABLE:
    # The imbalanced-learn pipeline applies SMOTE only while fitting, i.e. on
    # the training part of each CV fold and on X_train for the final model;
    # validation folds and the test set are never resampled.
    from imblearn.pipeline import Pipeline as ImbPipeline
    clf = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('forest', RandomForestClassifier(**forest_params)),
    ])
elif is_imbalanced:
    # Without imbalanced-learn, compensate for the imbalance by class
    # re-weighting instead of duplicating rows.
    clf = RandomForestClassifier(class_weight='balanced', **forest_params)
else:
    clf = RandomForestClassifier(**forest_params)


print("\n=== CROSS-VALIDATION ANALYSIS ===")
# Cross-validate on the training portion only, so the test set stays unseen.
# A single cross_validate run yields all metrics; accuracy is not refitted
# separately.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    clf, X_train, y_train, cv=cv,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    return_train_score=False
)
cv_scores = cv_results['test_accuracy']

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Additional cross-validation metrics (mean +/- two standard deviations)
print(f"CV Accuracy: {cv_results['test_accuracy'].mean():.4f} (+/- {cv_results['test_accuracy'].std() * 2:.4f})")
print(f"CV Precision: {cv_results['test_precision'].mean():.4f} (+/- {cv_results['test_precision'].std() * 2:.4f})")
print(f"CV Recall: {cv_results['test_recall'].mean():.4f} (+/- {cv_results['test_recall'].std() * 2:.4f})")
print(f"CV F1-score: {cv_results['test_f1'].mean():.4f} (+/- {cv_results['test_f1'].std() * 2:.4f})")


print("\n=== FINAL MODEL TRAINING ===")
clf.fit(X_train, y_train)

# Evaluate on the untouched hold-out set.
y_pred = clf.predict(X_test)
print("Test Set Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")


print("\n=== DETAILED EVALUATION ===")
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
