
# 🚀 Intrusion Detection System using Machine Learning  
### Expo Project – Cybersecurity & Machine Learning  
**By: Chella Krishnan D**  

---


In [None]:

# Step 1: Download dataset directly into Colab
!wget https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.csv


In [None]:

# Step 2: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:

# Step 3: Load dataset
# The NSL-KDD CSV has no header row, so the column names are supplied here.
# With the default header handling the first record would be consumed as
# column names and no 'label' column would exist for the later steps.
FEATURE_NAMES = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    "num_outbound_cmds", "is_host_login", "is_guest_login", "count",
    "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
]
COLUMN_NAMES = FEATURE_NAMES + ["label", "difficulty"]

raw_df = pd.read_csv("KDDTrain+.csv", header=None, names=COLUMN_NAMES)

# 'difficulty' is a per-record score added by the dataset authors (derived
# from how earlier classifiers handled the record), not a traffic feature,
# so it is dropped instead of being passed on to the models.
df = raw_df.drop(columns="difficulty")

# Fail early, with a readable message, if the file layout is not as expected.
assert "normal" in set(df["label"]), "Unexpected file layout: no 'normal' entries in the label column"

print("Dataset Shape:", df.shape)
print("\nSample Rows:")
display(df.head())

print("\nClass Distribution:")
print(df['label'].value_counts())


In [None]:

# Step 4: Preprocessing
# Feature matrix: every column except the target.
X = df.drop(columns='label')

# StandardScaler only accepts numeric input, so each non-numeric column is
# replaced by integer category codes first. This keeps the column names and
# count unchanged, so per-feature results can still be labelled with X.columns.
# NOTE(review): integer codes impose an arbitrary order on nominal values;
# one-hot encoding would suit the distance-based models (SVM, KNN) better.
for col in X.select_dtypes(exclude='number').columns:
    X[col] = X[col].astype('category').cat.codes

# Binary target: 0=Normal, 1=Attack (any label other than 'normal').
y = (df['label'] != 'normal').astype(int)

# Scaled copy of the full feature matrix (returned as a NumPy array).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:

# Step 5: Feature Importance using Extra Trees
# A fixed seed keeps the importance ranking identical between runs; an
# unseeded forest can reorder closely ranked features each time.
etc = ExtraTreesClassifier(random_state=42)
etc.fit(X_scaled, y)
importances = etc.feature_importances_

# Label each importance with the column it was computed from. X is the frame
# that was scaled into X_scaled, so its columns line up with the importances.
feat_importances = pd.Series(importances, index=X.columns)

# barh draws the first row at the bottom, so sort ascending to put the most
# important feature at the top of the chart.
ax = feat_importances.nlargest(10).sort_values().plot(kind='barh', figsize=(8,6))
ax.set_xlabel("Importance")
plt.title("Top 10 Important Features")
plt.show()


In [None]:

# Step 6: Train/Test split and model training
# Split the unscaled features first and fit the scaler on the training rows
# only. Scaling before the split would let test-set statistics (mean/std)
# leak into training and make the reported scores optimistic.
# stratify=y keeps the Normal/Attack ratio the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Seed the randomised estimators so the results table is reproducible.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}

# Metric name -> scoring function; each is called as fn(y_true, y_pred).
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}

# Fit every model on the training split and score it on the held-out split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    results[name] = {metric: fn(y_test, preds) for metric, fn in metric_fns.items()}

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
results_df


In [None]:

# Step 7: Heatmap of results
# Draw the model-by-metric table as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(data=results_df, annot=True, fmt=".2f", cmap="Blues", ax=ax)
ax.set_title("Model Performance Metrics")
plt.show()


In [None]:

# Step 8: Confusion Matrix for Random Forest
# Reuse the Random Forest that was already fitted and scored in Step 6, so the
# matrix describes the same model as the metrics table. Training a second
# forest would cost time and could give slightly different predictions.
best_model = models["Random Forest"]
preds = best_model.predict(X_test)

# Plot from the predictions computed above; class order is 0=Normal, 1=Attack.
ConfusionMatrixDisplay.from_predictions(
    y_test, preds, display_labels=["Normal", "Attack"], cmap="Blues"
)
plt.title("Confusion Matrix - Random Forest")
plt.show()



# ✅ Conclusion  
- Random Forest performed best, with ~90% accuracy on the 30% hold-out split.  
- Tree-based methods generally outperform others.  
- Dataset imbalance remains a limitation.  
- Future work: Deep Learning and balancing strategies for improved detection.  

---
