<a href="https://colab.research.google.com/github/hyper-delta/Network-Anomaly-Detector/blob/main/AI_Planet_Submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Load the data
# Input locations live in named constants so they can be changed in one place;
# the values are the same Colab paths the notebook has always read from.
TRAIN_CSV_PATH = r'/content/Train_Data.csv'
TEST_CSV_PATH = r'/content/Test_Data.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

# Ensure no data leakage
assert 'attack' not in test_df.columns, "Test data should not contain the target column 'attack'"

# Check for missing values in the target column
if train_df['attack'].isnull().any():
    raise ValueError("Target column 'attack' contains NaN values")

# Separate features and target
X_train = train_df.drop(columns=['attack'])
label_to_class = {'normal': 0, 'neptune': 1}
y_train = train_df['attack'].map(label_to_class)

# Series.map() yields NaN for any label missing from the mapping, and the NaN
# check above ran *before* the mapping, so validate again here and fail with
# the offending labels instead of passing NaN targets on to the model.
unmapped_rows = y_train.isnull()
if unmapped_rows.any():
    unexpected_labels = sorted(train_df.loc[unmapped_rows, 'attack'].astype(str).unique())
    raise ValueError(
        f"Target column 'attack' contains labels outside {sorted(label_to_class)}: {unexpected_labels}"
    )

# Check for missing values in the features and handle them
# numeric_only=True restricts the median to numeric columns. Without it,
# pandas >= 2.0 raises TypeError if the frame holds non-numeric columns
# (presumably the case for string-valued features such as 'protocoltype' --
# TODO confirm); older pandas skipped such columns by default, so the imputed
# values are unchanged there.
if X_train.isnull().any().any():
    X_train = X_train.fillna(X_train.median(numeric_only=True))

# Feature columns that are one-hot encoded; every remaining column of X_train
# is treated as numeric and standardised.
categorical_cols = [
    'protocoltype', 'service', 'flag', 'land',
    'loggedin', 'ishostlogin', 'isguestlogin', 'lastflag',
]
numerical_cols = list(filter(lambda name: name not in categorical_cols, X_train.columns))

# One transformer per column group. handle_unknown='ignore' lets the encoder
# accept categories at transform time that were not present during fit.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer.
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(column_routes)

# Classifier: 100-tree random forest with a fixed seed.
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Chain preprocessing and the classifier into one estimator, so the
# preprocessing is fitted together with the model on whatever data is passed
# to fit().
pipeline = Pipeline([('preprocessor', preprocessor), ('model', model)])

# Score the whole pipeline with 5-fold cross-validation, using F1 as the metric.
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=5,
)
print(f'Cross-Validation F1 Scores: {cv_scores}')
print(f'Average Cross-Validation F1 Score: {cv_scores.mean()}')

# Final fit on the complete training set; this fitted pipeline is the one used
# for the test-set predictions.
pipeline.fit(X=X_train, y=y_train)

# Predict on the test set
# Fill missing test values with medians computed from the TRAINING features,
# so the test set is imputed with the same statistics as the data the model
# was fitted on (rather than with statistics of the test set itself).
# numeric_only=True keeps non-numeric columns out of the median, which would
# otherwise raise TypeError on pandas >= 2.0 if any such columns are present.
# Columns without a training median are left untouched by fillna().
if test_df.isnull().any().any():
    train_medians = X_train.median(numeric_only=True)
    test_df = test_df.fillna(train_medians)

predictions = pipeline.predict(test_df)

# Create submission file
# The pipeline was fitted on targets already encoded as 0/1, so the predictions
# are written as-is; the former identity re-mapping {0: 0, 1: 1} changed nothing.
submission = pd.DataFrame({'attack': predictions})
submission.to_csv('Satyarth_Gupta_submission.csv', index=False)


Cross-Validation F1 Scores: [1. 1. 1. 1. 1.]
Average Cross-Validation F1 Score: 1.0
