In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer

# Read the training and test CSV files from the current working directory.
train_data = pd.read_csv('Train_Data.csv')
test_data = pd.read_csv('Test_Data.csv')

# Sanity-check the training frame: grab a preview of its first rows and the
# number of missing entries in each column, then print both.
first_rows = train_data.head()
null_counts = train_data.isnull().sum()
print(first_rows)
print(null_counts)

# The label is the 'attack' column; every other column is used as a predictor.
y = train_data['attack']
X = train_data.drop(columns='attack')

# Route predictor columns by dtype: object columns are handled as
# categorical, NumPy-numeric columns as numerical.
# NOTE(review): columns matching neither dtype are not listed in either group
# below -- confirm the data has none, or that leaving them out is intended.
numerical_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Numerical branch: fill missing values with the column mean, then scale.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' is set so categories that were not
# present at fit time do not raise at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own group of columns.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Candidate classifiers, keyed by the short name used to look up their
# parameter grid and to label printed results. Insertion order matters: it is
# the order in which the candidates are searched and reported.
# The seeded estimators all share the same random_state so reruns are
# repeatable; KNN is built with its default arguments.
seed_kwargs = {'random_state': 42}
models = {
    "RandomForest": RandomForestClassifier(**seed_kwargs),
    "SVM": SVC(**seed_kwargs),
    "KNN": KNeighborsClassifier(),
    "DecisionTree": DecisionTreeClassifier(**seed_kwargs),
    "GradientBoosting": GradientBoostingClassifier(**seed_kwargs),
    "AdaBoost": AdaBoostClassifier(**seed_kwargs),
}

# Hyper-parameter grids, one per entry in `models` (same keys). Every
# parameter name carries the 'model__' prefix so it is routed to the pipeline
# step named 'model'.

# Tree-shape settings searched identically for RandomForest and DecisionTree.
tree_shape_grid = {
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 10],
    'model__min_samples_leaf': [1, 5],
}

# Learning rates searched identically for both boosting models.
boosting_learning_rates = [0.01, 0.1, 1]

param_grids = {
    "RandomForest": {
        'model__n_estimators': [100, 200],
        **tree_shape_grid,
    },
    "SVM": {
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
    },
    "KNN": {
        'model__n_neighbors': [3, 5, 7],
        'model__weights': ['uniform', 'distance'],
    },
    "DecisionTree": dict(tree_shape_grid),
    "GradientBoosting": {
        'model__n_estimators': [100, 200],
        'model__learning_rate': boosting_learning_rates,
        'model__max_depth': [3, 5, 7],
    },
    "AdaBoost": {
        'model__n_estimators': [50, 100],
        'model__learning_rate': boosting_learning_rates,
    },
}

# Wrap each candidate estimator in its own two-step pipeline: the shared
# preprocessor first, then the estimator under the step name 'model' (the
# name the 'model__' prefixes in the parameter grids refer to).
model_pipelines = {
    name: Pipeline([('preprocessor', preprocessor), ('model', estimator)])
    for name, estimator in models.items()
}

# Model selection, validation and submission.
#
# Reads:  X, y, test_data, model_pipelines, param_grids
# Writes: X_train, X_val, y_train, y_val, grid_search, best_model, best_score,
#         best_params, y_val_pred, test_predictions, sample_submission, and
#         the file 'Test_Predictions.csv'.

# Hold out a validation split *before* any model selection. The grid search
# below only ever sees X_train/y_train, so the validation metrics reported
# afterwards come from rows that played no part in choosing the model or its
# hyper-parameters.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Find the best model and parameters using GridSearchCV
best_model = None
best_score = float('-inf')  # any real CV score, including 0.0, beats this
best_params = None

for model_name, pipeline in model_pipelines.items():
    grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best score for {model_name}: {grid_search.best_score_}")

    # Strict '>' keeps the earliest candidate when two scores tie.
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

# Fail with a clear message instead of an AttributeError on None further down
# (happens if there were no candidates or no candidate produced a usable score).
if best_model is None:
    raise RuntimeError(
        "Grid search did not select a model: model_pipelines is empty or "
        "no candidate produced a valid cross-validation score."
    )

print(f"\nBest Model: {best_model}")
print(f"Best Parameters: {best_params}")

# Evaluate the best model on the held-out validation split. GridSearchCV is
# used with its default refit behaviour, so best_model has already been
# refit on X_train with the winning parameters; no extra fit is needed here.
y_val_pred = best_model.predict(X_val)
print("Validation Accuracy: ", accuracy_score(y_val, y_val_pred))
print("Confusion Matrix: \n", confusion_matrix(y_val, y_val_pred))
print("Classification Report: \n", classification_report(y_val, y_val_pred))

# Validation is done, so refit the chosen pipeline on every labelled row
# before predicting the real test data; otherwise the final predictions would
# come from a model trained on only 80% of the available data.
best_model.fit(X, y)

# Make predictions on the actual test data
test_predictions = best_model.predict(test_data)

# Load sample submission file to match the format
sample_submission = pd.read_csv('Sample_Submission.csv')
sample_submission['attack'] = test_predictions

# Save the predictions to a CSV file and report the exact path written.
output_path = 'Test_Predictions.csv'
sample_submission.to_csv(output_path, index=False)

print(f"Predictions saved to {output_path}")


   duration protocoltype      service flag  srcbytes  dstbytes  land  \
0         0          tcp  netbios_dgm  REJ         0         0     0   
1         0          tcp         smtp   SF      1239       400     0   
2         0          tcp         http   SF       222       945     0   
3         0          tcp         http   SF       235      1380     0   
4         0          tcp    uucp_path  REJ         0         0     0   

   wrongfragment  urgent  hot  ...  dsthostsamesrvrate  dsthostdiffsrvrate  \
0              0       0    0  ...                0.06                0.06   
1              0       0    0  ...                0.45                0.04   
2              0       0    0  ...                1.00                0.00   
3              0       0    0  ...                1.00                0.00   
4              0       0    0  ...                0.01                0.08   

   dsthostsamesrcportrate  dsthostsrvdiffhostrate  dsthostserrorrate  \
0                    0.00 