In [None]:
!pip install scikit-learn

import os

import kagglehub
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl (11.1 MB)
[2K   [38;2;249;38;114m━━━━━━━━━━━━[0m[38;5;237m╺[0m[38;5;237m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/11.1 MB[0m [31m466.4 kB/s[0m eta [36m0:00:17[0m:12[0m

In [None]:
# Download the Kaggle dataset and report where it ended up. The returned
# value is used by later cells as a directory path (os.listdir / os.path.join).
dataset_handle = 'chethuhn/network-intrusion-dataset'
datasetpath = kagglehub.dataset_download(dataset_handle)
print(f"Dataset downloaded to: {datasetpath}")

In [None]:
# Build the path of a CSV expected inside the downloaded dataset directory.
# NOTE(review): `datasetfile` is not read by any cell visible here -- confirm
# whether it is still needed.
csv_name = 'network_intrusion_data.csv'
datasetfile = os.path.join(datasetpath, csv_name)

In [None]:
# Collect the names of all CSV files in the dataset directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the order in which the files are later combined (and therefore the
# seeded train/test split) reproducible across machines and runs.
files = os.listdir(datasetpath)
csv_files = sorted(name for name in files if name.endswith('.csv'))

In [None]:
# Empty placeholder frame for the dataset; it is populated by a later cell.
data = pd.DataFrame(data=None)

In [None]:
# Print the first five rows of `data` as plain text.
# NOTE(review): in top-to-bottom order this cell runs before the CSV files
# are read, so it likely shows an empty frame -- consider moving it after
# the loading cell.
print(data.head(n=5))

In [None]:
# Read every CSV listed in `csv_files` and combine them into a single frame.
# The frames are gathered first and concatenated once: this avoids re-copying
# the accumulated rows on every iteration, and because `data` is rebuilt from
# scratch, re-running this cell no longer appends the same files a second time.
loaded_frames = [pd.read_csv(os.path.join(datasetpath, file)) for file in csv_files]
if loaded_frames:
    data = pd.concat(loaded_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame, which
    # is what the incremental loop produced when no CSV files were found.
    data = pd.DataFrame()

In [None]:
# Map each column name to the same name without leading/trailing whitespace,
# then apply that mapping to `data` in place.
col_names = dict((col, col.strip()) for col in data.columns)
data.rename(col_names, axis='columns', inplace=True)

In [None]:
# Clean `data` in place, in three steps:
# 1. turn +/-infinity into NaN so they are caught by the next step,
data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
# 2. drop every row that contains at least one NaN,
data.dropna(axis=0, how='any', inplace=True)
# 3. drop exact duplicate rows, keeping the first occurrence.
data.drop_duplicates(keep='first', inplace=True)

In [None]:
# Split the frame into the model inputs (every column except the label)
# and the prediction target (the label column itself).
label_column = 'Label'
features = data.drop(label_column, axis=1)
target = data[label_column]

In [None]:
# Replace every object-dtype feature column with integer codes.
# The same encoder instance is refit on each column, so after the loop it
# only holds the classes of the last column processed.
categorical_columns = list(features.select_dtypes(include=['object']).columns)
encoder = LabelEncoder()
for col in categorical_columns:
    # Cast to str first so every value in the column is encoded as text.
    as_text = features[col].astype(str)
    features[col] = encoder.fit_transform(as_text)

In [None]:
# Standardize all float64/int64 feature columns with StandardScaler.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, so test rows influence the scaling statistics -- confirm this
# is acceptable.
numerical_columns = list(features.select_dtypes(include=['float64', 'int64']).columns)
scaler = StandardScaler()
scaled_values = scaler.fit_transform(features[numerical_columns])
features[numerical_columns] = scaled_values

In [None]:
# Reduce the feature matrix with PCA, keeping at most 50 components.
# The component count is also capped by the number of rows, because PCA
# cannot extract more components than min(n_samples, n_features).
# random_state is fixed so that the randomized SVD solver (which PCA may
# select automatically) gives the same projection on every run, in line
# with the seed used for the split and the model.
n_samples, n_features = features.shape
n_components = min(n_features, n_samples, 50)
pca = PCA(n_components=n_components, random_state=42)
features_pca = pca.fit_transform(features)

In [None]:
# Encode the target as integers 0..n_classes-1 before training: XGBClassifier
# rejects string (or non-contiguous) class labels. The fitted encoder is kept
# so predictions can be mapped back with target_encoder.inverse_transform.
# The encoded labels stay a Series with the original index and name.
target_encoder = LabelEncoder()
target_encoded = pd.Series(
    target_encoder.fit_transform(target), index=target.index, name=target.name
)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features_pca, target_encoded, test_size=0.2, random_state=42
)

# NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases;
# it is kept so older versions behave as before -- confirm against the
# installed version.
model = XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out rows and report the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")