In [2]:
import pandas as pd
import numpy as np
import os
import kagglehub
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Fetch the Kaggle network-intrusion dataset; kagglehub hands back the local
# directory that holds the downloaded files, which later cells list and read.
datasetpath = kagglehub.dataset_download(
    'chethuhn/network-intrusion-dataset'
)
print(f"Dataset downloaded to: {datasetpath}")

Dataset downloaded to: /Users/ananthakrishnan/.cache/kagglehub/datasets/chethuhn/network-intrusion-dataset/versions/1


In [4]:
# Path of a single expected CSV inside the download directory.
# NOTE(review): the cells visible below never read `datasetfile`; they discover
# the CSV files by listing `datasetpath` instead. Confirm it is still needed.
datasetfile = os.path.join(
    datasetpath,
    'network_intrusion_data.csv',
)

In [5]:
# Collect the names of the CSV files in the download directory.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# listing is sorted: the files are then concatenated in the same order on every
# machine, which keeps the row order (and the seeded train/test split) stable.
files = sorted(os.listdir(datasetpath))
csv_files = [file for file in files if file.endswith('.csv')]

In [6]:
# Bind `data` to an empty frame so the name exists before the CSV files are
# read; the loading cell further down fills it.
data = pd.DataFrame()

In [7]:
# Preview of `data` at this point in the notebook. It is still the empty
# placeholder here because the CSV files are only read in a later cell.
print(data.head())

Empty DataFrame
Columns: []
Index: []


In [8]:
# Read every discovered CSV and stack the rows into a single frame.
# The frames are gathered first and concatenated once: calling pd.concat inside
# the loop re-copies all previously loaded rows on each iteration, and appending
# to the existing `data` would duplicate every row when this cell is re-run.
frames = []
for file in csv_files:
    file_path = os.path.join(datasetpath, file)
    frames.append(pd.read_csv(file_path))

# Fail here with a clear message instead of a confusing error further down.
if not frames:
    raise FileNotFoundError(f"No CSV files found in {datasetpath}")

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
data = pd.concat(frames, ignore_index=True)

In [9]:
# Remove leading/trailing whitespace from every column header so that later
# lookups by name (e.g. 'Label') match regardless of padding in the CSV header.
data.rename(columns=str.strip, inplace=True)
# Old-name -> stripped-name mapping, kept available for inspection.
col_names = dict(zip(data.columns, data.columns))

In [10]:
# Row-level cleaning, applied in place on `data`:
# 1. turn +inf / -inf into NaN so they are treated as missing values,
# 2. drop every row that contains at least one missing value,
# 3. drop exact duplicate rows, keeping the first occurrence.
data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
data.dropna(axis=0, how='any', inplace=True)
data.drop_duplicates(keep='first', inplace=True)

In [11]:
# Separate the predictors from the class labels: `features` is every column
# except 'Label', and `target` is the 'Label' column itself.
features = data.drop('Label', axis=1)
target = data['Label']

In [12]:
# Integer-encode every object-dtype feature column.
# Each column gets its own fitted LabelEncoder, stored in `encoders`, so the
# category -> code mapping of every column stays available afterwards (for
# inverse_transform or for encoding new data the same way). Re-fitting one
# shared encoder inside the loop would keep only the last column's mapping.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder;
# its integer codes follow sorted category order, which the scaler and PCA
# below will treat as a numeric scale.
categorical_columns = features.select_dtypes(include=['object']).columns.tolist()
encoders = {}
# `encoder` stays bound even when there are no categorical columns; otherwise
# it ends up referring to the encoder fitted on the last column.
encoder = LabelEncoder()
for col in categorical_columns:
    encoder = LabelEncoder()
    # Cast to str first so every value in the column has a single type.
    features[col] = encoder.fit_transform(features[col].astype(str))
    encoders[col] = encoder

In [13]:
# Standardise every float64/int64 feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test-set statistics influence the scaling.
scaler = StandardScaler()
numerical_columns = list(
    features.select_dtypes(include=['float64', 'int64']).columns
)
scaler.fit(features[numerical_columns])
features[numerical_columns] = scaler.transform(features[numerical_columns])

In [14]:
# Reduce the feature matrix to at most 50 principal components.
# PCA cannot extract more components than min(n_samples, n_features), so the
# cap takes the row count into account as well as the column count.
n_samples, n_features = features.shape
# random_state pins the result when the default solver selection picks the
# randomized SVD; 42 matches the seed used for the train/test split.
# NOTE(review): PCA is fitted on all rows, before the train/test split.
pca = PCA(n_components=min(n_samples, n_features, 50), random_state=42)
features_pca = pca.fit_transform(features)

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the shuffle
# (and therefore the split) repeatable for a given input row order.
X_train, X_test, y_train, y_test = train_test_split(
    features_pca,
    target,
    test_size=0.2,
    random_state=42,
)
