In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy import stats

# Load the dataset
df = pd.read_csv('/content/Modified_dataset.csv')

# Step 1: Handle Missing Values
missing_data_rows = df[df.isna().any(axis=1)]

# Fill numerical columns with the median and categorical columns with the mode
num_columns = df.select_dtypes(include=['float64', 'int64']).columns
cat_columns = df.select_dtypes(include=['object']).columns

for col in num_columns:
    df[col].fillna(df[col].median(), inplace=True)

for col in cat_columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Step 2: Remove Duplicates
duplicate_rows = df[df.duplicated()]
df.drop_duplicates(inplace=True)

# Step 3: Encode Categorical Columns
# Encode 'Attack Type' and 'Label' columns if they are categorical
for col in ['Attack Type', 'Label']:
    if col in df.columns and df[col].dtype == 'object':
        df[col] = pd.Categorical(df[col]).codes

# Step 4: Outlier Handling
z_scores = stats.zscore(df[num_columns])
abs_z_scores = abs(z_scores)
df[num_columns] = np.clip(df[num_columns], -3, 3)

# Step 5: Feature Scaling
scaler = StandardScaler()
df[num_columns] = scaler.fit_transform(df[num_columns])

# Display a preview of the cleaned data
print(df.head())

# Save the cleaned data
df.to_csv('cleaned_dataset.csv', index=False)


   Destination Port  Flow Duration  Total Fwd Packets  Total Backward Packets  \
0          0.121268       0.146019           0.676946                0.842174   
1          0.121268       0.146019           0.676946                0.842174   
2          0.121268       0.146019           0.676946                0.842174   
3          0.121268       0.146019           0.676946                0.842174   
4          0.121268       0.146019           0.676946                0.842174   

   Total Length of Fwd Packets  Total Length of Bwd Packets  \
0                     0.518936                     0.768955   
1                     0.518936                     0.768955   
2                     0.518936                     0.768955   
3                     0.518936                     0.768955   
4                     0.518936                     0.768955   

   Fwd Packet Length Max  Fwd Packet Length Min  Fwd Packet Length Mean  \
0               0.518936              -0.570549            

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
