In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from collections import Counter

# ---------------------------------------------------------------------------
# Purpose: load a labelled dataset, hold out a stratified test set, balance
# ONLY the training portion with SMOTE, and write both portions to CSV.
#
# Configuration: every tunable value is named once here instead of being
# repeated as a literal further down.
# ---------------------------------------------------------------------------
DATA_PATH = 'alldataset.csv'  # Replace with your file path
LABEL_COL = 'Label'  # Name of the target column in the input CSV
TEST_SIZE = 0.2  # Fraction of rows held out as the (untouched) test set
SEED = 42  # Shared by the split and by SMOTE so both stay in sync on re-runs
TRAIN_OUT_PATH = 'botnet_dataset_smote_train.csv'  # Resampled training set
TEST_OUT_PATH = 'botnet_dataset_test.csv'  # Original-distribution test set

# Load your dataset
data = pd.read_csv(DATA_PATH)

# Fail early with a readable message instead of an opaque error from `drop`
# after the (potentially very large) file has already been processed further.
if LABEL_COL not in data.columns:
    raise KeyError(
        f"Target column {LABEL_COL!r} not found in {DATA_PATH!r}; "
        f"available columns: {list(data.columns)}"
    )

# Separate features (X) and target (y)
X = data.drop(LABEL_COL, axis=1)  # Features
y = data[LABEL_COL]  # Target variable

# Check class distribution before SMOTE
print("Original class distribution:", Counter(y))

# Split data into train and test sets FIRST (avoid data leakage):
# synthetic samples must never be derived from rows that end up in the test set.
# `stratify=y` keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y
)

# Sanity check: the split must partition the data without losing or duplicating rows.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# Apply SMOTE only to the training data
smote = SMOTE(random_state=SEED, sampling_strategy='auto')
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Sanity check: features and labels must still line up row-for-row after resampling.
assert len(X_train_smote) == len(y_train_smote), "SMOTE returned mismatched X / y lengths"

# Check new class distribution after SMOTE (counted once, reusable below)
smote_counts = Counter(y_train_smote)
print("Class distribution after SMOTE:", smote_counts)

# Convert back to DataFrame (optional).
# NOTE(review): fit_resample presumably returns a DataFrame when given one on
# current imbalanced-learn releases; the explicit wrap keeps the original column
# names even if an array comes back — confirm against the installed version.
train_df_smote = pd.DataFrame(X_train_smote, columns=X.columns)
train_df_smote[LABEL_COL] = y_train_smote

# Now you can use X_train_smote, y_train_smote for training
print("\nResampled training set shape:", X_train_smote.shape)
print("Test set shape (unchanged):", X_test.shape)

# Save the resampled data if needed.
# The test set is written with its original (imbalanced) class distribution;
# X_test and y_test come from the same split and therefore share one index,
# so the column-wise concat lines features up with their labels.
train_df_smote.to_csv(TRAIN_OUT_PATH, index=False)
pd.concat([X_test, y_test], axis=1).to_csv(TEST_OUT_PATH, index=False)

Original class distribution: Counter({1: 17821994, 0: 520028})
Class distribution after SMOTE: Counter({1: 14257595, 0: 14257595})

Resampled training set shape: (28515190, 12)
Test set shape (unchanged): (3668405, 12)
