In [1]:
import pickle

# Read the preprocessed binary dataset back from disk.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('processed/binary_dataset.pkl', 'rb') as dataset_file:
    data = pickle.load(dataset_file)

# Pull the feature matrix and the label vector out of the loaded mapping.
X_scaled, y_binary = data['features'], data['labels']

# Display the dimensions of the feature matrix as the cell output.
X_scaled.shape


(2821051, 42)

In [2]:
import numpy as np
import pandas as pd

# Tally how many samples carry each distinct value found in `y_binary`.
unique_classes, class_counts = np.unique(y_binary, return_counts=True)

# Report the per-class totals, one line per class.
for position, label in enumerate(unique_classes):
    print(f"Class {label}: {class_counts[position]} samples")

# Show the same totals again as a two-column table.
class_distribution = pd.DataFrame(dict(Class=unique_classes, Count=class_counts))
print(class_distribution)


Class 0: 2273097 samples
Class 1: 547954 samples
   Class    Count
0      0  2273097
1      1   547954


OVERSAMPLING

In [3]:
from imblearn.over_sampling import SMOTE
from collections import Counter

# Open question: would plain random oversampling be sufficient instead of SMOTE?

# Show the label counts of the original data before oversampling.
print("Class distribution before SMOTE:")
print(Counter(y_binary))

# 'auto' lets SMOTE pick the target counts itself; with the two classes
# here the printed result shows both classes ending up at the same size.
sampling_strategy = 'auto'

# Fix the seed so the synthetic samples are identical on every run; this
# mirrors the seeded RandomUnderSampler(random_state=42) used for undersampling.
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=42)

# Fit on the scaled features / binary labels and return the resampled pair.
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_binary)

# Show the label counts after oversampling.
print("Class distribution after SMOTE:")
print(Counter(y_resampled))


Class distribution before SMOTE:
Counter({0: 2273097, 1: 547954})
Class distribution after SMOTE:
Counter({0: 2273097, 1: 2273097})


UNDERSAMPLING

In [3]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Inputs: `X_scaled` (feature matrix) and `y_binary` (binary labels).
# Report the label counts prior to undersampling.
print("Class distribution before undersampling:", Counter(y_binary), sep="\n")

# Seeded random undersampler so the selected rows are repeatable.
rus = RandomUnderSampler(random_state=42)

# Resample the features and labels together.
X_resampled, y_resampled = rus.fit_resample(X_scaled, y_binary)

# Report the label counts once undersampling is done.
print("Class distribution after undersampling:", Counter(y_resampled), sep="\n")

Class distribution before undersampling:
Counter({0: 2273097, 1: 547954})
Class distribution after undersampling:
Counter({0: 547954, 1: 547954})


In [4]:
# Display the shape of the current resampled feature matrix.
# NOTE(review): `X_resampled` is assigned by both the SMOTE cell and the
# undersampling cell, so the value shown depends on which of them ran last.
X_resampled.shape

(4546194, 42)

In [7]:
import pickle

# Save the oversampled dataset with pickle.
#
# `X_resampled` / `y_resampled` are written by both the SMOTE cell and the
# undersampling cell, so their contents depend on which cell ran last.
# Oversampling never removes rows; refuse to write a smaller dataset to the
# "oversampled" file instead of silently mislabeling it.
if len(X_resampled) < len(X_scaled):
    raise ValueError(
        "X_resampled has fewer rows than X_scaled, so it does not hold "
        "oversampled data; re-run the SMOTE cell before saving."
    )

# Bundle features and labels under the same keys the input pickle uses.
resampled_dataset = {'features': X_resampled, 'labels': y_resampled}

# Destination file for the oversampled dataset.
output_filename = 'processed/oversampled_balanced_binary_dataset.pkl'

# Serialize the bundle in binary write mode.
with open(output_filename, 'wb') as f:
    pickle.dump(resampled_dataset, f)

print(f"Resampled dataset saved to {output_filename}")


Resampled dataset saved to processed/oversampled_balanced_binary_dataset.pkl


In [6]:
import pandas as pd

# Save the undersampled dataset as a single CSV (features plus a 'Label' column).
#
# `X_resampled` / `y_resampled` are written by both the SMOTE cell and the
# undersampling cell, so their contents depend on which cell ran last.
# Undersampling never adds rows; refuse to write a larger dataset to the
# "undersampled" file instead of silently mislabeling it.
if len(X_resampled) > len(X_scaled):
    raise ValueError(
        "X_resampled has more rows than X_scaled, so it does not hold "
        "undersampled data; re-run the undersampling cell before saving."
    )

# Rebuild the features as a DataFrame using the column names of the loaded
# feature table, and wrap the labels in a Series named 'Label'.
# Both indexes are reset because pd.concat(axis=1) aligns rows by index label:
# with differing indexes the labels would be paired with the wrong rows
# (or padded with NaN) instead of being joined positionally.
X_resampled_df = pd.DataFrame(
    X_resampled, columns=data['features'].columns
).reset_index(drop=True)
y_resampled_series = pd.Series(y_resampled, name='Label').reset_index(drop=True)

# Join features and labels side by side into one table.
resampled_df = pd.concat([X_resampled_df, y_resampled_series], axis=1)

# Destination file for the undersampled dataset.
output_filename = 'processed/undersampled_dataset.csv'

# Write the table without the index column.
resampled_df.to_csv(output_filename, index=False)

print(f"Undersampled dataset saved to {output_filename}")


Undersampled dataset saved to processed/undersampled_dataset.csv
