In [1]:
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.14.4
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# # Breast Cancer Prediction - Data Preprocessing

# ## 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# ## 2. Load the Cleaned Dataset
# Read the cleaned CSV from the `data/processed/` folder into `df`.
df = pd.read_csv('../data/processed/breast_cancer_clean.csv')

# Preview the first five rows (the `head()` default) under a banner line.
print("First 5 rows of the cleaned dataset:", df.head(), sep="\n")

# ## 3. Drop Unnecessary Columns
# The `id` column is removed because it is not useful for prediction.
df = df.drop(columns='id')

# Preview the frame again so the removal of `id` is visible.
print("\nFirst 5 rows after dropping the `id` column:", df.head(), sep="\n")

# ## 4. Split the Data into Features and Target
# `y` is the `diagnosis_encoded` column; `X` is every remaining column of `df`.
y = df['diagnosis_encoded']
X = df.drop(columns='diagnosis_encoded')

# Report the dimensions of both pieces.
print("\nShape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

# ## 5. Split the Data into Training and Testing Sets
# Hold out 20% of the rows for testing and keep 80% for training.
# `stratify=y` preserves the class proportions of the target in both splits,
# so the test set stays representative even if the classes are imbalanced.
# `random_state=42` fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the shape of training and testing sets
print("\nShape of training set (X_train):", X_train.shape)
print("Shape of testing set (X_test):", X_test.shape)
print("Shape of training labels (y_train):", y_train.shape)
print("Shape of testing labels (y_test):", y_test.shape)

# ## 6. Normalize Numerical Features
# Standardize the features so they share a common scale. The scaler is fitted
# on the training split only and then applied to both splits, so the test
# split is transformed with the statistics learned from the training data.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Preview the first five rows of the scaled training data.
print("\nFirst 5 rows of the normalized training set:", X_train[:5], sep="\n")

# ## 7. Handle Class Imbalance (if any)
# Count the training samples per class before any resampling.
counts_before = pd.Series(y_train).value_counts()
print("\nDistribution of the target variable in the training set:", counts_before, sep="\n")

# Resample the training split with SMOTE (seeded via `random_state`).
# Note: this runs unconditionally, whether or not the counts above are
# imbalanced; the test split is left untouched.
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Count the samples per class again on the resampled labels.
counts_after = pd.Series(y_res).value_counts()
print("\nDistribution of the target variable after SMOTE:", counts_after, sep="\n")

# ## 8. Save the Preprocessed Data
# Write each array to the `data/processed/` folder as a `.npy` file.
# The training files hold the SMOTE-resampled data (`X_res`, `y_res`);
# the test files hold the scaled test features and the original test labels.
for path, array in (
    ('../data/processed/X_train.npy', X_res),
    ('../data/processed/X_test.npy', X_test),
    ('../data/processed/y_train.npy', y_res),
    ('../data/processed/y_test.npy', y_test),
):
    np.save(path, array)

print("\nData preprocessing completed! Preprocessed data saved to `data/processed/`.")

First 5 rows of the cleaned dataset:
         id  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302        17.99         10.38          122.80     1001.0   
1    842517        20.57         17.77          132.90     1326.0   
2  84300903        19.69         21.25          130.00     1203.0   
3  84348301        11.42         20.38           77.58      386.1   
4  84358402        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  texture_worst  perimeter_worst  area_worst  \
0         0.2419