In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pickle

# Function to load and inspect the dataset
def load_and_inspect_data(file_path):
    df = pd.read_csv(file_path)
    print("Dataset Information:")
    print(df.info())
    missing_values = df.isnull().sum()
    print("\nMissing Values:")
    print(missing_values)
    print("\nSummary Statistics:")
    print(df.describe())
    return df

# Function to encode categorical 'RiskLevel' column
def encode_risk_level(df):
    label_encoder = LabelEncoder()
    df['RiskLevel'] = label_encoder.fit_transform(df['RiskLevel'])
    return df, label_encoder

# Function to visualize the pairplot for dataset features
def plot_pairplot(df):
    sns.pairplot(df, hue="RiskLevel", palette="viridis", diag_kind="kde", height=2.5)
    plt.show()

# Function to visualize outliers using boxplots
def plot_boxplots(df):
    plt.figure(figsize=(15, 10))
    num_cols = len(df.columns)
    num_rows = (num_cols + 1) // 2
    for i, column in enumerate(df.columns, 1):
        plt.subplot(num_rows, 2, i)
        sns.boxplot(x=df[column], color='teal')
    plt.tight_layout()
    plt.show()

# Function to handle outliers by capping with IQR method
def cap_outliers(df):
    Q1 = df.quantile(0.25)
    Q3 = df.quantile(0.75)
    IQR = Q3 - Q1
    df_no_outliers = df.copy()
    for column in df.columns:
        lower_bound = Q1[column] - 1.5 * IQR[column]
        upper_bound = Q3[column] + 1.5 * IQR[column]
        df_no_outliers[column] = np.where(df[column] < lower_bound, lower_bound, df_no_outliers[column])
        df_no_outliers[column] = np.where(df[column] > upper_bound, upper_bound, df_no_outliers[column])
    return df_no_outliers

# Function to split data into train, validation, and test sets
def split_data(df):
    X = df.drop('RiskLevel', axis=1)
    y = df['RiskLevel']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
    return X_train, X_val, X_test, y_train, y_val, y_test

# Function to handle class imbalance using SMOTE
def handle_imbalance(X_train, y_train):
    smote = SMOTE(sampling_strategy={0: 406}, random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    return X_train_resampled, y_train_resampled

# Function to scale the features using StandardScaler
def scale_data(X_train_resampled, X_val, X_test):
    scaler = StandardScaler()
    X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    with open('scaler.pkl', 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    return X_train_resampled_scaled, X_val_scaled, X_test_scaled

# Function to save training and test data to CSV
def save_data_to_csv(X_train, X_val, X_test, y_train, y_val, y_test):
    pd.DataFrame(X_train).to_csv('X_train.csv', index=False)
    pd.DataFrame(X_val).to_csv('X_val.csv', index=False)
    pd.DataFrame(X_test).to_csv('X_test.csv', index=False)
    pd.DataFrame(y_train).to_csv('y_train.csv', index=False)
    pd.DataFrame(y_val).to_csv('y_val.csv', index=False)
    pd.DataFrame(y_test).to_csv('y_test.csv', index=False)

# Main function that ties everything together
def preprocess_and_save(file_path):
    df = load_and_inspect_data(file_path)
    df, label_encoder = encode_risk_level(df)
    plot_pairplot(df)
    plot_boxplots(df)
    df_no_outliers = cap_outliers(df)
    X_train, X_val, X_test, y_train, y_val, y_test = split_data(df_no_outliers)
    X_train_resampled, y_train_resampled = handle_imbalance(X_train, y_train)
    X_train_resampled_scaled, X_val_scaled, X_test_scaled = scale_data(X_train_resampled, X_val, X_test)
    save_data_to_csv(X_train, X_val, X_test, y_train, y_val, y_test)
    return X_train_resampled_scaled, X_val_scaled, X_test_scaled, y_train_resampled, y_val, y_test, label_encoder

# Execute if script is run as the main module
if __name__ == "__main__":
    file_path = 'maternal_health_risk.csv'
    X_train_resampled_scaled, X_val_scaled, X_test_scaled, y_train_resampled, y_val, y_test, label_encoder = preprocess_and_save(file_path)
    print("\nPreprocessing Completed!")
    print(f"Shapes:\nX_train: {X_train_resampled_scaled.shape}\nX_val: {X_val_scaled.shape}\nX_test: {X_test_scaled.shape}")
    print(f"y_train: {y_train_resampled.shape}\ny_val: {y_val.shape}\ny_test: {y_test.shape}")


FileNotFoundError: [Errno 2] No such file or directory: 'maternal_health_risk.csv'