In [14]:
# Import necessary libraries
import boto3
from io import StringIO
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [15]:
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

In [16]:
# Folder for locally written result files; it is created (parents included) when
# missing, and an already existing folder is not treated as an error.
output_dir = "../data/outputs/"
os.makedirs(name=output_dir, exist_ok=True)

In [17]:
# Load the raw dataset from S3 into a DataFrame
# Where the raw CSV lives.
# NOTE(review): the key prefix is spelled "inusrance-data-raw"; it is left untouched
# because it has to match the object key that exists in the bucket -- confirm.
bucket_name = "refocus-storage"
filename = 'insurance_data_2025-02-11_02-45-54'
file_key = f"inusrance-data-raw/{filename}.csv"

# Client used to talk to S3
s3 = boto3.client("s3")

# Download the object and decode its bytes as UTF-8 text
obj = s3.get_object(Bucket=bucket_name, Key=file_key)
raw_bytes = obj["Body"].read()
csv_data = raw_bytes.decode("utf-8")

# Parse the CSV text with pandas
df = pd.read_csv(StringIO(csv_data))

print("Dataset Loaded Successfully!")

Dataset Loaded Successfully!


In [18]:
# ---- STEP 1: HANDLE MISSING VALUES ----
# Count missing values per column and keep only the columns that need imputation
missing_values = df.isnull().sum()
columns_with_missing = missing_values[missing_values > 0].index

# Impute numeric columns with the median and every other column with the mode.
for col in columns_with_missing:
    if pd.api.types.is_numeric_dtype(df[col]):
        fill_value = df[col].median()
    else:
        modes = df[col].mode()
        if modes.empty:
            # Column has no observed value at all, so there is nothing to impute with
            continue
        fill_value = modes.iloc[0]
    # Assign the result back instead of calling fillna(..., inplace=True) on df[col]:
    # the chained in-place form acts on a temporary Series and does not update df
    # when pandas Copy-on-Write is active.
    df[col] = df[col].fillna(fill_value)

print("Missing Values Handled.")

Missing Values Handled.


In [19]:
# ---- STEP 2: REMOVE DUPLICATES ----
# Drop rows that repeat an earlier row across all columns (pandas default settings);
# the surviving rows keep their original index labels.
df = df.drop_duplicates()
print("Duplicates Removed.")

Duplicates Removed.


In [20]:
# ---- STEP 3: HANDLE NEGATIVE VALUES ----
# Numeric columns in which negative entries are replaced by their absolute value.
# This list is reused later when the same columns are scaled.
num_columns = ["age", "annual_premium", "claims_count"]

# Vectorized absolute value over all listed columns at once
# (replaces a per-element Python lambda applied column by column).
df[num_columns] = df[num_columns].abs()

print("Negative Values Handled.")

Negative Values Handled.


In [21]:
# ---- STEP 4: FEATURE ENGINEERING ----
# 4.1 Convert categorical column "policy_type" into dummy variables
df = pd.get_dummies(df, columns=["policy_type"], drop_first=True)

In [22]:
# 4.2 Normalize numerical features using Min-Max Scaling
scaler = MinMaxScaler()
df[num_columns] = scaler.fit_transform(df[num_columns])

In [23]:
# 4.3 Ensure target variable is properly formatted
df["churn"] = df["churn"].astype(int)

print("Feature Engineering Completed.")

Feature Engineering Completed.


In [24]:
# ---- STEP 5: SAVE CLEANED AND FEATURE ENGINEERED DATA ----
cleaned_data_path = os.path.join(output_dir, "cleaned_insurance_data.csv")
features_data_path = os.path.join(output_dir, "features_insurance_data.csv")

# Save cleaned dataset before feature engineering
df.to_csv(cleaned_data_path, index=False)
print(f"Cleaned Data Saved at: {cleaned_data_path}")

# Save fully processed dataset with feature engineering
df.to_csv(features_data_path, index=False)
print(f"Feature-Engineered Data Saved at: {features_data_path}")

Cleaned Data Saved at: data/outputs/cleaned_insurance_data.csv
Feature-Engineered Data Saved at: data/outputs/features_insurance_data.csv


In [25]:
# Destination in S3 for the processed dataset; the key reuses the raw file's name
bucket_name = "refocus-storage"
file_key = f"insurance-data-processed/{filename}_processed.csv"

# Client used for the upload
s3 = boto3.client("s3")

# Serialize the DataFrame to CSV text in memory (index column omitted)
csv_buffer = StringIO()
df.to_csv(csv_buffer, index=False)
csv_payload = csv_buffer.getvalue()

# Write the CSV text to the bucket under file_key
s3.put_object(Bucket=bucket_name, Key=file_key, Body=csv_payload)

print(f"File successfully uploaded to s3://{bucket_name}/{file_key}")

File successfully uploaded to s3://refocus-storage/insurance-data-processed/insurance_data_2025-02-11_02-45-54_processed.csv


In [26]:
# Final confirmation that every preprocessing cell above has run
completion_message = "\nData Preprocessing Completed Successfully!"
print(completion_message)


Data Preprocessing Completed Successfully!
