In [2]:
import pandas as pd
import numpy as np
import os
import sys
import logging
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Setup logging with UTF-8 encoding for both file and stream handlers
log_file = 'etl_log.csv'
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s,%(levelname)s,%(message)s',
    handlers=[
        logging.FileHandler(log_file, mode='w', encoding='utf-8'),
        logging.StreamHandler(sys.stdout)  # Default encoding, may still cause issues
    ]
)

# Try adding specific encoding to StreamHandler
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setFormatter(logging.Formatter('%(asctime)s,%(levelname)s,%(message)s'))
console_handler.stream = sys.stdout  # Reset stream with 'utf-8' compatibility
logging.getLogger().addHandler(console_handler)

def process_csv(input_file):
    try:
        logging.info(f"Started processing: {input_file}")

        # Load data
        df = pd.read_csv(input_file)
        logging.info("CSV loaded successfully")

        # Detect categorical and numeric columns
        cat_cols = df.select_dtypes(include=['object']).columns.tolist()
        num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
        logging.info(f"Categorical Columns: {cat_cols}")
        logging.info(f"Numeric Columns: {num_cols}")

        # Fill missing values
        for col in num_cols:
            df[col].fillna(df[col].mean(), inplace=True)
            logging.info(f"Filled missing numeric: {col}")
        for col in cat_cols:
            df[col].fillna(df[col].mode()[0], inplace=True)
            logging.info(f"Filled missing categorical: {col}")

        # Encode categorical columns
        for col in cat_cols:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])
            with open(f'label_encoder_{col}.pkl', 'wb') as f:
                pickle.dump(le, f)
            logging.info(f"Encoded: {col}")

        # Scale numeric columns
        scaler = StandardScaler()
        df[num_cols + cat_cols] = scaler.fit_transform(df[num_cols + cat_cols])
        with open('scaler.pkl', 'wb') as f:
            pickle.dump(scaler, f)
        logging.info("Scaling done")

        # Save processed output
        df.to_csv('processed_output.csv', index=False)
        logging.info("Processed data saved to 'processed_output.csv'")

        # Save column summary
        summary = df.describe().transpose()
        summary.to_csv('column_summary.csv')
        logging.info("Column summary saved to 'column_summary.csv'")

        logging.info("ETL pipeline completed successfully!")

    except Exception as e:
        logging.error(f"Error: {e}")

if __name__ == "__main__":
    input_path = "processed_titanic_data.csv"  # Change this path if needed

    if os.path.exists(input_path):
        process_csv(input_path)
    else:
        logging.error(f"File not found: {input_path}")


2025-05-04 10:45:20,798,INFO,Started processing: processed_titanic_data.csv
2025-05-04 10:45:20,798,INFO,Started processing: processed_titanic_data.csv
2025-05-04 10:45:20,798,INFO,Started processing: processed_titanic_data.csv
2025-05-04 10:45:20,807,INFO,CSV loaded successfully
2025-05-04 10:45:20,807,INFO,CSV loaded successfully
2025-05-04 10:45:20,807,INFO,CSV loaded successfully
2025-05-04 10:45:20,813,INFO,Categorical Columns: ['Name']
2025-05-04 10:45:20,813,INFO,Categorical Columns: ['Name']
2025-05-04 10:45:20,813,INFO,Categorical Columns: ['Name']
2025-05-04 10:45:20,817,INFO,Numeric Columns: ['Sex', 'Age', 'Fare', 'Embarked']
2025-05-04 10:45:20,817,INFO,Numeric Columns: ['Sex', 'Age', 'Fare', 'Embarked']
2025-05-04 10:45:20,817,INFO,Numeric Columns: ['Sex', 'Age', 'Fare', 'Embarked']
2025-05-04 10:45:20,819,INFO,Filled missing numeric: Sex
2025-05-04 10:45:20,819,INFO,Filled missing numeric: Sex
2025-05-04 10:45:20,819,INFO,Filled missing numeric: Sex
2025-05-04 10:45:20,82