# In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import sys
import os
import logging

# Configure the root logger: INFO and above, each record shown as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def process_csv(input_file, output_file="processed_output.csv"):
    """Clean, encode and scale a passenger CSV, then write it back to disk.

    Steps, in order:
      1. Load ``input_file`` with pandas.
      2. Keep whichever of Name, Sex, Age, Fare and Embarked are present.
      3. Fill missing numeric values with the column mean and missing
         categorical (object) values with the column mode.
      4. Label-encode every object column.
      5. Standardize every numeric column. Because step 4 has already run,
         this includes the freshly label-encoded columns.
      6. Write the result to ``output_file`` without the index.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write. Defaults to
            ``"processed_output.csv"``.

    Returns:
        None. Errors are logged at ERROR level instead of being raised.
    """
    try:
        logging.info(f"Processing file: {input_file}")

        # Load CSV
        df = pd.read_csv(input_file)
        logging.info("CSV file loaded successfully.")

        # Select relevant columns if they exist
        selected_cols = ['Name', 'Sex', 'Age', 'Fare', 'Embarked']
        available_cols = [col for col in selected_cols if col in df.columns]
        if not available_cols:
            # Nothing to process; fail with a clear message instead of
            # letting the scaler choke on a frame with zero columns.
            logging.error(
                f"Error: None of the expected columns {selected_cols} "
                f"were found in '{input_file}'."
            )
            return
        # Work on an independent copy so the column assignments below are
        # never applied to (or silently lost on) a view of the loaded frame.
        df = df[available_cols].copy()
        logging.info(f"Selected columns: {available_cols}")

        # Fill missing numeric values with mean.
        # Assign the result back rather than using inplace=True on a column:
        # the in-place form is chained assignment and does not update the
        # frame when pandas Copy-on-Write is active.
        for col in df.select_dtypes(include='number').columns:
            df[col] = df[col].fillna(df[col].mean())
            logging.info(f"Filled missing values in column: {col}")

        # Fill missing categorical values with mode
        for col in df.select_dtypes(include='object').columns:
            modes = df[col].mode()
            if modes.empty:
                # mode() is empty when the column has no non-missing values,
                # so there is nothing to fill with.
                logging.warning(
                    f"Column '{col}' has no non-missing values; skipping fill."
                )
                continue
            df[col] = df[col].fillna(modes.iloc[0])
            logging.info(f"Filled missing values in categorical column: {col}")

        # Encode categorical columns (fit_transform refits the encoder for
        # each column, so one instance can be reused)
        label_encoder = LabelEncoder()
        for col in df.select_dtypes(include='object').columns:
            df[col] = label_encoder.fit_transform(df[col])
            logging.info(f"Encoded column: {col}")

        # Scale numeric columns (includes the columns encoded just above)
        numeric_cols = df.select_dtypes(include='number').columns
        if numeric_cols.empty:
            logging.warning("No numeric columns to scale.")
        else:
            scaler = StandardScaler()
            df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
            logging.info(f"Scaled numeric columns: {numeric_cols}")

        # Save processed output
        df.to_csv(output_file, index=False)
        logging.info(f"Processed file saved as '{output_file}'")

    except FileNotFoundError:
        logging.error(f"Error: The file '{input_file}' does not exist.")
    except pd.errors.ParserError:
        logging.error("Error: There was an issue parsing the CSV file.")
    except Exception as e:
        # Deliberate best-effort boundary: report and return, never raise.
        logging.error(f"Error: {e}")

if __name__ == "__main__":
    # Script entry point: process the CSV below if it is present on disk.
    # Change this to your actual path
    input_path = "processed_titanic_data.csv"

    # Guard first: report a missing input, otherwise run the pipeline.
    if not os.path.exists(input_path):
        logging.error(f"Error: File '{input_path}' not found.")
    else:
        process_csv(input_path)


# Output captured from a run of the cell above:
# 2025-05-04 09:48:19,183 - INFO - Processing file: processed_titanic_data.csv
# 2025-05-04 09:48:19,190 - INFO - CSV file loaded successfully.
# 2025-05-04 09:48:19,198 - INFO - Selected columns: ['Name', 'Sex', 'Age', 'Fare', 'Embarked']
# 2025-05-04 09:48:19,210 - INFO - Filled missing values in column: Sex
# 2025-05-04 09:48:19,219 - INFO - Filled missing values in column: Age
# 2025-05-04 09:48:19,221 - INFO - Filled missing values in column: Fare
# 2025-05-04 09:48:19,222 - INFO - Filled missing values in column: Embarked
# 2025-05-04 09:48:19,227 - INFO - Filled missing values in categorical column: Name
# 2025-05-04 09:48:19,234 - INFO - Encoded column: Name
# 2025-05-04 09:48:19,247 - INFO - Scaled numeric columns: Index(['Name', 'Sex', 'Age', 'Fare', 'Embarked'], dtype='object')
# 2025-05-04 09:48:19,256 - INFO - Processed file saved as 'processed_output.csv'
