In [3]:
# data_ingestion.py (Updated)

import pandas as pd
import numpy as np
import os


def load_and_preprocess_data(file_path):
    """
    Loads the raw insurance data from a pipe-delimited TXT file,
    performs initial data type conversions, and handles some basic cleaning.

    Args:
        file_path (str): The path to the raw data file (e.g., 'root/data/ml.txt').

    Returns:
        pandas.DataFrame: A preprocessed DataFrame.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Data file not found at: {file_path}")

    print(f"Loading data from: {file_path}")
    # Use low_memory=False to potentially resolve DtypeWarning during initial read
    df = pd.read_csv(file_path, sep='|', encoding='utf-8', low_memory=False)

    print("Initial data load successful. Performing initial preprocessing...")

    # --- Date Conversions ---
    # TransactionMonth: YYYY-MM-DD HH:MM:SS format
    df['TransactionMonth'] = pd.to_datetime(
        df['TransactionMonth'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    # VehicleIntroDate: M/YYYY format
    df['VehicleIntroDate'] = pd.to_datetime(
        df['VehicleIntroDate'], format='%m/%Y', errors='coerce')

    # --- Numerical Conversions ---
    numerical_cols = [
        'Cylinders', 'cubiccapacity', 'kilowatts', 'NumberOfDoors',
        'CustomValueEstimate', 'CapitalOutstanding', 'NumberOfVehiclesInFleet',
        'SumInsured', 'CalculatedPremiumPerTerm', 'TotalPremium', 'TotalClaims'
    ]
    for col in numerical_cols:
        # Replace empty strings or spaces with NaN before converting to numeric
        df[col] = df[col].replace(r'^\s*$', np.nan, regex=True)
        # Convert to numeric, coercing errors. Convert to float to handle NaNs.
        df[col] = pd.to_numeric(df[col], errors='coerce').astype(float)

    # --- String Cleaning and Categorical Conversion ---
    string_cols = df.select_dtypes(include='object').columns
    for col in string_cols:
        # These are handled by boolean_map later
        if col not in ['WrittenOff', 'Rebuilt', 'Converted']:
            df[col] = df[col].astype(str).str.strip()
            # Replace 'Not specified', empty strings, or strings with only spaces with NaN
            df[col] = df[col].replace(['Not specified', '', ' '], np.nan)

    # --- Boolean Conversion for Yes/No columns ---
    boolean_map = {'Yes': True, 'No': False}
    # Columns 'WrittenOff', 'Rebuilt', 'Converted', 'CrossBorder' are often boolean-like
    # Also include 'AlarmImmobiliser', 'TrackingDevice', 'NewVehicle'
    bool_cols_to_process = [
        'AlarmImmobiliser', 'TrackingDevice', 'NewVehicle', 'WrittenOff',
        'Rebuilt', 'Converted', 'CrossBorder'
    ]
    for col in bool_cols_to_process:
        # Map values, then fill NaNs with False and explicitly convert to boolean dtype
        df[col] = df[col].map(boolean_map).fillna(False).astype(bool)

    # Convert object columns that should be categorical to 'category' dtype for memory efficiency
    # Exclude columns that are likely IDs or unique identifiers
    id_cols = ['UnderwrittenCoverID', 'PolicyID', 'mmcode']
    for col in df.select_dtypes(include='object').columns:
        if col not in id_cols:
            df[col] = df[col].astype('category')

    print("Initial preprocessing complete.")
    return df


if __name__ == "__main__":
    # Define the path to your data file
    # In a script, __file__ works well.
    data_dir = os.path.join(os.path.dirname(
        os.path.abspath('MachineLearningRating_v3.txt')), '../data')
    input_file_path = os.path.join(data_dir, 'MachineLearningRating_v3.txt')
    # Using parquet for efficiency
    output_file_path = os.path.join(data_dir, 'processed_ml_data.parquet')

    # Ensure the data directory exists
    os.makedirs(data_dir, exist_ok=True)

    try:
        # Load and preprocess data
        processed_df = load_and_preprocess_data(input_file_path)

        # Save the processed data for easier loading in EDA notebook
        processed_df.to_parquet(output_file_path, index=False)
        print(f"Processed data saved to: {output_file_path}")
        print(f"Processed DataFrame shape: {processed_df.shape}")
        print(f"Processed DataFrame info:\n{processed_df.info()}")

    except FileNotFoundError as e:
        print(e)
        print("Please ensure the 'ml.txt' file is located in a 'data' directory at the root of your project.")
    except ImportError:
        print("\nERROR: Parquet engine not found.")
        print("Please install 'pyarrow' or 'fastparquet' to save data in Parquet format.")
        print("You can install pyarrow using: pip install pyarrow")
    except Exception as e:
        print(f"An unexpected error occurred during data processing: {e}")

Loading data from: /home/yohannes/10Academy/Week3/End-to-End-Insurance-Analytics/notebooks/../data/MachineLearningRating_v3.txt
Initial data load successful. Performing initial preprocessing...


  df[col] = df[col].map(boolean_map).fillna(False).astype(bool)
  df[col] = df[col].map(boolean_map).fillna(False).astype(bool)
  df[col] = df[col].map(boolean_map).fillna(False).astype(bool)
  df[col] = df[col].map(boolean_map).fillna(False).astype(bool)
  df[col] = df[col].map(boolean_map).fillna(False).astype(bool)


Initial preprocessing complete.
Processed data saved to: /home/yohannes/10Academy/Week3/End-to-End-Insurance-Analytics/notebooks/../data/processed_ml_data.parquet
Processed DataFrame shape: (1000098, 52)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000098 entries, 0 to 1000097
Data columns (total 52 columns):
 #   Column                    Non-Null Count    Dtype         
---  ------                    --------------    -----         
 0   UnderwrittenCoverID       1000098 non-null  int64         
 1   PolicyID                  1000098 non-null  int64         
 2   TransactionMonth          1000098 non-null  datetime64[ns]
 3   IsVATRegistered           1000098 non-null  bool          
 4   Citizenship               104888 non-null   category      
 5   LegalType                 1000098 non-null  category      
 6   Title                     1000098 non-null  category      
 7   Language                  1000098 non-null  category      
 8   Bank                      1000098 non-