In [None]:
import pandas as pd
import numpy as np
import warnings


# Announce the start of the run.
print("--- Starting Data Processing Pipeline ---")

# --- 0. Load Data (as in data_analysis.ipynb) ---
# Both sample files are addressed by relative path, so they are resolved
# against the current working directory.
consum_dades = pd.read_parquet(
    "Mostra Set de dades 4_Incidències en comptadors intel·ligents_Consum.parquet"
)
perimetre_dades = pd.read_parquet(
    "Mostra Set de dades 4_Incidències en comptadors intel·ligents_Perímetre.parquet"
)
print("✓ Step 0: Data loaded successfully.")

# Report the (rows, columns) shape of each loaded frame.
for _label, _frame in (
    ("consum_dades", consum_dades),
    ("perimetre_dades", perimetre_dades),
):
    print(f"  - '{_label}' shape: {_frame.shape}")


# Build the unified, cleaned dataset. The pipeline only runs when both input
# frames contain at least one row; otherwise a notice is printed instead.
if not consum_dades.empty and not perimetre_dades.empty:

    # --- 1. Merge Datasets ---
    # Attach the columns of 'perimetre_dades' to every consumption record.
    # The inner join keeps only rows whose 'POLISSA_SUBM' key appears in both
    # frames, so unmatched records on either side are discarded.
    print("\n--- Step 1: Merging Datasets ---")
    unified_df = pd.merge(
        consum_dades,
        perimetre_dades,
        on="POLISSA_SUBM",
        how="inner",
    )
    print(f"✓ Datasets merged on 'POLISSA_SUBM'. New shape: {unified_df.shape}")

    # --- 2. Verify Integrity and Clean Data ---
    print("\n--- Step 2: Verifying Integrity and Cleaning ---")

    # Count missing cells across the whole merged frame (expected to be 0).
    missing_vals = unified_df.isnull().sum().sum()
    print(f"  - Total missing values: {missing_vals}")

    # Remove duplicated time series records (same meter, same date), keeping
    # the first occurrence of each pair.
    initial_rows = unified_df.shape[0]
    unified_df = unified_df.drop_duplicates(
        subset=['POLISSA_SUBM', 'DATA'], keep='first'
    )
    rows_after_dupes = unified_df.shape[0]
    print(f"  - Removed {initial_rows - rows_after_dupes} duplicate records (same meter & date).")

    # Remove invalid records (negative consumption).
    # NOTE: a comparison against NaN is False, so rows with a missing 'CONSUM'
    # are dropped here as well and are included in the reported count.
    # .copy() makes the filtered result an independent frame; without it the
    # column assignments below can raise SettingWithCopyWarning whenever the
    # filter actually removed rows.
    initial_rows = unified_df.shape[0]
    unified_df = unified_df[unified_df['CONSUM'] >= 0].copy()
    rows_after_invalid = unified_df.shape[0]
    print(f"  - Removed {initial_rows - rows_after_invalid} invalid records (negative consumption).")

    print(f"✓ Data cleaned. Final shape: {unified_df.shape}")

    # --- 3. Assess Completeness ---
    print("\n--- Step 3: Assessing Completeness ---")

    # Convert the date columns to datetime for time-based analysis.
    # The 'DATA' column was identified as 'object' in the EDA.
    # Conversion stays best-effort: a failure is reported and the pipeline
    # continues, but the success message is only printed when both
    # conversions completed.
    try:
        unified_df['DATA'] = pd.to_datetime(unified_df['DATA'])
        unified_df['DATA_INST_COMP'] = pd.to_datetime(unified_df['DATA_INST_COMP'])
    except (KeyError, TypeError, ValueError, OverflowError) as e:
        print(f"Could not convert date columns: {e}")
    else:
        print("✓ Converted 'DATA' and 'DATA_INST_COMP' to datetime objects.")

    # Assess completeness across years. This needs 'DATA' to really be a
    # datetime column: '.dt.year' and '.date()' fail on unconverted values,
    # so the step is skipped (instead of crashing) when conversion failed.
    if 'DATA' in unified_df.columns and pd.api.types.is_datetime64_any_dtype(unified_df['DATA']):
        min_date = unified_df['DATA'].min()
        max_date = unified_df['DATA'].max()
        print(f"\n  - Daily consumption data ranges from: {min_date.date()} to {max_date.date()}")

        unified_df['YEAR'] = unified_df['DATA'].dt.year
        print("\n  - Record count by year:")
        print(unified_df['YEAR'].value_counts().sort_index())
    else:
        print("\n  - Skipping completeness assessment: 'DATA' is not a datetime column.")

    # --- 5. Final Unified Dataset ---
    # (This cell has no step 4; the printed label is kept as "Step 5".)
    print("\n--- Step 5: Final Output ---")
    # 'unified_clean_df' is a second name for the same DataFrame object as
    # 'unified_df', not a copy.
    unified_clean_df = unified_df
    print("✓ Unified, clean, and enhanced dataset is ready in the 'unified_clean_df' DataFrame.")

    print("\n--- Final DataFrame Head ---")
    print(unified_clean_df.head())

    print("\n--- Final DataFrame Info ---")
    # DataFrame.info() prints its report itself, so it is not wrapped in print().
    unified_clean_df.info()

else:
    # Reached when at least one of the two loaded frames has no rows.
    print("\nPipeline skipped as one or more data files could not be loaded.")

print("\n--- Pipeline Finished ---")

--- Starting Data Processing Pipeline ---
✓ Step 0: Data loaded successfully.
  - 'consum_dades' shape: (14439, 3)
  - 'perimetre_dades' shape: (10, 6)

--- Step 1: Merging Datasets ---
✓ Datasets merged on 'POLISSA_SUBM'. New shape: (14439, 8)

--- Step 2: Verifying Integrity and Cleaning ---
  - Total missing values: 0
  - Removed 0 duplicate records (same meter & date).
  - Removed 0 invalid records (negative consumption).
✓ Data cleaned. Final shape: (14439, 8)

--- Step 3: Assessing Completeness ---
✓ Converted 'DATA' and 'DATA_INST_COMP' to datetime objects.

  - Daily consumption data ranges from: 2021-01-01 to 2024-12-31

  - Record count by year:
YEAR
2021    3602
2022    3605
2023    3644
2024    3588
Name: count, dtype: int64

--- Step 5: Final Output ---
✓ Unified, clean, and enhanced dataset is ready in the 'unified_clean_df' DataFrame.

--- Final DataFrame Head ---
       POLISSA_SUBM       DATA  CONSUM       NUM_COMPLET DATA_INST_COMP  \
0  4XFL2NAR75V6CQIG 2021-01-01   

: 