In [None]:
# --- ETL Extract Step ---
# 1. Load and preview raw_data.csv and incremental_data.csv
# 2. Display .head() and .info() for each, using tabulate for tables
# 3. Add observations about the data, also as tables
# 4. Save raw copies to data/ directory

import shutil

import pandas as pd
from tabulate import tabulate

# Read both source extracts into DataFrames (raw file first, then incremental);
# the remaining steps refer to them as raw_df and incremental_df.
raw_df, incremental_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/raw_data.csv', 'data/incremental_data.csv')
)

# Render the leading rows of each frame as a grid table, then let pandas print
# its own structural summary (DataFrame.info writes directly to stdout).
raw_head_table = tabulate(raw_df.head(), headers='keys', tablefmt='grid')
print(
    "=== raw_data.csv: .head() ===",
    raw_head_table,
    "\n=== raw_data.csv: .info() ===",
    sep="\n",
)
raw_df.info()

incremental_head_table = tabulate(incremental_df.head(), headers='keys', tablefmt='grid')
print(
    "\n=== incremental_data.csv: .head() ===",
    incremental_head_table,
    "\n=== incremental_data.csv: .info() ===",
    sep="\n",
)
incremental_df.info()

# Start of the observations section; each summary is printed as a grid table
print("\n--- Observations ---")

# Null count for every column of both files.
# missing_data[0] is the header row, the remaining entries are table rows.
print("\nMissing Values Summary")
frames_by_file = {"raw_data.csv": raw_df, "incremental_data.csv": incremental_df}
missing_data = [["File", "Column", "Missing Values"]]
for file_label, frame in frames_by_file.items():
    null_counts = frame.isnull().sum()
    missing_data.extend(
        [file_label, column, count] for column, count in null_counts.items()
    )
header_row, *body_rows = missing_data
print(tabulate(body_rows, headers=header_row, tablefmt='grid'))

# Number of rows flagged by DataFrame.duplicated() in each file.
# dup_data[0] is the header row, the remaining entries are table rows.
print("\nDuplicate Rows Summary")
raw_dup_count = raw_df.duplicated().sum()
incremental_dup_count = incremental_df.duplicated().sum()
dup_data = [["File", "Duplicate Rows"]]
dup_data.append(["raw_data.csv", raw_dup_count])
dup_data.append(["incremental_data.csv", incremental_dup_count])
print(tabulate(dup_data[1:], headers=dup_data[0], tablefmt='grid'))

# One table row per (file, column) pair, listing raw_data.csv's columns first.
# col_data[0] is the header row, the remaining entries are table rows.
print("\nColumns in Each File")
col_data = [["File", "Column"]]
for file_label, frame in (("raw_data.csv", raw_df), ("incremental_data.csv", incremental_df)):
    col_data += [[file_label, column] for column in frame.columns]
print(tabulate(col_data[1:], headers=col_data[0], tablefmt='grid'))

# Back up the source files byte-for-byte alongside the originals in data/.
# Writing the parsed DataFrames back out with to_csv would not produce a raw
# copy: the data is re-serialised, and integer columns that contain missing
# values are read as floats, so a value such as 2 would be written as 2.0.
shutil.copyfile('data/raw_data.csv', 'data/raw_data_copy.csv')
shutil.copyfile('data/incremental_data.csv', 'data/incremental_data_copy.csv')
print("\nRaw copies saved as data/raw_data_copy.csv and data/incremental_data_copy.csv")

=== raw_data.csv: .head() ===
+----+------------+-----------------+-----------+------------+--------------+--------------+----------+
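|    |   order_id | customer_name   | product   |   quantity |   unit_price | order_date   | region   |
+====+============+=================+===========+============+==============+==============+==========+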
|  0 |          1 | Diana           | Tablet    |        nan |          500 | 2024-01-20   | South    |
+----+------------+-----------------+-----------+------------+--------------+--------------+----------+
|  1 |          2 | Eve             | Laptop    |        nan |          nan | 2024-04-29   | North    |
+----+------------+-----------------+-----------+------------+--------------+--------------+----------+
|  2 |          3 | Charlie         | Laptop    |          2 |          250 | 2024-01-08   | nan      |
+----+------------+-----------------+-----------+------------+--------------+--------------+----------+
|  3 |          4 | Eve             | Laptop    |          2 |          750 | 2024-01-07   | West     |
+----+------------+---------------