In [1]:
# %% [markdown]
# # ETL Pipeline - Extract Phase
# **Kevin (ID: 656)**  
# DSA 2040 A US 2025 Mid Semester Exam

# %% [markdown]
# ## 1. Import Required Libraries

# %%
import pandas as pd
from IPython.display import display

# %% [markdown]
# ## 2. Load Raw Data Files

# %%
print("Loading data files...")
try:
    # Load main dataset
    raw_df = pd.read_csv('data/raw_data.csv')
    
    # Load incremental dataset
    inc_df = pd.read_csv('data/incremental_data.csv')
    
    print(" Data loaded successfully!")
except Exception as e:
    print(f" Error loading files: {e}")

# %% [markdown]
# ## 3. Initial Data Inspection

# %%
print("\n=== RAW DATA (MAIN DATASET) ===")
print(f"Number of records: {len(raw_df)}")
print(f"Number of columns: {len(raw_df.columns)}")
print("\nFirst 5 records:")
display(raw_df.head())

# %%
print("\n=== INCREMENTAL DATA ===")
print(f"Number of records: {len(inc_df)}")
print(f"Number of columns: {len(inc_df.columns)}")
print("\nFirst 5 records:")
display(inc_df.head())

# %% [markdown]
# ## 4. Data Quality Checks

# %%
print("\n=== DATA QUALITY REPORT ===")

# %%
print("\n1. Missing Values:")
print("\nMain Dataset:")
display(raw_df.isnull().sum())

print("\nIncremental Dataset:")
display(inc_df.isnull().sum())

# %%
print("\n2. Duplicate Records:")
print(f"Main dataset has {raw_df.duplicated().sum()} full duplicates")
print(f"Duplicate order IDs: {raw_df['order_id'].duplicated().sum()}")

# %%
print("\n3. Data Types:")
print("\nMain Dataset:")
display(raw_df.dtypes)

print("\nIncremental Dataset:")
display(inc_df.dtypes)

# %% [markdown]
# ## 5. Initial Observations

# %%
observations = """
### Key Observations:

1. **Missing Values**:
   - Customer names missing in both datasets
   - Several missing quantities and unit prices
   - Some missing regions and order dates

2. **Data Issues**:
   - Duplicate order_id (4) in raw data
   - Quantity should be integer but shows as float
   - Some unit prices seem unrealistic (e.g., $900 laptop)

3. **Structural Notes**:
   - Consistent columns across both datasets
   - Date format is standardized (YYYY-MM-DD)
   - Region values are categorical

### Transformations Needed:
1. Handle missing values appropriately
2. Remove duplicate records
3. Convert data types (quantity → integer)
4. Validate unit price ranges
5. Standardize region values
"""

print(observations)

# %% [markdown]
# ## 6. Save Raw Data Copies

# %%
# Save exact copies for reference
raw_df.to_csv('data/raw_data_backup.csv', index=False)
inc_df.to_csv('data/incremental_data_backup.csv', index=False)

print(" Backup copies saved in data/ folder")



Loading data files...
❌ Error loading files: [Errno 2] No such file or directory: 'data/raw_data.csv'

=== RAW DATA (MAIN DATASET) ===


NameError: name 'raw_df' is not defined