### Metadata Management for Data Quality
**Description**: Store and use metadata to manage data quality in a pipeline.

**Steps**:
1. Load metadata
2. Load data
3. Use metadata to validate data quality
4. Show valid data


In [1]:
# write your code from here
import pandas as pd

# === Step 1: Load Metadata ===
# Example metadata as a dictionary or CSV file describing expected column data types and nullability
metadata = {
    "id": {"dtype": "int64", "nullable": False},
    "name": {"dtype": "object", "nullable": False},
    "age": {"dtype": "float64", "nullable": True},
    "income": {"dtype": "float64", "nullable": True}
}

# Or load metadata from CSV:
# metadata_df = pd.read_csv('metadata.csv')
# Process accordingly

# === Step 2: Load Data ===
data_path = 'data.csv'  # Replace with your data path
df = pd.read_csv(data_path)

# === Step 3: Use Metadata to Validate Data Quality ===
valid_rows_mask = pd.Series(True, index=df.index)

for col, props in metadata.items():
    if col not in df.columns:
        print(f"❌ Missing column: {col}")
        valid_rows_mask &= False
        continue
    
    # Check data type (simple check)
    actual_dtype = str(df[col].dtype)
    expected_dtype = props["dtype"]
    if actual_dtype != expected_dtype:
        print(f"⚠️ Column '{col}' dtype mismatch: expected {expected_dtype}, got {actual_dtype}")
    
    # Check nullability
    if not props["nullable"]:
        null_count = df[col].isnull().sum()
        if null_count > 0:
            print(f"⚠️ Column '{col}' contains {null_count} null values but is marked non-nullable.")
            valid_rows_mask &= df[col].notnull()
    
# === Step 4: Show Valid Data ===
valid_df = df[valid_rows_mask]

print("\n✅ Valid rows based on metadata validation:")
print(valid_df)


FileNotFoundError: [Errno 2] No such file or directory: 'data.csv'