## Normalize Data from emr_alpha.csv 
We will first normalize the data from the CSV file into the required unified schema.

In [1]:
# Import necessary libraries
import pandas as pd
from datetime import datetime

In [2]:
# Load the raw alpha export so we can inspect it before normalizing
df = pd.read_csv("emr_alpha.csv")

In [3]:
# Preview the first rows of the raw export
df.head()

Unnamed: 0,claim_id,patient_id,procedure_code,denial_reason,submitted_at,status
0,A123,P001,99213,Missing modifier,2025-07-01,denied
1,A124,P002,99214,Incorrect NPI,2025-07-10,denied
2,A125,,99215,Authorization expired,2025-07-05,denied
3,A126,P003,99381,,2025-07-15,approved
4,A127,P004,99401,Prior auth required,2025-07-20,denied


In [10]:
# Define the function to normalize data from CSV
def _none_if_null(value):
    """Return None for a missing CSV cell (pandas NaN/NA), else the value unchanged."""
    return None if pd.isna(value) else value


def _restore_integer_code(value):
    """Undo the int -> float upcast pandas applies to a numeric column with gaps."""
    # A single empty cell makes pandas read the whole column as float, so
    # 99213 would otherwise surface as 99213.0.
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value


def normalize_alpha(file_path):
    """
    Read the alpha EMR CSV export and normalize every row into the unified claim schema.

    Args:
        file_path: Path of the CSV file to read. The file must contain the columns
            claim_id, patient_id, procedure_code, denial_reason, status and
            submitted_at (dates written as YYYY-MM-DD).

    Returns:
        A list with one dict per CSV row, holding the keys claim_id, patient_id,
        procedure_code, denial_reason, status (lowercased), submitted_at
        (ISO 8601 string) and source_system (always "alpha").
        Any cell that is empty in the CSV is returned as None.

    Raises:
        KeyError: If one of the required columns is missing from the file.
        ValueError: If a non-empty submitted_at value is not in YYYY-MM-DD format.
    """
    # Read the CSV data into a pandas dataframe
    df = pd.read_csv(file_path)

    normalized_claims = []

    # The positional index is not part of the schema, so it is discarded
    for _, row in df.iterrows():
        status = _none_if_null(row["status"])
        submitted_at = _none_if_null(row["submitted_at"])
        procedure_code = _none_if_null(row["procedure_code"])

        normalized_claims.append({
            "claim_id": _none_if_null(row["claim_id"]),
            "patient_id": _none_if_null(row["patient_id"]),
            "procedure_code": (
                None if procedure_code is None else _restore_integer_code(procedure_code)
            ),
            "denial_reason": _none_if_null(row["denial_reason"]),
            # Only call string/date helpers on real values; a missing cell is a
            # float NaN and would raise AttributeError/TypeError otherwise.
            "status": None if status is None else status.lower(),
            "submitted_at": (
                None
                if submitted_at is None
                else datetime.strptime(submitted_at, "%Y-%m-%d").isoformat()
            ),
            "source_system": "alpha",  # Tag the record with the system it came from
        })

    return normalized_claims



In [13]:
# Run normalize_alpha against the alpha export
normalized_claims = normalize_alpha('emr_alpha.csv')
# Show a small sample of the result as a visual sanity check
print("Normalized Claims:")
preview = normalized_claims[:5]  # Only the first 5 claims
for claim in preview:
    print(claim)

Normalized Claims:
{'claim_id': 'A123', 'patient_id': 'P001', 'procedure_code': 99213, 'denial_reason': 'Missing modifier', 'status': 'denied', 'submitted_at': '2025-07-01T00:00:00', 'source_system': 'alpha'}
{'claim_id': 'A124', 'patient_id': 'P002', 'procedure_code': 99214, 'denial_reason': 'Incorrect NPI', 'status': 'denied', 'submitted_at': '2025-07-10T00:00:00', 'source_system': 'alpha'}
{'claim_id': 'A125', 'patient_id': None, 'procedure_code': 99215, 'denial_reason': 'Authorization expired', 'status': 'denied', 'submitted_at': '2025-07-05T00:00:00', 'source_system': 'alpha'}
{'claim_id': 'A126', 'patient_id': 'P003', 'procedure_code': 99381, 'denial_reason': None, 'status': 'approved', 'submitted_at': '2025-07-15T00:00:00', 'source_system': 'alpha'}
{'claim_id': 'A127', 'patient_id': 'P004', 'procedure_code': 99401, 'denial_reason': 'Prior auth required', 'status': 'denied', 'submitted_at': '2025-07-20T00:00:00', 'source_system': 'alpha'}


Explanation:

We use pandas to read the CSV file and iterate through each row.

For each record:

* We handle null values for patient_id and denial_reason (convert them to None if they are missing).

* We ensure the status is in lowercase for consistency.

* We parse the submitted_at field with datetime.strptime (expected format YYYY-MM-DD) and convert it to an ISO 8601 string with .isoformat().

* We add a new field source_system and set it to "alpha" since this data comes from the emr_alpha.csv file.