In [0]:
# Databricks notebook to generate a realistic MMR Flat File with all 73 fields using Faker
from datetime import datetime, timedelta
import random
import os
from faker import Faker

# One seed drives both random sources (Faker and the stdlib `random` module)
# for reproducible results.
SEED = 42

# Faker instance used by the field generators below
fake = Faker()
fake.seed_instance(SEED)
random.seed(SEED)

# MMR record layout: one dict per fixed-width field, listed in file order.
# The list holds 75 entries (data fields plus fillers) whose widths sum to 495.
#
# Keys read by the generator functions in this file:
#   "name"       - field label; some generator branches match on substrings of it
#   "width"      - number of characters the field occupies in the record
#   "type"       - selects the generation branch in generate_field_value
#   "choices"    - candidate values for "choice" fields
#   "range"      - inclusive [min, max] for "numeric" fields
#   "pattern"    - only "H####" is recognised (for "str" fields)
#   "force_zero" - "amount" fields that are always emitted as zero
#   "format"     - strftime pattern for "date" fields; for "decimal" and
#                  "amount" fields it is descriptive only and is not read
mmr_fields = [
    {"name": "Contract Number", "width": 5, "type": "str", "pattern": "H####"},
    {"name": "Run Date", "width": 8, "type": "date", "format": "%Y%m%d"},
    {"name": "Payment Date", "width": 6, "type": "date", "format": "%Y%m"},
    {"name": "Beneficiary ID", "width": 12, "type": "hicn"},
    {"name": "Surname", "width": 7, "type": "surname"},
    {"name": "First Initial", "width": 1, "type": "first_initial"},
    {"name": "Sex Code", "width": 1, "type": "choice", "choices": ["M", "F"]},
    {"name": "Date of Birth", "width": 8, "type": "date", "format": "%Y%m%d"},
    {"name": "Filler1", "width": 4, "type": "filler"},
    {"name": "State & County Code", "width": 5, "type": "state_county"},
    {"name": "Out of Area Indicator", "width": 1, "type": "choice", "choices": ["Y", " "]},
    {"name": "Part A Entitlement", "width": 1, "type": "choice", "choices": ["Y", " "]},
    {"name": "Part B Entitlement", "width": 1, "type": "choice", "choices": ["Y", " "]},
    {"name": "Hospice", "width": 1, "type": "choice", "choices": ["Y", " "]},
    {"name": "ESRD", "width": 1, "type": "choice", "choices": ["Y", " "]},
    {"name": "Aged/Disabled MSP", "width": 1, "type": "choice", "choices": ["Y", "N"]},
    {"name": "Filler2", "width": 1, "type": "filler"},
    {"name": "Filler3", "width": 1, "type": "filler"},
    {"name": "New Medicare Beneficiary Medicaid Status Flag", "width": 1, "type": "choice", "choices": ["Y", "N", " "]},
    {"name": "LTI Flag", "width": 1, "type": "choice", "choices": ["Y", " "]},
    {"name": "Medicaid Add-on Factor Indicator", "width": 1, "type": "choice", "choices": ["Y", " "]},
    {"name": "Filler4", "width": 2, "type": "filler"},
    {"name": "Default Risk Factor Code", "width": 1, "type": "choice", "choices": ["1", "2", "3", "4", "5", "6", "7", " "]},
    {"name": "Risk Adjustment Factor A", "width": 7, "type": "decimal", "format": "##.#####"},
    {"name": "Risk Adjustment Factor B", "width": 7, "type": "decimal", "format": "##.#####"},
    {"name": "Number of Payment/Adjustment Months Part A", "width": 2, "type": "numeric", "range": [1, 12]},
    {"name": "Number of Payment/Adjustment Months Part B", "width": 2, "type": "numeric", "range": [1, 12]},
    {"name": "Adjustment Reason Code (ARC)", "width": 2, "type": "choice", "choices": ["  ", "60", "61", "94"]},
    {"name": "Payment/Adjustment Start Date", "width": 8, "type": "date", "format": "%Y%m%d"},
    {"name": "Payment/Adjustment End Date", "width": 8, "type": "date", "format": "%Y%m%d"},
    {"name": "Filler5", "width": 9, "type": "filler"},
    {"name": "Filler6", "width": 9, "type": "filler"},
    {"name": "Monthly Risk Adjusted Amount Part A", "width": 9, "type": "amount", "format": "-999999.99"},
    {"name": "Monthly Risk Adjusted Amount Part B", "width": 9, "type": "amount", "format": "-999999.99"},
    {"name": "LIS Premium Subsidy", "width": 8, "type": "amount", "format": "-9999.99"},
    {"name": "ESRD MSP Flag", "width": 1, "type": "choice", "choices": ["T", "P", " "]},
    {"name": "Medication Therapy Management (MTM) Add On", "width": 10, "type": "amount", "format": "999999.99", "force_zero": True},
    {"name": "Part D Manufacturer Discount Program Amount", "width": 8, "type": "amount", "format": "-9999.99", "force_zero": True},
    {"name": "Medicaid Full/Partial/Non-dual", "width": 1, "type": "choice", "choices": ["1", "0", " "]},
    {"name": "Risk Adjustment Age Group (RAAG)", "width": 4, "type": "age_group"},
    {"name": "Filler7", "width": 7, "type": "filler"},
    {"name": "Filler8", "width": 1, "type": "filler"},
    {"name": "Filler9", "width": 1, "type": "filler"},
    {"name": "Plan Benefit Package ID", "width": 3, "type": "numeric"},
    {"name": "Filler10", "width": 1, "type": "filler"},
    # One-character codes below are shorter than the field; the row builder pads them
    {"name": "Risk Adjustment Factor Type Code", "width": 2, "type": "choice", 
     "choices": ["C", "CF", "CP", "CN", "D", "D1", "D2", "E", "ED", "E1", "E2", "G1", "G2", "I", "SE"]},
    {"name": "Frailty Indicator (PACE/FIDE SNP only)", "width": 1, "type": "choice", "choices": ["Y", "N"]},
    {"name": "Original Reason for Entitlement Code (OREC)", "width": 1, "type": "choice", "choices": ["0", "1", "2", "3", "9"]},
    {"name": "Filler11", "width": 1, "type": "filler"},
    {"name": "Segment Number", "width": 3, "type": "numeric"},
    {"name": "Filler12", "width": 1, "type": "filler"},
    {"name": "EGHP Flag", "width": 1, "type": "choice", "choices": ["Y", "N"]},
    {"name": "Part C Basic Premium – Part A Amount", "width": 8, "type": "amount", "format": "-9999.99"},
    {"name": "Part C Basic Premium – Part B Amount", "width": 8, "type": "amount", "format": "-9999.99"},
    {"name": "Rebate for Part A Cost Sharing Reduction", "width": 8, "type": "amount", "format": "-9999.99"},
    {"name": "Rebate for Part B Cost Sharing Reduction", "width": 8, "type": "amount", "format": "-9999.99"},
    {"name": "Rebate for Other Part A Mandatory Supplemental Benefits", "width": 8, "type": "amount", "format": "-9999.99"},
    {"name": "Rebate for Other Part B Mandatory Supplemental Benefits", "width": 8, "type": "amount", "format": "-9999.99"},
    {"name": "Rebate for Part B Premium Reduction – Part A Amount", "width": 8, "type": "amount", "format": "-9999.99"},
    # Fields 58-76 (gap in original spec - padding with filler to maintain positions)
    {"name": "Filler13", "width": 135, "type": "filler"},  # Positions 255-389 (1-based, inclusive)
    {"name": "Number of Payment/Adjustment Months Part D", "width": 2, "type": "numeric", "range": [1, 12]},
    {"name": "PACE Premium Add On", "width": 10, "type": "amount", "format": "-999999.99"},
    {"name": "PACE Cost Sharing Add-on", "width": 10, "type": "amount", "format": "-999999.99", "force_zero": True},
    {"name": "Part C Frailty Factor", "width": 7, "type": "decimal", "format": "##.#####"},
    {"name": "MSP Reduction Factor", "width": 7, "type": "decimal", "format": "##.#####"},
    {"name": "MSP Reduction Amount Part A", "width": 10, "type": "amount", "format": "-999999.99", "force_zero": True},
    {"name": "MSP Reduction Amount Part B", "width": 10, "type": "amount", "format": "-999999.99", "force_zero": True},
    {"name": "Medicaid Dual Status Code", "width": 2, "type": "choice", 
     "choices": ["00", "01", "02", "03", "04", "05", "06", "08", "09", "10", "99", "  "]},
    {"name": "Part D Coverage Gap Discount Amount", "width": 8, "type": "amount", "format": "-9999.99", "force_zero": True},
    {"name": "Part D Risk Adjustment Factor Type", "width": 2, "type": "choice", 
     "choices": ["D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "R1", "R2", "R3", "R4", "I1", "I2", "N1", "N2"]},
    {"name": "Filler14", "width": 1, "type": "filler"},  # Position 458 (1-based)
    {"name": "Part A Monthly Rate for Payment or Adjustment", "width": 9, "type": "amount", "format": "-999999.99"},
    {"name": "Part B Monthly Rate for Payment or Adjustment", "width": 9, "type": "amount", "format": "-999999.99"},
    {"name": "Part D Monthly Rate for Payment or Adjustment", "width": 9, "type": "amount", "format": "-999999.99"},
    {"name": "Cleanup ID", "width": 10, "type": "str"}
]

# Report the combined width of every layout entry (the layout targets 495 characters)
total_width = sum(entry["width"] for entry in mmr_fields)
print(f"Total record width: {total_width} characters")

# Scratch state for the person currently being generated; the field
# generators read and write it so related fields stay consistent
current_person = {}

# Helper: fixed-width rendering of monetary amounts
def format_amount_field(amount, width):
    """Render a monetary amount as a zero-padded string with two decimals.

    The sign, when present, is placed in front of the padding zeros
    (for example ``-0012.34``), never between them.

    Args:
        amount: Numeric value to render.
        width: Target field width in characters.

    Returns:
        The formatted string. For widths 8, 9 and 10 the value is first
        clamped so that the result is exactly ``width`` characters long;
        for any other width no clamping is applied, so an oversized value
        can produce a longer string.
    """
    # Largest positive magnitude allowed per supported width
    positive_limits = {8: 9999.99, 9: 999999.99, 10: 9999999.99}
    upper = positive_limits.get(width)
    if upper is not None:
        # The minus sign takes one character, so the negative limit has to
        # leave room for it: width - 4 integer digits, ".", two decimals.
        lower = -(10 ** (width - 4) - 0.01)
        amount = max(min(amount, upper), lower)

    # Adding 0.0 turns a rounded -0.0 into +0.0 so "-0000.00" is never emitted
    amount = round(amount, 2) + 0.0

    # The "0" flag is sign-aware: padding zeros go after the sign
    return f"{amount:0{width}.2f}"


# Function to generate realistic data based on field type
def generate_field_value(field_spec):
    """Generate one value for a single layout entry.

    The branch taken depends on ``field_spec["type"]``. Several branches
    read or update the module-level ``current_person`` dict so that related
    fields of the same record agree (sex code / first name, birth date /
    age group).

    Args:
        field_spec: Layout entry with at least "name", "width" and "type".
            Optional keys: "pattern", "choices", "range", "format"
            (dates only) and "force_zero" (amounts only).

    Returns:
        The generated value as a string. It is not guaranteed to be exactly
        ``width`` characters long (e.g. the "hicn" value has 11 characters);
        the caller is expected to pad or truncate.
    """
    width = field_spec["width"]
    ftype = field_spec["type"]
    field_name = field_spec["name"]

    if ftype == "str":
        if field_spec.get("pattern") == "H####":
            # Contract number: "H" followed by four digits
            return "H" + str(random.randint(1000, 9999))
        if "pattern" not in field_spec and "Cleanup ID" in field_name:
            # Ticket-style identifier: "TICK" followed by six digits
            return fake.bothify(text='TICK######').ljust(width)[:width]
        # Any other string field: random uppercase text of the exact width
        return fake.pystr(min_chars=width, max_chars=width).upper()

    if ftype == "hicn":
        # Nine digits + one uppercase letter + one suffix character
        digits = ''.join([str(random.randint(0, 9)) for _ in range(9)])
        letter = fake.random_letter().upper()
        suffix = fake.random_element(elements=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', '1', '2', '3', '4', '5', '6', '7', '8', '9'))
        return f"{digits}{letter}{suffix}"

    if ftype == "surname":
        # Uppercase last name cut to the field width; remembered for the record
        last_name = fake.last_name().upper()[:width]
        current_person['last_name'] = last_name
        return last_name.ljust(width)[:width]

    if ftype == "first_initial":
        # Pick a first name matching the sex code when one is already known
        sex_code = current_person.get('sex_code')
        if sex_code == 'M':
            first_name = fake.first_name_male()
        elif sex_code == 'F':
            first_name = fake.first_name_female()
        else:
            first_name = fake.first_name()
        current_person['first_name'] = first_name
        return first_name[0].upper()

    if ftype == "numeric":
        if "range" in field_spec:
            min_val, max_val = field_spec["range"]
            val = random.randint(min_val, max_val)
        else:
            # No range given: any non-negative number that fits the width
            val = random.randint(0, 10**width - 1)
        return str(val).zfill(width)

    if ftype == "state_county":
        # Two-digit state code from a fixed list + three-digit county number
        state_codes = ['01', '04', '06', '08', '12', '13', '17', '18', '22', '25', '26', '27', '29', '34', '36', '37', '39', '42', '48', '53']
        state = fake.random_element(elements=state_codes)
        county = f"{random.randint(1, 199):03d}"
        return f"{state}{county}"

    if ftype == "choice":
        choice = random.choice(field_spec.get("choices", [" "]))
        # Remember the sex code so name generation can stay consistent with it
        if field_name == "Sex Code":
            current_person['sex_code'] = choice
        return choice

    if ftype == "date":
        date_format = field_spec.get("format", "%Y%m%d")
        if date_format != "%Y%m" and "Date of Birth" in field_name:
            # Birth dates are limited to ages 65 through 93 and remembered
            # so the age-group field can be derived from them
            birth_date = fake.date_of_birth(minimum_age=65, maximum_age=93)
            current_person['birth_date'] = birth_date
            return birth_date.strftime(date_format)
        # Year-month fields draw from 2023 onward, other dates from 2020 onward
        earliest = "2023-01-01" if date_format == "%Y%m" else "2020-01-01"
        start_date = datetime.strptime(earliest, "%Y-%m-%d")
        end_date = datetime.strptime("2025-12-31", "%Y-%m-%d")
        random_date = fake.date_between(start_date=start_date, end_date=end_date)
        return random_date.strftime(date_format)

    if ftype == "decimal":
        if "Risk Adjustment Factor" in field_name:
            low, high = 0.5, 2.5
        elif "Frailty Factor" in field_name:
            if random.random() < 0.8:  # 80% chance of spaces (not applicable)
                return " " * width
            low, high = 1.0, 1.5
        else:
            low, high = 0.1, 9.9
        value = round(fake.pyfloat(min_value=low, max_value=high, right_digits=5), 5)
        return f"{value:07.5f}"

    if ftype == "amount":
        if field_spec.get("force_zero", False):
            # Zero in the same layout as real amounts for the supported widths
            if width in (8, 9, 10):
                return format_amount_field(0.0, width)
            return "0".ljust(width)

        if "Premium" in field_name or "Rebate" in field_name:
            # Premium/rebate amounts - typically smaller
            amount = fake.pyfloat(min_value=0, max_value=500, right_digits=2)
            if random.random() < 0.05:  # 5% chance of negative rebates
                amount = -amount
        elif "Risk Adjusted Amount" in field_name:
            # Risk adjusted amounts - larger payments
            amount = fake.pyfloat(min_value=800, max_value=1500, right_digits=2)
        elif "Rate" in field_name:
            # Monthly rates
            amount = fake.pyfloat(min_value=900, max_value=1200, right_digits=2)
        else:
            # General amounts
            amount = fake.pyfloat(min_value=0, max_value=999999, right_digits=2)
            if random.random() < 0.1:  # 10% chance of negative amounts
                amount = -amount

        # Sign-aware zero padding; a negative value must never come out
        # with zeros in front of the minus sign (e.g. "00-12.34").
        return format_amount_field(amount, width)

    if ftype == "age_group":
        birth_date = current_person.get('birth_date')
        if not birth_date:
            # No birth date recorded for this record: pick any age group
            age_groups = ["6569", "7074", "7579", "8084", "8589", "9094", "9599"]
            return fake.random_element(elements=age_groups)
        # Age is approximated as the difference in calendar years
        age = datetime.now().year - birth_date.year
        brackets = (
            (70, "6569"),
            (75, "7074"),
            (80, "7579"),
            (85, "8084"),
            (90, "8589"),
            (95, "9094"),
        )
        for upper_age, code in brackets:
            if age < upper_age:
                return code
        return "9599"

    # Fillers and unrecognised types are blank
    return " " * width

# Function to generate one mock row
def generate_mock_row(field_specs):
    """Build one fixed-width record from the given layout.

    Resets the module-level ``current_person`` state, pre-selects the sex
    code, then generates every field in layout order and fits each value to
    its declared width.

    Args:
        field_specs: Sequence of layout entries, each with "name", "width"
            and "type" (plus whatever keys the field generator needs).

    Returns:
        The concatenated record; its length is the sum of the field widths.
    """
    global current_person
    current_person = {}  # Reset for each new person

    # The first initial comes before the sex code in the layout, so the sex
    # code is chosen up front and made available to the name generator.
    preselected_sex = None
    for field in field_specs:
        if field["name"] == "Sex Code":
            preselected_sex = random.choice(field["choices"])
            current_person['sex_code'] = preselected_sex
            break

    pieces = []
    for field in field_specs:
        width = field["width"]

        if field["name"] == "Sex Code" and preselected_sex is not None:
            # Write the value the name was based on; drawing a fresh one here
            # would let the record's sex code contradict the first initial.
            value = preselected_sex
        else:
            value = generate_field_value(field)

        # Fit the value to the field: truncate, or pad (zeros on the left
        # for numeric fields, spaces on the right for everything else)
        text = str(value)
        if len(text) > width:
            text = text[:width]
        elif len(text) < width:
            text = text.zfill(width) if field["type"] == "numeric" else text.ljust(width)

        pieces.append(text)

    return "".join(pieces)

# Function to generate mock MMR file
def generate_mock_mmr_file(field_specs, num_rows):
    """Generate ``num_rows`` records for the given layout.

    Prints a start message and a progress message after every 100 records.

    Args:
        field_specs: Layout entries passed through to the row generator.
        num_rows: Number of records to produce.

    Returns:
        A list of record strings, one per row.
    """
    print(f"Generating {num_rows} realistic MMR records...")

    def _produce():
        # Progress is reported before building record number `done + 1`
        for done in range(num_rows):
            if done > 0 and done % 100 == 0:
                print(f"Generated {done} records...")
            yield generate_mock_row(field_specs)

    return list(_produce())

# Function to validate record length
def validate_records(records, expected_length=495):
    """Print a length check for the first five records and a short preview.

    Args:
        records: Sequence of record strings.
        expected_length: Length every record should have.

    Returns:
        None; all results are printed.
    """
    print(f"Validating record lengths (expected: {expected_length} characters)...")

    # Only the first five records are inspected
    for number, rec in enumerate(records[:5], start=1):
        size = len(rec)
        if size == expected_length:
            message = f"Record {number}: Length OK ({size} characters)"
        else:
            message = f"Record {number}: Length mismatch - Expected: {expected_length}, Actual: {size}"
        print(message)

    # Show sample data
    print(f"\nSample record preview:")
    if not records:
        return

    first = records[0]
    # (label, start, stop) slices into the first record
    preview = (
        ("Contract", 0, 5),
        ("Surname", 31, 38),
        ("First Initial", 38, 39),
        ("Sex", 39, 40),
        ("DOB", 40, 48),
    )
    for label, start, stop in preview:
        print(f"{label}: {first[start:stop]}")

# Generate the mock MMR flat file
print("=== Realistic MMR Flat File Generator with Faker (Zero Fields Updated) ===")
print(f"Total fields defined: {len(mmr_fields)}")
print(f"Expected record width: {total_width} characters")

# Show which fields are being set to zero
zero_fields = [field["name"] for field in mmr_fields if field.get("force_zero", False)]
print(f"\nFields set to zero: {zero_fields}")
print()

# Generate records
num_rows = 1000
records = generate_mock_mmr_file(mmr_fields, num_rows)

# Validate records (prints a length check for the first records and a preview)
validate_records(records)

# Write to file, one record per line
output_filename = "mmr_mock_flatfile.txt"
print(f"\nWriting {len(records)} records to {output_filename}...")

# Explicit encoding so the file does not depend on the platform default
with open(output_filename, "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in records)

print(f"✅ Successfully generated {output_filename}")
print(f"📊 Records created: {len(records)}")
print(f"📏 Record length: {len(records[0]) if records else 0} characters")

print(f"\n✅ MMR file generation complete!")
print(f"📁 Output file: {output_filename}")
print(f"🔗 Use this file with the MMR parser to convert to CSV format.")
print(f"👥 Generated realistic names, addresses, and Medicare data using Faker library")
print(f"💰 Set specified financial fields to zero for realistic data patterns")

Total record width: 495 characters
=== Realistic MMR Flat File Generator with Faker (Zero Fields Updated) ===
Total fields defined: 75
Expected record width: 495 characters

Fields set to zero: ['Medication Therapy Management (MTM) Add On', 'Part D Manufacturer Discount Program Amount', 'PACE Cost Sharing Add-on', 'MSP Reduction Amount Part A', 'MSP Reduction Amount Part B', 'Part D Coverage Gap Discount Amount']

Generating 1000 realistic MMR records...
Generated 100 records...
Generated 200 records...
Generated 300 records...
Generated 400 records...
Generated 500 records...
Generated 600 records...
Generated 700 records...
Generated 800 records...
Generated 900 records...
Validating record lengths (expected: 495 characters)...
Record 1: Length OK (495 characters)
Record 2: Length OK (495 characters)
Record 3: Length OK (495 characters)
Record 4: Length OK (495 characters)
Record 5: Length OK (495 characters)

Sample record preview:
Contract: H1409
Surname: WALKER 
First Initial: N
S