### **Part 0: Preprocessing** - Import dependencies, define aliases, clean data, normalize clinic names, and create utility functions

In [6]:
import pandas as pd
import numpy as np
import json
from rapidfuzz import process
from datetime import datetime

In [7]:
# Input file names
DATA_FILENAME = 'data.csv'
CLINICS_IN_RED_FILENAME = 'clinics.json'
TIER_COMPARISON_FILENAME = 'tier_comparison.json'

# Raw CSV column name -> short alias used after the rename step
ALIASES = dict(
    [
        ('CONCATENADO', 'ID'),
        ('Clinica1', 'CLINIC'),
        ('fe_declaracion', 'DATE'),
        ('procedimiento', 'PROCEDURE'),
        ('estatus_siniestro1', 'STATUS'),
        ('pais', 'COUNTRY'),
        ('Region', 'REGION'),
        ('tipo_siniestro1', 'TYPE'),
        ('monto usd', 'PAID_USD'),
    ]
)

# Raw CSV column name -> dtype used when reading the file.
# Every aliased column is read as 'object' except the paid amount, which is
# read as 'float64'. Keys and their order are the same as in ALIASES.
DATA_TYPE_DICT = {
    raw_column: ('float64' if raw_column == 'monto usd' else 'object')
    for raw_column in ALIASES
}

# Show floats with three decimals; enable pandas copy-on-write
pd.options.display.float_format = '{:.3f}'.format
pd.options.mode.copy_on_write = True

In [8]:
# read utils 
def df_data(filename):
    """
    Reads a .csv file and returns a DataFrame with the relevant columns.

    The relevant columns are the keys of DATA_TYPE_DICT. Only those that are
    actually present in the file are loaded, each with the dtype declared in
    DATA_TYPE_DICT, and they are returned in DATA_TYPE_DICT order.

    Parameters:
        filename: path of the UTF-8 encoded CSV file to read.

    Returns:
        DataFrame restricted to the relevant columns found in the file.

    Raises:
        ValueError: if none of the relevant columns exist in the file.
    """
    # Read only the header row: enough to learn the column names without
    # parsing the whole file a first time.
    header = pd.read_csv(filename, encoding='utf-8', nrows=0)

    # Print actual columns to help debug
    print("Available columns in CSV:", header.columns.tolist())

    # Relevant columns that the file really contains, in DATA_TYPE_DICT order
    existing_columns = [col for col in DATA_TYPE_DICT if col in header.columns]
    if not existing_columns:
        raise ValueError("None of the specified columns found in the CSV file!")

    # Single full read, limited to the relevant columns and their dtypes
    df = pd.read_csv(
        filename,
        usecols=existing_columns,
        dtype={col: DATA_TYPE_DICT[col] for col in existing_columns},
        encoding='utf-8',
        low_memory=False,
    )

    # usecols keeps the file's column order; reindex to DATA_TYPE_DICT order
    return df[existing_columns]

def json_data(filename):
    """
    Reads filename and returns the parsed JSON object.

    The file is decoded as UTF-8 explicitly so that non-ASCII text is read
    the same way regardless of the platform's default encoding.

    Parameters:
        filename: path of the JSON file to read.

    Returns:
        The object produced by json.load (dict, list, ...).
    """
    with open(filename, encoding='utf-8') as file:
        return json.load(file)
# transform utils

def normalized(df, column, reference, threshold=90):
    """
    Normalizes a column given reference names using closest matches.

    Each distinct non-missing value of the column is compared against
    `reference` with rapidfuzz's `process.extractOne`. If the best match
    scores at least `threshold`, the value is replaced by that reference
    name; otherwise the original value is kept. Missing values are left
    missing and are never sent to the matcher.

    Prints the number of unique values before and after normalization and
    the list of values that were changed.

    Parameters:
        df: DataFrame to normalize. NOTE: the column is overwritten on this
            same object (no copy is made).
        column: name of the column to normalize.
        reference: collection of standard names to match against.
        threshold: minimum match score required to apply a replacement.

    Returns:
        The same DataFrame object, with `column` normalized.
    """

    # Count unique values before normalization
    unique_before = df[column].nunique()

    # Build the value -> standard name mapping with confidence filtering
    mapping = {}
    normalized_log = []  # (original, replacement) pairs that actually changed
    for name in df[column].dropna().unique():
        match = process.extractOne(name, reference)
        if match and match[1] >= threshold:  # match is (choice, score, ...)
            target = match[0]
            if name != target:
                normalized_log.append((name, target))
            mapping[name] = target
        else:
            mapping[name] = name  # Retain original name if confidence is low

    # Normalize the column (values absent from the mapping stay missing)
    df[column] = df[column].map(mapping)

    # Count unique values after normalization
    unique_after = df[column].nunique()

    # Print unique counts
    print(f"Unique clinic names before normalization: {unique_before}")
    print(f"Unique clinic names after normalization: {unique_after}")

    # Print what names were normalized. repr() is used so that differences
    # made only of whitespace (e.g. a trailing space) are visible in the log.
    if normalized_log:
        print("\nNormalized Names:")
        for original, replacement in normalized_log:
            print(f"  {original!r} -> {replacement!r}")
    else:
        print("\nNo names were normalized.")

    return df

In [9]:
# Load the two JSON lookup tables as dictionaries:
#   CLINICS_IN_RED  - per clinic data, including the 'join_date' entries read by in_red()
#   TIER_COMPARSION - comparison clinics for each tier
#                     (the name is misspelled, but it is referenced under this spelling)
CLINICS_IN_RED, TIER_COMPARSION = map(
    json_data, (CLINICS_IN_RED_FILENAME, TIER_COMPARISON_FILENAME)
)

def in_red(entry):
    """
    Determines if an entry was handled in red (in the network).

    True if:
        1. Clinic is in red for entry's type and
        2. Entry happened on or after the clinic joined the red for entry's type
    False otherwise, including when the entry has no date (a missing date
    cannot be shown to fall on or after the join date).

    Parameters:
        entry: mapping/row with 'CLINIC', 'TYPE' and 'DATE' fields. 'DATE'
            must be comparable with a datetime (or be missing).

    Returns:
        bool
    """
    clinic = entry['CLINIC']
    claim_type = entry['TYPE']  # not named `type`, to avoid shadowing the builtin
    claim_date = entry['DATE']

    if clinic not in CLINICS_IN_RED:
        return False

    # Join dates of this clinic, keyed by claim type, stored as 'MM/DD/YYYY' strings
    join_dates = CLINICS_IN_RED[clinic]['join_date']
    if claim_type not in join_dates:
        return False

    # NaT/None compares False against everything, so without this guard an
    # undated entry would fall through and be reported as in red.
    if pd.isna(claim_date):
        return False

    join_date = datetime.strptime(join_dates[claim_type], '%m/%d/%Y')
    return bool(claim_date >= join_date)

# read the raw claims and switch to the short column aliases
# (rename returns a new frame; no in-place mutation of the loaded data)
df = df_data(DATA_FILENAME).rename(columns=ALIASES)

# filter: keep closed claims from the Caracas region of the two analysed types
df_filtered = df[
    (df["STATUS"] == "CERRADO") &
    (df["REGION"] == "Caracas") &
    (df["TYPE"].isin(["EMERGENCIA", "CARTA AVAL"]))
]

# transform dates to datetime objects (raises if a value is not MM/DD/YYYY)
df_filtered['DATE'] = pd.to_datetime(
        df_filtered['DATE'], format='%m/%d/%Y')

# normalize clinic names against every clinic named in the two JSON files
standard_clinic_names = set(CLINICS_IN_RED.keys()) | set(
    c for t in TIER_COMPARSION.keys() for c in TIER_COMPARSION[t]
)
# The candidates are passed as a sorted list: iteration order of a set of
# strings can differ between interpreter runs, which would make the order in
# which the matcher scans candidates (and so any tie between equally good
# matches) non-reproducible.
# NOTE: normalized() overwrites the column on df_filtered itself, so
# df_normalized and df_filtered are the same object afterwards.
df_normalized = normalized(df_filtered, "CLINIC", sorted(standard_clinic_names))

# aggregate rows into one claim per (ID, CLINIC, TYPE)
# NOTE(review): groupby drops rows whose key columns are missing by default.
df_aggregated = df_normalized.groupby(["ID", "CLINIC", "TYPE"]).agg({
    'DATE': 'min',           # Earliest date of the claim
    'PROCEDURE': list,       # List of procedures
    'PAID_USD': 'sum',       # Sum of payments
}).reset_index()

# each column has a single aggregation, so the columns are already flat;
# the only change needed is PROCEDURE -> PROCEDURES (it now holds lists)
df_aggregated = df_aggregated.rename(columns={'PROCEDURE': 'PROCEDURES'})

# classify each claim as handled in / out of the network
df_aggregated["IN_RED"] = df_aggregated.apply(in_red, axis=1)

# save to CSV (the PROCEDURES lists are written as their text representation)
df_aggregated.to_csv('data_clean.csv', index=False)

# display the result
df_aggregated.head()

Available columns in CSV: ['sucursal', 'ramo', 'poliza', 'de_motivo_siniestro', 'causa_siniestro', 'cd_pais', 'cd_usuario', 'siniestro', 'tipo_siniestro', 'estatus_siniestro', 'facturado', 'monto_usd', 'fe_ocurrencia', 'fe_declaracion', 'enfermedad', 'procedimiento', 'especialidad', 'tratamiento', 'nombre_afectado', 'cedula_afectado', 'edad', 'concepto', 'rif_contratante', 'contratante', 'cd_mediador', 'nm_mediador', 'estado', 'rif_clin', 'clinica', 'tipo_proveedor', 'fe_ingreso', 'fe_egreso', 'pais', 'ciudad', 'conexion', 'localidad', 'monto usd', 'tipo_siniestro1', 'CONCATENADO', 'Mes', 'Año', 'Region', 'Ocurrencia', 'estatus_siniestro1', 'tipo de poliza', 'tipo_proc', 'Rango edad', 'Proveedor Internacional', 'Categoría', 'SA', 'Producto', 'Clinica1', 'enfermedad1', 'tasa ocurrencia', 'Facturado_usd', 'desglose1']
Unique clinic names before normalization: 231
Unique clinic names after normalization: 228

Normalized Names:
  POLICLINICA LA ARBOLEDA, C.A -> POLICLINICA LA ARBOLEDA, C.A

Unnamed: 0,ID,CLINIC,TYPE,DATE,PROCEDURES,PAID_USD,IN_RED
0,"00673554EMBARAZO, PARTO Y PUERPERIOoctubre2023...","HOSPITAL DE CLINICAS CARACAS, C.A.",EMERGENCIA,2023-10-30,[TRATAMIENTO MEDICO CON PROCEDIMIENTO NO BAREM...,5684.0,False
1,00673554NAUSEA Y VOMITOabril2023EMERGENCIA,"HOSPITAL DE CLINICAS CARACAS, C.A.",EMERGENCIA,2023-04-09,"[TRATAMIENTO MEDICO AMBULATORIO, TRATAMIENTO M...",633.0,False
2,069701191TRAUMATISMO SUPERFICIAL DE LA CADERA ...,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,EMERGENCIA,2023-07-26,[ Esguince Tobillo],1873.0,False
3,091139498FRACTURAS QUE AFECTAN MULTIPLES REGIO...,INTERVERTEBRA C.A.,EMERGENCIA,2023-05-13,[TRATAMIENTO MEDICO CON HOSPITALIZACION],7095.0,False
4,099809108TRAUMATISMO SUPERFICIAL DE LA CADERA ...,"GRUPO MEDICO SANTA PAULA, S.A. (GMSP S.A.)",EMERGENCIA,2024-03-05,"[ Esguince Tobillo, Esguince Tobillo]",805.0,False


### **Part 1: Descriptive Analysis**

#### Part 1.1: Overview

In [11]:
# Part 1: descriptive overview of the aggregated claims

# Headline figures. Claims, clinics and amounts come from df_aggregated (one
# row per claim); distinct procedures are counted on df_normalized, where each
# row still carries a single procedure.
print(
    f"Total number of claims: {len(df_aggregated):,}",
    f"Date range: {df_aggregated['DATE'].min().strftime('%m/%d/%Y')} to {df_aggregated['DATE'].max().strftime('%m/%d/%Y')}",
    f"Number of unique clinics: {df_aggregated['CLINIC'].nunique():,}",
    f"Number of unique procedures: {df_normalized['PROCEDURE'].nunique():,}",
    f"\nTotal amount paid (USD): ${df_aggregated['PAID_USD'].sum():,.2f}",
    f"Average claim amount (USD): ${df_aggregated['PAID_USD'].mean():,.2f}",
    f"Median claim amount (USD): ${df_aggregated['PAID_USD'].median():,.2f}",
    sep="\n",
)


# Ten clinics with the most claims, each with its total and mean paid amount
top_clinics = df_aggregated['CLINIC'].value_counts().head(10)
print("Top 10 Clinics by Number of Claims:")
for clinic, count in top_clinics.items():
    # Select the clinic's payments once and take both statistics from them
    total_amount, avg_amount = (
        df_aggregated.loc[df_aggregated['CLINIC'] == clinic, 'PAID_USD']
        .agg(['sum', 'mean'])
    )
    print(
        f"\n{clinic}",
        f"  Claims: {count:,}",
        f"  Total Amount: ${total_amount:,.2f}",
        f"  Average Amount: ${avg_amount:,.2f}",
        sep="\n",
    )


Total number of claims: 47,427
Date range: 01/01/2023 to 05/30/2024
Number of unique clinics: 228
Number of unique procedures: 459

Total amount paid (USD): $135,670,167.00
Average claim amount (USD): $2,860.61
Median claim amount (USD): $1,370.00
Top 10 Clinics by Number of Claims:

A.C. CENTRO MEDICO DOCENTE LA TRINIDAD
  Claims: 5,228
  Total Amount: $19,584,993.00
  Average Amount: $3,746.17

POLICLINICA METROPOLITANA, C.A.
  Claims: 3,945
  Total Amount: $15,311,877.00
  Average Amount: $3,881.34

CLINICA SANTIAGO DE LEON, C.A.
  Claims: 3,087
  Total Amount: $7,261,662.00
  Average Amount: $2,352.34

CLINICA EL AVILA, C.A.
  Claims: 3,043
  Total Amount: $7,722,080.00
  Average Amount: $2,537.65

GRUPO MEDICO SANTA PAULA, S.A. (GMSP S.A.)
  Claims: 2,588
  Total Amount: $8,760,369.00
  Average Amount: $3,385.00

VENEURGENCIAS C.A
  Claims: 2,452
  Total Amount: $768,860.00
  Average Amount: $313.56

CENTRO CLINICO FENIX SALUD,C.A.
  Claims: 1,693
  Total Amount: $4,264,339.00
  A

In [15]:
# Thirteen most frequent procedures. Counts are taken on df_normalized (the
# frame before aggregation), so each count is a number of rows of that frame.
top_procedures = df_normalized['PROCEDURE'].value_counts().head(13)
print("Top 13 Procedures (first 3 are too broad to categorize):")
for procedure, count in top_procedures.items():
    # Select the procedure's payments once and take both statistics from them
    total_amount, avg_amount = (
        df_normalized.loc[df_normalized['PROCEDURE'] == procedure, 'PAID_USD']
        .agg(['sum', 'mean'])
    )
    print(
        f"\n{procedure}",
        f"  Claims: {count:,}",
        f"  Total Amount: ${total_amount:,.2f}",
        f"  Average Amount: ${avg_amount:,.2f}",
        sep="\n",
    )


Top 13 Procedures (first 3 are too broad to categorize):

TRATAMIENTO MEDICO AMBULATORIO
  Claims: 35,606
  Total Amount: $21,998,432.00
  Average Amount: $617.83

TRATAMIENTO MEDICO CON HOSPITALIZACION
  Claims: 21,065
  Total Amount: $38,142,437.00
  Average Amount: $1,810.70

TRATAMIENTO MEDICO CON PROCEDIMIENTO NO BAREMIZADO
  Claims: 4,301
  Total Amount: $12,603,069.00
  Average Amount: $2,930.26

Cesárea
  Claims: 1,973
  Total Amount: $4,102,508.00
  Average Amount: $2,079.32

Facomulsificación + LIO
  Claims: 1,686
  Total Amount: $1,650,476.00
  Average Amount: $978.93

Facoemulsificación de Catarata con Implante de LIO Monofocal
  Claims: 1,016
  Total Amount: $1,058,181.00
  Average Amount: $1,041.52

Colecistectomia Laparoscopica
  Claims: 900
  Total Amount: $2,186,484.00
  Average Amount: $2,429.43

 Esguince Tobillo
  Claims: 586
  Total Amount: $349,600.00
  Average Amount: $596.59

Apendicectomía Laparoscópica
  Claims: 417
  Total Amount: $1,140,762.00
  Average Amou

In [17]:
# Network statistics
# IN_RED is stored as a category column from here on (re-running this line on
# an already converted column leaves it unchanged).
df_aggregated['IN_RED'] = df_aggregated['IN_RED'].astype('category')

# Summary table per network status. `observed=False` is passed explicitly:
# grouping on a categorical column without it relies on a default that pandas
# has deprecated (it emits a FutureWarning), and spelling it out keeps the
# current result, i.e. one row per category.
in_red_stats = df_aggregated.groupby('IN_RED', observed=False).agg({
    'ID': 'count',
    'PAID_USD': ['sum', 'mean', 'median']
}).round(2)

print("Network vs Non-Network Statistics:")
for network_status in [True, False]:
    status_label = "In-Network" if network_status else "Out-of-Network"
    subset = df_aggregated[df_aggregated['IN_RED'] == network_status]

    if len(subset) > 0:
        print(f"\n{status_label}:")
        print(f"  Number of Claims: {len(subset):,}")
        print(f"  Total Amount: ${subset['PAID_USD'].sum():,.2f}")
        print(f"  Average Amount: ${subset['PAID_USD'].mean():,.2f}")
        print(f"  Median Amount: ${subset['PAID_USD'].median():,.2f}")
    else:
        # No claim has this status in the data
        print(f"\n{status_label}: No claims found")

Network vs Non-Network Statistics:

In-Network:
  Number of Claims: 1,974
  Total Amount: $5,817,076.00
  Average Amount: $2,946.85
  Median Amount: $1,485.50

Out-of-Network:
  Number of Claims: 45,453
  Total Amount: $129,853,091.00
  Average Amount: $2,856.87
  Median Amount: $1,365.00


  in_red_stats = df_aggregated.groupby('IN_RED').agg({


In [18]:
# Monthly trends: number of claims, total and mean paid amount per calendar
# month of the claim DATE.
# 'ME' (month end) replaces the 'M' alias, which pandas 2.2 deprecated and
# warns about; the resulting monthly bins are the same.
monthly_trends = df_aggregated.set_index('DATE').resample('ME').agg({
    'ID': 'count',
    'PAID_USD': ['sum', 'mean']
}).round(2)

print("Monthly Trends (Last 6 months):")
for date, stats in monthly_trends.tail(6).iterrows():
    print(f"\n{date.strftime('%B %Y')}:")
    # iterrows() returns each row as one float Series, so the count has to be
    # converted back to int to print as "2,996" instead of "2,996.0"
    print(f"  Claims: {int(stats['ID']['count']):,}")
    print(f"  Total Amount: ${stats['PAID_USD']['sum']:,.2f}")
    print(f"  Average Amount: ${stats['PAID_USD']['mean']:,.2f}")

Monthly Trends (Last 6 months):

December 2023:
  Claims: 2,996.0
  Total Amount: $6,921,204.00
  Average Amount: $2,310.15

January 2024:
  Claims: 2,744.0
  Total Amount: $6,344,332.00
  Average Amount: $2,312.07

February 2024:
  Claims: 2,272.0
  Total Amount: $5,484,953.00
  Average Amount: $2,414.15

March 2024:
  Claims: 2,326.0
  Total Amount: $5,385,472.00
  Average Amount: $2,315.34

April 2024:
  Claims: 1,345.0
  Total Amount: $3,546,887.00
  Average Amount: $2,637.09

May 2024:
  Claims: 367.0
  Total Amount: $1,132,712.00
  Average Amount: $3,086.41


  monthly_trends = df_aggregated.set_index('DATE').resample('M').agg({


#### Part 1.2: Before vs. After Joining Network (Normalized Time Frames)

#### Part 1.3: Comparison Among Clinics In vs. Out of Network By Tier (Normalized Time Frames)