In [9]:
### **Part 0: Preprocessing** - Import dependencies, define aliases, clean data, normalize clinic names, and create utility functions

import pandas as pd
import json
from rapidfuzz import process
import datetime

In [10]:
# aliases for filenames  
DATA_FILENAME = 'data.csv'
CLINICS_IN_RED_FILENAME = 'clinics.json'
TIER_COMPARISON_FILENAME = 'tier_comparison.json'

# aliases for relevant columns
ALIASES = {
    'CONCATENADO': 'ID',
    'Clinica1': 'CLINIC',
    'fe_declaracion': 'DATE',
    'procedimiento': 'PROCEDURE',
    'estatus_siniestro1': 'STATUS',
    'pais': 'COUNTRY',
    'Region': 'REGION',
    'tipo_siniestro1': 'TYPE',
    'monto usd': 'PAID_USD',   
}
# type definitions for relevant columns
DATA_TYPE_DICT = {
    'CONCATENADO':'object',
    'Clinica1':'object',
    'fe_declaracion':'object',
    'procedimiento':'object',
    'estatus_siniestro1':'object',
    'pais':'object',
    'Region':'object',
    'tipo_siniestro1':'object',
    'monto usd': 'float64',
}

# aliases for new columns
IN_RED = 'in_red' 

# format floats
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [11]:
# read utils 
def df_data(filename): 
    """
    Reads .csv file and returns DataFrame with relevant columns (declared in DATA_TYPE_DICT).
    """
    # First, read the CSV without dtype specifications to see actual columns
    df = pd.read_csv(filename, encoding='utf-8', low_memory=False)
    
    # Print actual columns to help debug
    print("Available columns in CSV:", df.columns.tolist())
    
    # Create a mapping of available columns to their dtypes
    available_dtypes = {col: DATA_TYPE_DICT[col] for col in DATA_TYPE_DICT if col in df.columns}
    
    # Read again with correct dtypes
    df = pd.read_csv(filename, dtype=available_dtypes, encoding='utf-8', low_memory=False)
    
    # Select only the columns that exist
    existing_columns = [col for col in DATA_TYPE_DICT if col in df.columns]
    if not existing_columns:
        raise ValueError("None of the specified columns found in the CSV file!")
    
    df = df[existing_columns]
    return df

def json_data(filename): 
    """
    Reads filename and returns json object. 
    """      
    with open(filename) as file:
        return json.load(file)
# transform utils

def normalized(df, column, reference, threshold=90):
    """
    Normalizes a column given a reference name using closest matches. 
    Fixes inconsistent entries that refer to the same value.
    Matches only apply if the confidence score is above the given threshold.
    Displays the number of unique clinic names before and after normalization.
    Prints which names were normalized.
    """
    
    # Count unique values before normalization
    unique_before = df[column].nunique()

    # Create the mapping with confidence filtering
    mapping = {}
    normalized_log = []  # Track normalized entries
    for clinic in df[column].unique():
        match = process.extractOne(clinic, reference)
        if match and match[1] >= threshold:  # Check confidence score
            normalized_name = match[0]
            if clinic != normalized_name:
                normalized_log.append((clinic, normalized_name))  # Log changes
            mapping[clinic] = normalized_name
        else:
            mapping[clinic] = clinic  # Retain original name if confidence is low

    # Normalize the column
    df[column] = df[column].map(mapping)

    # Count unique values after normalization
    unique_after = df[column].nunique()

    # Print unique counts
    print(f"Unique clinic names before normalization: {unique_before}")
    print(f"Unique clinic names after normalization: {unique_after}")

    # Print what names were normalized
    if normalized_log:
        print("\nNormalized Names:")
        for original, normalized in normalized_log:
            print(f"  {original} -> {normalized}")
    else:
        print("\nNo names were normalized.")

    return df

In [12]:
# read json files as dictionaries
CLINICS_IN_RED = json_data(CLINICS_IN_RED_FILENAME) # clinic to joined_dates and tier
TIER_COMPARSION = json_data(TIER_COMPARISON_FILENAME) # comparison clinics for each tier

def in_red(entry): 
    """
    Determines if an entry was handled in red. 
    True if: 
        1. Clinic is in red for entry's type and 
        2. Entry happened on or after the clinic joined the red for entry's type
    False otherwise.    
    """ 
    clinic = entry['CLINIC']
    type = entry['TYPE']
    date = entry['DATE']
    if clinic not in CLINICS_IN_RED or type not in CLINICS_IN_RED[clinic]:
        return False
    joined_date = datetime.strptime(
        CLINICS_IN_RED[clinic][type], '%m/%d/%Y')
    if date < joined_date:
        return False
    return True

# read
df = df_data(DATA_FILENAME)

# rename
df.rename(columns=ALIASES, inplace=True)

# transform dates to datetime objects
df['DATE'] = pd.to_datetime(
        df['DATE'], format='%m/%d/%Y')

# normalize
standard_clinic_names = set(CLINICS_IN_RED.keys()) | set(
    c for t in TIER_COMPARSION.keys() for c in TIER_COMPARSION[t]
)
df_normalized = normalized(df, "CLINIC", standard_clinic_names)

# classify
df_normalized["IN_RED"] = df_normalized.apply(in_red, axis=1)

# new csv
df_normalized.to_csv('data_clean.csv', index=False)

df_normalized.head()

Available columns in CSV: ['sucursal', 'ramo', 'poliza', 'de_motivo_siniestro', 'causa_siniestro', 'cd_pais', 'cd_usuario', 'siniestro', 'tipo_siniestro', 'estatus_siniestro', 'facturado', 'monto_usd', 'fe_ocurrencia', 'fe_declaracion', 'enfermedad', 'procedimiento', 'especialidad', 'tratamiento', 'nombre_afectado', 'cedula_afectado', 'edad', 'concepto', 'rif_contratante', 'contratante', 'cd_mediador', 'nm_mediador', 'estado', 'rif_clin', 'clinica', 'tipo_proveedor', 'fe_ingreso', 'fe_egreso', 'pais', 'ciudad', 'conexion', 'localidad', 'monto usd', 'tipo_siniestro1', 'CONCATENADO', 'Mes', 'Año', 'Region', 'Ocurrencia', 'estatus_siniestro1', 'tipo de poliza', 'tipo_proc', 'Rango edad', 'Proveedor Internacional', 'Categoría', 'SA', 'Producto', 'Clinica1', 'enfermedad1', 'tasa ocurrencia', 'Facturado_usd', 'desglose1']
Unique clinic names before normalization: 851
Unique clinic names after normalization: 848

Normalized Names:
  POLICLINICA LA ARBOLEDA, C.A -> POLICLINICA LA ARBOLEDA, C.A

Unnamed: 0,ID,CLINIC,DATE,PROCEDURE,STATUS,COUNTRY,REGION,TYPE,PAID_USD,IN_RED
0,V-123938832OTITIS EXTERNA Y MEDIA SUPURATIVAma...,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-03-01,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,640.0,False
1,V-174282511OTITIS EXTERNA Y MEDIA SUPURATIVAma...,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-05-09,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,640.0,False
2,V-260896121INFECCION VIRALenero2023EMERGENCIA,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-01-14,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,640.0,False
3,V-117412230OTITIS EXTERNA Y MEDIA SUPURATIVAma...,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-05-16,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,640.0,False
4,V-117357381OTITIS EXTERNA Y MEDIA SUPURATIVAen...,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-01-06,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,642.0,False


In [13]:
### Part 1: Descriptive Analysis

# Basic dataset statistics
print(f"Total number of claims: {len(df_normalized):,}")
print(f"Date range: {df_normalized['DATE'].min().strftime('%m/%d/%Y')} to {df_normalized['DATE'].max().strftime('%m/%d/%Y')}")
print(f"Number of unique clinics: {df_normalized['CLINIC'].nunique():,}")
print(f"Number of unique procedures: {df_normalized['PROCEDURE'].nunique():,}")
print(f"\nTotal amount paid (USD): ${df_normalized['PAID_USD'].sum():,.2f}")
print(f"Average claim amount (USD): ${df_normalized['PAID_USD'].mean():,.2f}")
print(f"Median claim amount (USD): ${df_normalized['PAID_USD'].median():,.2f}")


top_clinics = df_normalized['CLINIC'].value_counts().head(10)
print("Top 10 Clinics by Number of Claims:")
for clinic, count in top_clinics.items():
    total_amount = df_normalized[df_normalized['CLINIC'] == clinic]['PAID_USD'].sum()
    avg_amount = df_normalized[df_normalized['CLINIC'] == clinic]['PAID_USD'].mean()
    print(f"\n{clinic}")
    print(f"  Claims: {count:,}")
    print(f"  Total Amount: ${total_amount:,.2f}")
    print(f"  Average Amount: ${avg_amount:,.2f}")


Total number of claims: 843,573
Date range: 01/01/2023 to 06/05/2024
Number of unique clinics: 848
Number of unique procedures: 601

Total amount paid (USD): $301,558,051.00
Average claim amount (USD): $357.48
Median claim amount (USD): $42.00
Top 10 Clinics by Number of Claims:

CLINICA SANTIAGO DE LEON, C.A.
  Claims: 31,583
  Total Amount: $10,033,386.00
  Average Amount: $317.68

CENTRO CLINICO FENIX SALUD,C.A.
  Claims: 26,945
  Total Amount: $5,749,766.00
  Average Amount: $213.40

FARMACIA PLAZA BOLIVAR, C.A.
  Claims: 16,847
  Total Amount: $352,172.00
  Average Amount: $20.91

GRUPO MEDIS SANTA FE, C.A.
  Claims: 14,201
  Total Amount: $1,518,083.00
  Average Amount: $106.90

A.C. CENTRO MEDICO DOCENTE LA TRINIDAD
  Claims: 12,297
  Total Amount: $21,739,560.00
  Average Amount: $1,767.88

CLINICA EL AVILA, C.A.
  Claims: 10,707
  Total Amount: $9,300,331.00
  Average Amount: $868.62

GRUPO MEDICO SANTA PAULA, S.A. (GMSP S.A.)
  Claims: 10,650
  Total Amount: $11,961,988.00
  

In [14]:
# Top 10 procedures
top_procedures = df_normalized['PROCEDURE'].value_counts().head(10)
print("Top 10 Procedures:")
for procedure, count in top_procedures.items():
    total_amount = df_normalized[df_normalized['PROCEDURE'] == procedure]['PAID_USD'].sum()
    avg_amount = df_normalized[df_normalized['PROCEDURE'] == procedure]['PAID_USD'].mean()
    print(f"\n{procedure}")
    print(f"  Claims: {count:,}")
    print(f"  Total Amount: ${total_amount:,.2f}")
    print(f"  Average Amount: ${avg_amount:,.2f}")


Top 10 Procedures:

TRATAMIENTO MEDICO AMBULATORIO
  Claims: 493,274
  Total Amount: $93,440,889.00
  Average Amount: $189.43

TRATAMIENTO MEDICO CON HOSPITALIZACION
  Claims: 41,650
  Total Amount: $67,476,603.00
  Average Amount: $1,620.09

TRATAMIENTO MEDICO CON PROCEDIMIENTO NO BAREMIZADO
  Claims: 14,668
  Total Amount: $21,408,660.00
  Average Amount: $1,459.55

Cesárea
  Claims: 7,274
  Total Amount: $7,781,987.00
  Average Amount: $1,069.84

Facomulsificación + LIO
  Claims: 3,806
  Total Amount: $2,265,519.00
  Average Amount: $595.25

Colecistectomia Laparoscopica
  Claims: 2,905
  Total Amount: $4,082,364.00
  Average Amount: $1,405.29

Facoemulsificación de Catarata con Implante de LIO Monofocal
  Claims: 2,192
  Total Amount: $1,397,698.00
  Average Amount: $637.64

Administración de quimioterapia, intravenoso; técnica infusión, una a 8 horas, cada hora adicional
  Claims: 1,702
  Total Amount: $2,211,587.00
  Average Amount: $1,299.40

 Esguince Tobillo
  Claims: 1,232
  

In [17]:
# Network statistics
df_normalized['IN_RED'] = df_normalized['IN_RED'].astype('category')  # Convert to category
in_red_stats = df_normalized.groupby('IN_RED').agg({
    'ID': 'count',
    'PAID_USD': ['sum', 'mean', 'median']
}).round(2)

print("Network vs Non-Network Statistics:")
for network_status in [True, False]:
    status_label = "In-Network" if network_status else "Out-of-Network"
    subset = df_normalized[df_normalized['IN_RED'] == network_status]
    
    if len(subset) > 0:
        print(f"\n{status_label}:")
        print(f"  Number of Claims: {len(subset):,}")
        print(f"  Total Amount: ${subset['PAID_USD'].sum():,.2f}")
        print(f"  Average Amount: ${subset['PAID_USD'].mean():,.2f}")
        print(f"  Median Amount: ${subset['PAID_USD'].median():,.2f}")
    else:
        print(f"\n{status_label}: No claims found")

  in_red_stats = df_normalized.groupby('IN_RED').agg({


Network vs Non-Network Statistics:

In-Network: No claims found

Out-of-Network:
  Number of Claims: 843,573
  Total Amount: $301,558,051.00
  Average Amount: $357.48
  Median Amount: $42.00


In [18]:
# Monthly trends
monthly_trends = df_normalized.set_index('DATE').resample('M').agg({
    'ID': 'count',
    'PAID_USD': ['sum', 'mean']
}).round(2)

print("Monthly Trends (Last 6 months):")
for date, stats in monthly_trends.tail(6).iterrows():
    print(f"\n{date.strftime('%B %Y')}:")
    print(f"  Claims: {stats['ID']['count']:,}")
    print(f"  Total Amount: ${stats['PAID_USD']['sum']:,.2f}")
    print(f"  Average Amount: ${stats['PAID_USD']['mean']:,.2f}")

  monthly_trends = df_normalized.set_index('DATE').resample('M').agg({


Monthly Trends (Last 6 months):

January 2024:
  Claims: 45,114.0
  Total Amount: $15,064,599.00
  Average Amount: $333.92

February 2024:
  Claims: 53,205.0
  Total Amount: $15,989,357.00
  Average Amount: $300.52

March 2024:
  Claims: 51,412.0
  Total Amount: $15,867,760.00
  Average Amount: $308.64

April 2024:
  Claims: 63,331.0
  Total Amount: $17,876,695.00
  Average Amount: $282.27

May 2024:
  Claims: 65,110.0
  Total Amount: $19,264,040.00
  Average Amount: $295.87

June 2024:
  Claims: 6,277.0
  Total Amount: $1,926,773.00
  Average Amount: $307.50
