### **Part 0: Preprocessing**

- Import dependencies
- Use aliases for filenames, relevant columns, etc.
- Keep relevant columns
- Normalize partner clinic names
- Dump to new clean .csv files
- Define useful functions


In [86]:
# dependencies
import pandas as pd
import json
from rapidfuzz import process
import datetime

In [87]:
# input / reference file locations
DATA_FILENAME = 'data.csv'
CLINICS_IN_RED_FILENAME = 'clinics.json'
TIER_COMPARISON_FILENAME = 'tier_comparison.json'

# One row per relevant column: (raw CSV header, short alias, dtype used when reading).
# Keeping the three facts side by side means the alias map and the dtype map
# below can never list different columns.
_COLUMN_SPECS = [
    ('CONCATENADO',        'ID',        'object'),
    ('Clinica1',           'CLINIC',    'object'),
    ('fe_declaracion',     'DATE',      'object'),   # read as text here
    ('procedimiento',      'PROCEDURE', 'object'),
    ('estatus_siniestro1', 'STATUS',    'object'),
    ('pais',               'COUNTRY',   'object'),
    ('Region',             'REGION',    'object'),
    ('tipo_siniestro1',    'TYPE',      'object'),
    ('monto usd',          'PAID_USD',  'float64'),
]

# raw CSV header -> short alias
ALIASES = {raw: alias for raw, alias, _dtype in _COLUMN_SPECS}

# raw CSV header -> dtype
DATA_TYPE_DICT = {raw: dtype for raw, _alias, dtype in _COLUMN_SPECS}

# alias for a derived column
# NOTE(review): this value is lowercase, whereas the visible pipeline writes a
# column literally named "IN_RED" — confirm which spelling is intended.
IN_RED = 'in_red'

# show floats with three decimals
pd.set_option('display.float_format', '{:.3f}'.format)

In [88]:
# read utils 
def df_data(filename): 
    """
    Loads a .csv file and returns a DataFrame restricted to the relevant columns.

    The relevant columns, their order and their dtypes all come from
    DATA_TYPE_DICT; every other column in the file is discarded.
    """
    relevant_columns = list(DATA_TYPE_DICT)
    raw = pd.read_csv(
        filename,
        dtype=DATA_TYPE_DICT,
        encoding='utf-8',
        low_memory=False,
    )
    return raw[relevant_columns]

def json_data(filename): 
    """
    Reads a UTF-8 encoded JSON file and returns the parsed object.

    The encoding is set explicitly instead of relying on the platform default,
    so non-ASCII text (e.g. accented names) decodes the same way on every
    machine. This mirrors the explicit encoding='utf-8' used by the CSV reader.

    Raises whatever open() / json.load() raise (missing file, invalid JSON).
    """
    with open(filename, encoding='utf-8') as file:
        return json.load(file)
# transform utils

def normalized(df, column, reference, threshold=90):
    """
    Replaces each distinct value of `column` with its closest match in `reference`.

    A value is replaced only when the best match's confidence score is at least
    `threshold`; otherwise the value is kept as it is.

    The column is overwritten on the DataFrame that was passed in, and that same
    DataFrame is returned.

    Prints the number of unique values before and after, followed by the list of
    values that were actually changed.
    """
    n_unique_before = df[column].nunique()

    replacements = {}   # every distinct value -> value to store
    changes = []        # (old, new) pairs, only where the value really changes
    for raw_name in df[column].unique():
        best = process.extractOne(raw_name, reference)
        if not (best and best[1] >= threshold):
            # no match, or not confident enough: leave the value untouched
            replacements[raw_name] = raw_name
            continue
        canonical = best[0]
        if raw_name != canonical:
            changes.append((raw_name, canonical))
        replacements[raw_name] = canonical

    # apply the mapping to the column
    df[column] = df[column].map(replacements)

    n_unique_after = df[column].nunique()

    # report unique counts
    print(f"Unique clinic names before normalization: {n_unique_before}")
    print(f"Unique clinic names after normalization: {n_unique_after}")

    # report the individual replacements
    if not changes:
        print("\nNo names were normalized.")
    else:
        print("\nNormalized Names:")
        for old_name, new_name in changes:
            print(f"  {old_name} -> {new_name}")

    return df

In [89]:
# Load the two JSON reference files.
# CLINICS_IN_RED: keyed by clinic name; in_red() reads CLINICS_IN_RED[clinic][type]
# as a '%m/%d/%Y' join-date string. Presumably it also carries tier info — TODO confirm.
CLINICS_IN_RED = json_data(CLINICS_IN_RED_FILENAME) # clinic to joined_dates and tier
# TIER_COMPARSION: keyed by tier; each value is iterated to collect comparison clinic names.
# NOTE(review): the name is misspelled ("COMPARSION"); it is left as is because
# other cells refer to it by this exact spelling.
TIER_COMPARSION = json_data(TIER_COMPARISON_FILENAME) # comparison clinics for each tier

def in_red(entry, clinics=None): 
    """
    Determines if an entry was handled in red. 
    True if: 
        1. Clinic is in red for entry's type and 
        2. Entry happened on or after the clinic joined the red for entry's type
    False otherwise (including when the entry has no date).

    Parameters:
        entry:   row-like object supporting entry['CLINIC'], entry['TYPE'] and
                 entry['DATE']; DATE is expected to be a datetime-like value.
        clinics: optional mapping clinic -> {type: 'MM/DD/YYYY' join date}.
                 Defaults to the module-level CLINICS_IN_RED.

    Returns:
        bool
    """ 
    if clinics is None:
        clinics = CLINICS_IN_RED

    clinic = entry['CLINIC']
    entry_type = entry['TYPE']  # not named `type`, to avoid shadowing the builtin
    date = entry['DATE']

    if clinic not in clinics or entry_type not in clinics[clinic]:
        return False

    # A missing date cannot satisfy "on or after the join date". Without this
    # guard the NaT comparison below is False and the row would count as in red.
    if pd.isna(date):
        return False

    # Parsed with pandas rather than `datetime.strptime`: the notebook does
    # `import datetime` (the module), which has no `strptime` attribute, so the
    # original call fails on a fresh kernel.
    joined_date = pd.to_datetime(clinics[clinic][entry_type], format='%m/%d/%Y')
    return bool(date >= joined_date)

# read the raw export and switch to the short column names
# (chained rename instead of inplace=True; `df` ends up the same either way)
df = df_data(DATA_FILENAME).rename(columns=ALIASES)

# transform dates to datetime objects
df['DATE'] = pd.to_datetime(df['DATE'], format='%m/%d/%Y')

# normalize: the reference names are every clinic in red plus every comparison clinic
standard_clinic_names = set(CLINICS_IN_RED.keys()) | set(
    c for clinics in TIER_COMPARSION.values() for c in clinics
)
# Pass the names in sorted order: a set of strings iterates in a different order
# on every kernel restart, so ties between equally scored candidates could
# otherwise resolve differently from run to run.
df_normalized = normalized(df, "CLINIC", sorted(standard_clinic_names))

# classify each claim as handled in red or not
df_normalized["IN_RED"] = df_normalized.apply(in_red, axis=1)

# new csv
df_normalized.to_csv('data_clean.csv', index=False)

df_normalized.head()

Unique clinic names before normalization: 851
Unique clinic names after normalization: 848

Normalized Names:
  POLICLINICA LA ARBOLEDA, C.A -> POLICLINICA LA ARBOLEDA, C.A.
  INSTITUTO CLINICO LA FLORIDA C.A -> INSTITUTO CLINICO LA FLORIDA, C.A.
  CLINICA SANATRIX C.A -> CLINICA SANATRIX, C.A.


Unnamed: 0,ID,CLINIC,DATE,PROCEDURE,STATUS,COUNTRY,REGION,TYPE,PAID_USD,IN_RED
0,V-123938832OTITIS EXTERNA Y MEDIA SUPURATIVAma...,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-03-01,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,640.0,False
1,V-174282511OTITIS EXTERNA Y MEDIA SUPURATIVAma...,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-05-09,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,640.0,False
2,V-260896121INFECCION VIRALenero2023EMERGENCIA,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-01-14,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,640.0,False
3,V-117412230OTITIS EXTERNA Y MEDIA SUPURATIVAma...,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-05-16,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,640.0,False
4,V-117357381OTITIS EXTERNA Y MEDIA SUPURATIVAen...,A.C. CENTRO MEDICO DOCENTE LA TRINIDAD,2023-01-06,TRATAMIENTO MEDICO AMBULATORIO,CERRADO,VENEZUELA,Caracas,EMERGENCIA,642.0,False
