In [56]:
import pandas as pd

In [65]:
# Load the semicolon-delimited export and report how many rows carry a 'cmp' value.
# (A bare `auna.head()` in the middle of a cell renders nothing: only the last
# expression of a cell is displayed, so the preview call was dead code here.)
auna = pd.read_csv('auna.csv', sep=';')
total_count = len(auna)
cmp_present_count = auna['cmp'].notna().sum()
# Guard the ratio so an empty file reports 0.00% instead of computing 0/0.
cmp_present_ratio = cmp_present_count / total_count if total_count else 0.0
print(f'cmp present: {cmp_present_count} / {total_count} ({cmp_present_ratio:.2%})')


cmp present: 1012 / 1015 (99.70%)


In [66]:
# Discard rows that have no 'cmp' value, then remove the columns that are not
# used further down ('rne', 'slug', 'profile_url').
auna_clean = (
    auna
    .dropna(subset=['cmp'])
    .drop(columns=['rne', 'slug', 'profile_url'])
)

In [67]:
# Clinic display name -> clinic ID.
lima_metropolitana_clinics = {
    'Clínica Delgado Auna': 'CLIN-3',
    'Auna Guardia Civil - Clínica': 'CLIN-55',
    'Auna Bellavista - Clínica': 'CLIN-2',
}

# Name-only view of the mapping, kept as a set for fast membership checks.
lima_clinic_names = {clinic_name for clinic_name in lima_metropolitana_clinics}

# Break each comma-separated 'locations' string into a list of location names.
# The pieces are not trimmed here; is_available / map_loc_ids strip them.
auna_clean['loc_array'] = auna_clean['locations'].str.split(',')

def is_available(loc_list, clinic_names=None):
    """Return True when at least one entry of ``loc_list`` names a known clinic.

    Args:
        loc_list: List of location names, e.g. one value of the 'loc_array'
            column. Anything that is not a list yields False.
        clinic_names: Optional collection of clinic names to match against.
            Defaults to the notebook-level ``lima_clinic_names`` set, which is
            looked up at call time.

    Returns:
        bool: True if any string entry, after stripping surrounding
        whitespace, is in ``clinic_names``; otherwise False.
    """
    if not isinstance(loc_list, list):
        return False
    if clinic_names is None:
        clinic_names = lima_clinic_names
    # Non-string entries (e.g. None/NaN) are skipped instead of raising on .strip().
    return any(
        isinstance(loc, str) and loc.strip() in clinic_names
        for loc in loc_list
    )

def map_loc_ids(loc_list, clinic_ids=None):
    """Translate the known clinic names in ``loc_list`` into their clinic IDs.

    Args:
        loc_list: List of location names, e.g. one value of the 'loc_array'
            column. Anything that is not a list yields an empty list.
        clinic_ids: Optional mapping of clinic name -> clinic ID. Defaults to
            the notebook-level ``lima_metropolitana_clinics`` dict, which is
            looked up at call time.

    Returns:
        list: IDs of the entries found in ``clinic_ids``, in input order and
        with repeats preserved. Names are matched after stripping surrounding
        whitespace; unknown names and non-string entries are skipped.
    """
    if not isinstance(loc_list, list):
        return []
    if clinic_ids is None:
        clinic_ids = lima_metropolitana_clinics
    # Strip each name once; non-string entries (e.g. None/NaN) are ignored.
    stripped_names = (loc.strip() for loc in loc_list if isinstance(loc, str))
    return [clinic_ids[name] for name in stripped_names if name in clinic_ids]

# Add the per-row availability flag and the list of matched clinic IDs, and
# cast 'cmp' to integers. astype(int) raises if a value cannot be converted.
auna_clean = auna_clean.assign(
    available=lambda frame: frame['loc_array'].apply(is_available),
    loc_id_arrays=lambda frame: frame['loc_array'].apply(map_loc_ids),
    cmp=lambda frame: frame['cmp'].astype(int),
)



In [68]:
# Keep only the rows flagged as available, drop the working columns and
# expose the identifier column under the name 'CMP'.
auna_clean = (
    auna_clean[auna_clean['available']]
    .drop(columns=['locations', 'available', 'specialties', 'loc_array'])
    .rename(columns={'cmp': 'CMP'})
)

# Move 'CMP' to the front; the remaining columns keep their relative order.
columns = ['CMP', *(name for name in auna_clean.columns if name != 'CMP')]
auna_clean = auna_clean[columns]

# Write the result without the index column.
auna_clean.to_csv('auna_clean.csv', index=False)