# Data Cleaning and Question/Answer Definition

## Libraries

In [41]:
import pandas as pd
import numpy as np
import os

## Mapping Questions/Answers

In [30]:
df_fusion = pd.read_csv('../../data/raw/fusion_challenge_datos.csv', delimiter=';')
df_melanoma = pd.read_csv('../../data/raw/base_datos_melanomas.csv', delimiter=';', encoding='utf-8')

In [None]:
df_fusion

In [None]:
df_melanoma

In [None]:
df_melanoma.columns

In [None]:
# We only need one identifier for the images
df_melanoma.drop(columns=['Nº de imagen', 'Identificador de la lesión'], inplace=True)

In [None]:
df_melanoma

In [None]:
# Check for NAs/missing values
df_fusion.isna().sum()

In [None]:
df_melanoma.isna().sum()

## Dataframe translation

In [None]:
# # Function to translate values from Spanish to English
# def translate_value(val):
#     if isinstance(val, str):
#         translation = Translator(from_lang='es', to_lang='en').translate(val)
#     else:
#         translation = val
#     return translation

In [None]:
# # Function to translate different categories
# def translate_categories(categories, fromLang = 'es', toLang='en'):
#     translator = Translator(from_lang = fromLang, to_lang = toLang)
#     translations = [translator.translate(category) for category in categories]
#     return dict(zip(categories, translations))

In [None]:
# # Function to translate column names and values in the entire DataFrame
# def translate_dataframe(df, fromLang = 'es', toLang='en'):

#     # Translate column names
#     df.columns = [Translator(from_lang=fromLang, to_lang = toLang).translate(col) for col in df.columns]

#     translations = {}
#     for col in df.columns:
#         unique_categories = df[col].unique()
#         translations[col] = translate_categories(unique_categories)

#     # Map the translations back to the DataFrame
#     for col in df.columns:
#         df[col] = df[col].map(translations[col]).fillna(df[col])

#     return df

In [None]:
# df_fusion = translate_dataframe(df_fusion)
# df_fusion

In [None]:
# df_melanoma.columns = [translate_value(col) for col in df_melanoma.columns]
# df_melanoma.columns

In [None]:
# df_melanoma = df_melanoma.applymap(translate_value)

In [None]:
df_melanoma.columns

In [None]:
melanomaData_map = {

    'Nombre del archivo (.jpg)': {
        'colName': 'File name (.jpg)',
        'values': {}
    },

    'Centro de Procedencia': {
        'colName': 'Center',
        'values': {
            'IMO': 'IMO',
            'CDI': 'CDI',
            'CR': 'CR'
        }
    },
    'Equipo de dermatoscopia': {
        'colName': 'Equipment',
        'values': {
            'Molemax-II': 'Molemax-II',
            'Molemax HD': 'Molemax HD',
            'Fotofinder': 'Fotofinder',
            'Heynedermaphot': 'Heynedermaphot',
            'Dermlite Photo': 'Dermlite Photo',
            'Dermlite DL3': 'Dermlite DL3'
        }
    },
    'Tipo de Luz': {
        'colName': 'Light',
        'values': {
            'Polarizada': 'Polarized',
            'No polarizada': 'Non-polarized'
        }
    },
    'Sexo': {
        'colName': 'Gender',
        'values': {
            'Varón': 'Male',
            'Mujer': 'Female'
        }
    },
    'Edad': {
        'colName': 'Age',
        'values': {}
    },
    'Localización': {
        'colName': 'Location',
        'values': {
            'Cuello': 'Neck',
            'Tronco antero - lateral': 'Anterolateral Trunk',
            'Espalda': 'Back',
            'Miembros superiores': 'Upper limbs',
            'Miembros inferiores': 'Lower limbs'
        }
    },
    'Silueta general de la lesión': {
        'colName': 'General Silhouette',
        'values': {
            'Plana': 'Flat',
            'Aplanada sobreelevada': 'Flattened Elevated',
            'Exofítica': 'Exophytic'
        }
    },
    'Diagnóstico histológico': {
        'colName': 'Histology Diagnostic',
        'values': {
            'Melanoma in situ (MIS)': 'Melanoma in situ (MIS)',
            'Melanoma de extensión superficial (MES)': 'Superficial Spreading Melanoma (SSM)',
            'Melanoma nodular (MN)': 'Nodular Melanoma (NM)',
            'Melanoma sobre nevo congénito (MsNCg)': 'Melanoma on Congenital Nevus (MCN)',
            'Melanoma spitzoide/Nevo de Spitz maligno (MS)': 'Spitzoid Melanoma/Malignant Spitz Nevus (MSN)',
            'Nevo azul maligno (NAM)': 'Malignant Blue Nevus (MBN)',
            'Nevo de Spitz (NS)': 'Spitz Nevus (SN)',
            'Nevo de Reed (NR)': 'Reed Nevus (NR)',
            'Nevo melanocítico displásico (NMD)': 'Dysplastic Melanocytic Nevus (DMN)',
            'Nevo melanocítico lentiginoso (NML)': 'Lentiginous Melanocytic Nevus (LMN)',
            'Nevo melanocítico congénito (NMCg)': 'Congenital Melanocytic Nevus (CMN)',
            'Nevo melanocítico juntural (NMJ)': 'Junctional Melanocytic Nevus (JMN)',
            'Nevo melanocítico compuesto (NMC)': 'Compound Melanocytic Nevus (CMN)',
            'Nevo melanocítico intradérmico (NMI)': 'Intradermal Melanocytic Nevus (IMN)',
            'Nevo azul (NA)': 'Blue Nevus (BN)',
            'Lunar no atípico y estable (L)': 'Non-atypical Stable Mole (M)'
        }
    },
    'Nivel de displasia': {
        'colName': 'Dysplasia',
        'values': {
            'No aplica': 'Not Applicable',
            'No disponible': 'Not Available',
            'Severa': 'Severe'
        }
    },
    'Espesor': {
        'colName': 'Thickness',
        'values': {}
    },
    'Nivel de Clark': {
        'colName': 'Clark Level',
        'values': {
            'I (melanoma in situ)': 'I (melanoma in situ)',
            'II': 'II',
            'III': 'III',
            'IV': 'IV',
            'V': 'V'
        }
    },
    'Impresión diagnóstica': {
        'colName': 'Diagnostic Impression',
        'values': {
            'Melanoma': 'Melanoma',
            'Descartar melanoma': 'Exclude Melanoma',
            'Lunar de Spitz/Reed': 'Spitz/Reed Nevus',
            'Lunar Atípico': 'Atypical Mole',
            'Lunar no atípico': 'Non-atypical Mole'
        }
    },
    'Dificultad diagnóstica': {
        'colName': 'Diagnostic Difficulty',
        'values': {
            'Baja': 'Low',
            'Media': 'Medium',
            'Alta': 'High'
        }
    },
    'Extirpación': {
        'colName': 'Excision',
        'values': {
            'No extirpado': 'Not Excised',
            'Primera consulta': 'First Consultation',
            'Seguimento, sin dermatoscopia previa': 'Follow-up, without previous dermoscopy',
            'Seguimiento con dermatoscopia previa, por cambios': 'Follow-up with previous dermoscopy, due to changes',
            'Seguimiento con dermatoscipa previa, por síntomas': 'Follow-up with previous dermoscopy, due to symptoms'

        }
    },
    'Patrón global': {
        'colName': 'Global Pattern',
        'values': {
            'Reticular difuso': 'Diffuse Reticular',
            'Reticular parcheado': 'Patchy Reticular',
            'Globular': 'Globular',
            'Empedrado': 'Cobblestone',
            'Homogéneo': 'Homogeneous',
            'Mixto (mezcla dos de los anteriores)': 'Mixed (mix of two of the above)',
            'Multicomponente ( mezcla 3 o más anteriores )': 'Multicomponent (mix of 3 or more above)',
            'Indefinido': 'Undefined'
        }
    },
    'Patrón globular periférico': {
        'colName': 'Peripheral Globular Pattern',
        'values': {
            'Ausente': 'Absent',
            'Regular convencional': 'Regular Conventional',
            'Regular en estallido': 'Regular Bursting',
            'Irregular convencional': 'Irregular Conventional',
            'Irregular en estallido': 'Irregular Bursting',
            'Focal convencional': 'Focal Conventional',
            'Focal en estallido': 'Focal Bursting'
        }
    },
    'Simetría': {
        'colName': 'Symmetry',
        'values': {
            'Simétrico': 'Symmetric',
            'Asimétrico en un eje': 'Asymmetric on one axis',
            'Asimétrico en dos ejes': 'Asymmetric on two axes'
        }
    },
    'Retículo pigmentado': {
        'colName': 'Pigmented Reticulum',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Retículo negativo': {
        'colName': 'Negative Reticulum',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Puntos/Glóbulos pequeños': {
        'colName': 'Small Dots/Globules',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Glóbulos grandes/Empedrado': {
        'colName': 'Large Globules/Cobblestone',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Glóbulos periféricos': {
        'colName': 'Peripheral Globules',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Proyecciones periféricas': {
        'colName': 'Peripheral Projections',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Área homogénea sin vasos': {
        'colName': 'Homogeneous Area without Vessels',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Área homogénea con vasos': {
        'colName': 'Homogeneous Area with Vessels',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Área indefinida': {
        'colName': 'Undefined Area',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Regresión gris': {
        'colName': 'Gray Regression',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Regresión blanca': {
        'colName': 'White Regression',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Velo azul blanquecino': {
        'colName': 'Whitish Blue Veil',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Crisálidas': {
        'colName': 'Chrysalis',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    }
}

In [None]:
# Create a copy of the dataframe
df_melanoma_mapped = df_melanoma.copy()

# Columns to map
columns_to_map = df_melanoma.columns

# Iterate through all the columns
for col in columns_to_map:

    # Apply the mapping for the variable name using the dictionary
    newColName = melanomaData_map[col]['colName']

    # Clean all trailing blank spaces from strings before mapping
    # Example: '  Lunar no atípico y estable (L)   ' -> 'Lunar no atípico y estable (L)'
    if df_melanoma_mapped[col].dtype == 'O':
        df_melanoma_mapped[col] = df_melanoma_mapped[col].str.strip()

    # Avoid NaN generation for numerical variables or IDs without translation
    if melanomaData_map[col]['values']:
        df_melanoma_mapped[newColName] = df_melanoma_mapped[col].map(melanomaData_map[col]['values'])
    else:
        df_melanoma_mapped[newColName] = df_melanoma[col]

    # Drop the columns in Spanish
    df_melanoma_mapped.drop(columns=col, inplace=True)

In [None]:
df_melanoma_mapped.columns

In [None]:
na_sum = df_melanoma_mapped.isna().sum()
na_sum

In [None]:
# Store those columns with NAs
columns_with_na = na_sum[na_sum > 0].index.to_list()
columns_with_na

In [None]:
# Compare the amount of NAs in those columns to check if there has been NA injection
na_sum_original_melanoma = df_melanoma.isna().sum().values
na_sum_mapped_melanoma = df_melanoma_mapped.isna().sum().values

In [None]:
na_sum_comparison = pd.DataFrame([na_sum_original_melanoma, na_sum_mapped_melanoma], index=['df_melanoma', 'df_melanoma_mapped']).T
na_sum_comparison.index = df_melanoma_mapped.columns

In [None]:
na_sum_comparison

In [None]:
excision_missing_index = df_melanoma_mapped[df_melanoma_mapped['Excision'].isna()].index
global_pattern_missing_index = df_melanoma_mapped[df_melanoma_mapped['Global Pattern'].isna()].index

In [None]:
df_melanoma["Extirpación"][excision_missing_index]

In [None]:
df_melanoma["Patrón global"][global_pattern_missing_index]

In [None]:
df_melanoma_mapped

In [None]:
df_melanoma_mapped.drop(columns={'Thickness', 'Clark Level'}, inplace=True)

In [None]:
df_melanoma_mapped.columns

In [None]:
diagnostic_variables = ['General Silhouette', 'Histology Diagnostic', 'Dysplasia', 'Diagnostic Impression',
             'Diagnostic Difficulty', 'Excision', 'Global Pattern', 'Peripheral Globular Pattern',
             'Symmetry', 'Pigmented Reticulum', 'Negative Reticulum', 'Small Dots/Globules',
             'Large Globules/Cobblestone', 'Peripheral Globules', 'Peripheral Projections',
             'Homogeneous Area without Vessels', 'Homogeneous Area with Vessels', 'Undefined Area',
             'Gray Regression', 'White Regression', 'Whitish Blue Veil', 'Chrysalis']

variable_values = {}
for variable in diagnostic_variables:
    variable_values[variable] = df_melanoma_mapped[variable].unique().tolist()

In [None]:
variable_values

In [None]:
df_melanoma_mapped.to_csv('../../data/processed/df_melanoma_clean.csv', index=False)

In [31]:
fusionData_map = {

    'Nombre del archivo (.jpg)': {
        'colName': 'File name (.jpg)',
        'values': {}
    },
    
    'Patrón global': {
        'colName': 'Global Pattern',
        'values': {
            'Reticular difuso': 'Diffuse Reticular',
            'Reticular parcheado': 'Patchy Reticular',
            'Globular': 'Globular',
            'Empedrado': 'Cobblestone',
            'Homogéneo': 'Homogeneous',
            'Mixto (mezcla dos de los anteriores)': 'Mixed (mix of two of the above)',
            'Multicomponente ( mezcla 3 o más anteriores )': 'Multicomponent (mix of 3 or more above)',
            'Indefinido': 'Undefined'
        }
    },

    'Retículo pigmentado': {
        'colName': 'Pigmented Reticulum',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Retículo negativo': {
        'colName': 'Negative Reticulum',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Puntos/Glóbulos pequeños': {
        'colName': 'Small Dots/Globules',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Glóbulos grandes/Empedrado': {
        'colName': 'Large Globules/Cobblestone',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Glóbulos periféricos': {
        'colName': 'Peripheral Globules',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Proyecciones periféricas': {
        'colName': 'Peripheral Projections',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Área homogénea sin vasos': {
        'colName': 'Homogeneous Area without Vessels',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Área homogénea con vasos': {
        'colName': 'Homogeneous Area with Vessels',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Área indefinida': {
        'colName': 'Undefined Area',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Regresión gris': {
        'colName': 'Gray Regression',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Regresión blanca': {
        'colName': 'White Regression',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Velo azul blanquecino': {
        'colName': 'Whitish Blue Veil',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Crisálidas': {
        'colName': 'Chrysalis',
        'values': {
            'No': 'No',
            'Sí, sutil': 'Yes, subtle',
            'Sí, claro': 'Yes, clear',
            'Sí, claro y dominante': 'Yes, clear and dominant'
        }
    },
    'Etiqueta binaria': {
        'colName': 'Binary label',
        'values': {}
    }
}

In [32]:
# Create a copy of the dataframe
df_fusion_mapped = df_fusion.copy()

# Columns to map
columns_to_map = df_fusion.columns

# Iterate through all the columns
for col in columns_to_map:

    # Apply the mapping for the variable name using the dictionary
    newColName = fusionData_map[col]['colName']

    # Clean all trailing blank spaces from strings before mapping
    # Example: '  Lunar no atípico y estable (L)   ' -> 'Lunar no atípico y estable (L)'
    if df_fusion_mapped[col].dtype == 'O':
        df_fusion_mapped[col] = df_fusion_mapped[col].str.strip()

    # Avoid NaN generation for numerical variables or IDs without translation
    if fusionData_map[col]['values']:
        df_fusion_mapped[newColName] = df_fusion_mapped[col].map(fusionData_map[col]['values'])
    else:
        df_fusion_mapped[newColName] = df_fusion_mapped[col]

    # Drop the columns in Spanish
    df_fusion_mapped.drop(columns=col, inplace=True)

In [33]:
df_fusion_mapped.columns

Index(['File name (.jpg)', 'Global Pattern', 'Pigmented Reticulum',
       'Negative Reticulum', 'Small Dots/Globules',
       'Large Globules/Cobblestone', 'Peripheral Globules',
       'Peripheral Projections', 'Homogeneous Area without Vessels',
       'Homogeneous Area with Vessels', 'Undefined Area', 'Gray Regression',
       'White Regression', 'Whitish Blue Veil', 'Chrysalis', 'Binary label'],
      dtype='object')

In [34]:
na_sum = df_fusion_mapped.isna().sum()
na_sum

File name (.jpg)                    0
Global Pattern                      0
Pigmented Reticulum                 0
Negative Reticulum                  0
Small Dots/Globules                 0
Large Globules/Cobblestone          0
Peripheral Globules                 0
Peripheral Projections              0
Homogeneous Area without Vessels    0
Homogeneous Area with Vessels       0
Undefined Area                      0
Gray Regression                     0
White Regression                    0
Whitish Blue Veil                   0
Chrysalis                           0
Binary label                        0
dtype: int64

In [35]:
# Compare the amount of NAs in those columns to check if there has been NA injection
na_sum_original_fusion = df_fusion.isna().sum().values
na_sum_mapped_fusion = df_fusion_mapped.isna().sum().values
na_sum_comparison = pd.DataFrame([na_sum_original_fusion, na_sum_mapped_fusion], index=['df_fusion', 'df_fusion_mapped']).T
na_sum_comparison.index = df_fusion_mapped.columns
na_sum_comparison

Unnamed: 0,df_fusion,df_fusion_mapped
File name (.jpg),0,0
Global Pattern,0,0
Pigmented Reticulum,0,0
Negative Reticulum,0,0
Small Dots/Globules,0,0
Large Globules/Cobblestone,0,0
Peripheral Globules,0,0
Peripheral Projections,0,0
Homogeneous Area without Vessels,0,0
Homogeneous Area with Vessels,0,0


In [36]:
diagnostic_variables = ['Global Pattern', 'Pigmented Reticulum', 'Negative Reticulum', 'Small Dots/Globules',
             'Large Globules/Cobblestone', 'Peripheral Globules', 'Peripheral Projections',
             'Homogeneous Area without Vessels', 'Homogeneous Area with Vessels', 'Undefined Area',
             'Gray Regression', 'White Regression', 'Whitish Blue Veil', 'Chrysalis', 'Binary label']

variable_values = {}
for variable in diagnostic_variables:
    variable_values[variable] = df_fusion_mapped[variable].unique().tolist()
    print(variable_values[variable])

['Diffuse Reticular', 'Mixed (mix of two of the above)', 'Multicomponent (mix of 3 or more above)', 'Patchy Reticular', 'Globular', 'Homogeneous', 'Undefined', 'Cobblestone']
['Yes, clear and dominant', 'Yes, subtle', 'No', 'Yes, clear']
['No', 'Yes, subtle', 'Yes, clear and dominant', 'Yes, clear']
['Yes, subtle', 'No', 'Yes, clear', 'Yes, clear and dominant']
['No', 'Yes, clear and dominant', 'Yes, subtle', 'Yes, clear']
['No', 'Yes, clear', 'Yes, subtle', 'Yes, clear and dominant']
['Yes, subtle', 'No', 'Yes, clear', 'Yes, clear and dominant']
['No', 'Yes, subtle', 'Yes, clear and dominant', 'Yes, clear']
['No', 'Yes, clear and dominant', 'Yes, clear', 'Yes, subtle']
['Yes, subtle', 'No', 'Yes, clear', 'Yes, clear and dominant']
['No', 'Yes, subtle', 'Yes, clear']
['No', 'Yes, clear', 'Yes, subtle']
['No', 'Yes, subtle', 'Yes, clear']
['No', 'Yes, clear', 'Yes, subtle']
['benign', 'malignant']


In [37]:
variable_values

{'Global Pattern': ['Diffuse Reticular',
  'Mixed (mix of two of the above)',
  'Multicomponent (mix of 3 or more above)',
  'Patchy Reticular',
  'Globular',
  'Homogeneous',
  'Undefined',
  'Cobblestone'],
 'Pigmented Reticulum': ['Yes, clear and dominant',
  'Yes, subtle',
  'No',
  'Yes, clear'],
 'Negative Reticulum': ['No',
  'Yes, subtle',
  'Yes, clear and dominant',
  'Yes, clear'],
 'Small Dots/Globules': ['Yes, subtle',
  'No',
  'Yes, clear',
  'Yes, clear and dominant'],
 'Large Globules/Cobblestone': ['No',
  'Yes, clear and dominant',
  'Yes, subtle',
  'Yes, clear'],
 'Peripheral Globules': ['No',
  'Yes, clear',
  'Yes, subtle',
  'Yes, clear and dominant'],
 'Peripheral Projections': ['Yes, subtle',
  'No',
  'Yes, clear',
  'Yes, clear and dominant'],
 'Homogeneous Area without Vessels': ['No',
  'Yes, subtle',
  'Yes, clear and dominant',
  'Yes, clear'],
 'Homogeneous Area with Vessels': ['No',
  'Yes, clear and dominant',
  'Yes, clear',
  'Yes, subtle'],
 'Undef

In [38]:
df_fusion_mapped[:20]

Unnamed: 0,File name (.jpg),Global Pattern,Pigmented Reticulum,Negative Reticulum,Small Dots/Globules,Large Globules/Cobblestone,Peripheral Globules,Peripheral Projections,Homogeneous Area without Vessels,Homogeneous Area with Vessels,Undefined Area,Gray Regression,White Regression,Whitish Blue Veil,Chrysalis,Binary label
0,ISIC_0000000,Diffuse Reticular,"Yes, clear and dominant",No,"Yes, subtle",No,No,"Yes, subtle",No,No,"Yes, subtle",No,No,No,No,benign
1,ISIC_0000001,Diffuse Reticular,"Yes, clear and dominant",No,"Yes, subtle",No,No,No,"Yes, subtle",No,No,No,No,No,No,benign
2,ISIC_0000002,Mixed (mix of two of the above),"Yes, clear and dominant",No,No,No,No,No,"Yes, clear and dominant",No,"Yes, clear","Yes, subtle",No,No,No,malignant
3,ISIC_0000004,Multicomponent (mix of 3 or more above),"Yes, subtle",No,No,No,No,No,"Yes, clear and dominant","Yes, clear and dominant","Yes, clear and dominant","Yes, clear","Yes, clear","Yes, subtle",No,malignant
4,ISIC_0000006,Patchy Reticular,"Yes, clear and dominant",No,No,No,No,No,No,No,No,No,No,No,No,benign
5,ISIC_0000007,Diffuse Reticular,"Yes, clear and dominant","Yes, subtle",No,No,No,No,No,No,No,No,No,No,No,benign
6,ISIC_0000008,Globular,No,No,No,"Yes, clear and dominant","Yes, clear",No,"Yes, clear",No,No,No,No,No,No,benign
7,ISIC_0000009,Globular,"Yes, subtle",No,No,"Yes, clear and dominant",No,No,No,No,No,No,No,No,No,benign
8,ISIC_0000010,Mixed (mix of two of the above),"Yes, clear and dominant",No,No,No,No,No,"Yes, clear and dominant",No,No,No,No,No,No,benign
9,ISIC_0000011,Homogeneous,"Yes, subtle",No,No,No,No,No,"Yes, clear and dominant",No,No,No,No,No,No,benign


In [39]:
# Extract the numerical part of the "File name (.jpg)" column and convert it to integer
df_fusion_mapped["Numeric ID"] = df_fusion_mapped["File name (.jpg)"].str.split("_").str[1].astype(int)

# Sort the DataFrame based on the "Numeric ID" column
df_fusion_mapped_sorted = df_fusion_mapped.sort_values(by="Numeric ID")

# Reset the index after sorting
df_fusion_mapped_sorted = df_fusion_mapped_sorted.reset_index(drop=True)

# Drop the temporary "Numeric ID" column if needed
df_fusion_mapped_sorted.drop(columns=["Numeric ID"], inplace=True)

In [40]:
df_fusion_mapped_sorted[:50]

Unnamed: 0,File name (.jpg),Global Pattern,Pigmented Reticulum,Negative Reticulum,Small Dots/Globules,Large Globules/Cobblestone,Peripheral Globules,Peripheral Projections,Homogeneous Area without Vessels,Homogeneous Area with Vessels,Undefined Area,Gray Regression,White Regression,Whitish Blue Veil,Chrysalis,Binary label
0,ISIC_0000000,Diffuse Reticular,"Yes, clear and dominant",No,"Yes, subtle",No,No,"Yes, subtle",No,No,"Yes, subtle",No,No,No,No,benign
1,ISIC_0000001,Diffuse Reticular,"Yes, clear and dominant",No,"Yes, subtle",No,No,No,"Yes, subtle",No,No,No,No,No,No,benign
2,ISIC_0000002,Mixed (mix of two of the above),"Yes, clear and dominant",No,No,No,No,No,"Yes, clear and dominant",No,"Yes, clear","Yes, subtle",No,No,No,malignant
3,ISIC_0000004,Multicomponent (mix of 3 or more above),"Yes, subtle",No,No,No,No,No,"Yes, clear and dominant","Yes, clear and dominant","Yes, clear and dominant","Yes, clear","Yes, clear","Yes, subtle",No,malignant
4,ISIC_0000006,Patchy Reticular,"Yes, clear and dominant",No,No,No,No,No,No,No,No,No,No,No,No,benign
5,ISIC_0000007,Diffuse Reticular,"Yes, clear and dominant","Yes, subtle",No,No,No,No,No,No,No,No,No,No,No,benign
6,ISIC_0000008,Globular,No,No,No,"Yes, clear and dominant","Yes, clear",No,"Yes, clear",No,No,No,No,No,No,benign
7,ISIC_0000009,Globular,"Yes, subtle",No,No,"Yes, clear and dominant",No,No,No,No,No,No,No,No,No,benign
8,ISIC_0000010,Mixed (mix of two of the above),"Yes, clear and dominant",No,No,No,No,No,"Yes, clear and dominant",No,No,No,No,No,No,benign
9,ISIC_0000011,Homogeneous,"Yes, subtle",No,No,No,No,No,"Yes, clear and dominant",No,No,No,No,No,No,benign


In [123]:
# Define the paths to the train and test folders
train_folder = '/Users/JUAN/Desktop/databases_qa/ISIC_2016/images/ISBI2016_ISIC_Part3_Training_Data_orig'
test_folder = '/Users/JUAN/Desktop/databases_qa/ISIC_2016/images/ISBI2016_ISIC_Part1_Test_Data_orig'

# Get the list of image filenames in the train and test folders
train_files = os.listdir(train_folder)
test_files = os.listdir(test_folder)

# Load the CSV file
df = df_fusion_mapped_sorted.copy()

In [136]:
test_files

['ISIC_0002829.jpg',
 'ISIC_0010587.jpg',
 'ISIC_0009954.jpg',
 'ISIC_0010034.jpg',
 'ISIC_0000227.jpg',
 'ISIC_0010020.jpg',
 'ISIC_0000233.jpg',
 'ISIC_0010183.jpg',
 'ISIC_0001299.jpg',
 'ISIC_0005000.jpg',
 'ISIC_0000420.jpg',
 'ISIC_0010009.jpg',
 'ISIC_0000226.jpg',
 'ISIC_0009564.jpg',
 'ISIC_0000540.jpg',
 'ISIC_0009955.jpg',
 'ISIC_0010592.jpg',
 'ISIC_0011101.jpg',
 'ISIC_0009982.jpg',
 'ISIC_0011129.jpg',
 'ISIC_0010553.jpg',
 'ISIC_0009994.jpg',
 'ISIC_0009980.jpg',
 'ISIC_0010584.jpg',
 'ISIC_0009943.jpg',
 'ISIC_0010023.jpg',
 'ISIC_0000230.jpg',
 'ISIC_0010037.jpg',
 'ISIC_0000378.jpg',
 'ISIC_0000393.jpg',
 'ISIC_0010180.jpg',
 'ISIC_0010369.jpg',
 'ISIC_0011089.jpg',
 'ISIC_0010368.jpg',
 'ISIC_0010340.jpg',
 'ISIC_0000392.jpg',
 'ISIC_0011300.jpg',
 'ISIC_0009201.jpg',
 'ISIC_0000231.jpg',
 'ISIC_0009956.jpg',
 'ISIC_0010591.jpg',
 'ISIC_0010234.jpg',
 'ISIC_0000027.jpg',
 'ISIC_0010552.jpg',
 'ISIC_0011112.jpg',
 'ISIC_0000037.jpg',
 'ISIC_0010556.jpg',
 'ISIC_000002

In [125]:
# Create a mapping dictionary for the train and test filenames
train_mapping = {filename.split('.')[0]: f"train_{i:07d}" for i, filename in enumerate(train_files)}
test_mapping = {filename.split('.')[0]: f"test_{i:07d}" for i, filename in enumerate(test_files)}

In [126]:
test_mapping

{'ISIC_0002829': 'test_0000000',
 'ISIC_0010587': 'test_0000001',
 'ISIC_0009954': 'test_0000002',
 'ISIC_0010034': 'test_0000003',
 'ISIC_0000227': 'test_0000004',
 'ISIC_0010020': 'test_0000005',
 'ISIC_0000233': 'test_0000006',
 'ISIC_0010183': 'test_0000007',
 'ISIC_0001299': 'test_0000008',
 'ISIC_0005000': 'test_0000009',
 'ISIC_0000420': 'test_0000010',
 'ISIC_0010009': 'test_0000011',
 'ISIC_0000226': 'test_0000012',
 'ISIC_0009564': 'test_0000013',
 'ISIC_0000540': 'test_0000014',
 'ISIC_0009955': 'test_0000015',
 'ISIC_0010592': 'test_0000016',
 'ISIC_0011101': 'test_0000017',
 'ISIC_0009982': 'test_0000018',
 'ISIC_0011129': 'test_0000019',
 'ISIC_0010553': 'test_0000020',
 'ISIC_0009994': 'test_0000021',
 'ISIC_0009980': 'test_0000022',
 'ISIC_0010584': 'test_0000023',
 'ISIC_0009943': 'test_0000024',
 'ISIC_0010023': 'test_0000025',
 'ISIC_0000230': 'test_0000026',
 'ISIC_0010037': 'test_0000027',
 'ISIC_0000378': 'test_0000028',
 'ISIC_0000393': 'test_0000029',
 'ISIC_001

In [142]:
# Iterate over each observation in the dataframe
for index, row in df.iterrows():
    file_name = row['File name (.jpg)']
    
    # Check if the file name is in the train_mapping dictionary
    if file_name in train_mapping:
        new_id = train_mapping[file_name]
    # Check if the file name is in the test_mapping dictionary
    elif file_name in test_mapping:
        new_id = test_mapping[file_name]
    else:
        new_id = None
    
    # Update the 'New Image ID' variable with the new_id value
    df.loc[index, 'New Image ID'] = new_id


In [143]:
# Reorder the columns with 'New Image ID' in the second position
column_order = ['File name (.jpg)', 'New Image ID'] + [col for col in df.columns if col not in ['File name (.jpg)', 'New Image ID']]
df = df.reindex(columns=column_order)
# Filter train data
train_df = df[df['New Image ID'].str.startswith('train', na=False)]
# Filter test data
test_df = df[df['New Image ID'].str.startswith('test', na=False)]

In [146]:
train_df

Unnamed: 0,File name (.jpg),New Image ID,Global Pattern,Pigmented Reticulum,Negative Reticulum,Small Dots/Globules,Large Globules/Cobblestone,Peripheral Globules,Peripheral Projections,Homogeneous Area without Vessels,Homogeneous Area with Vessels,Undefined Area,Gray Regression,White Regression,Whitish Blue Veil,Chrysalis,Binary label
2,ISIC_0000002,train_0000398,Mixed (mix of two of the above),"Yes, clear and dominant",No,No,No,No,No,"Yes, clear and dominant",No,"Yes, clear","Yes, subtle",No,No,No,malignant
3,ISIC_0000004,train_0000347,Multicomponent (mix of 3 or more above),"Yes, subtle",No,No,No,No,No,"Yes, clear and dominant","Yes, clear and dominant","Yes, clear and dominant","Yes, clear","Yes, clear","Yes, subtle",No,malignant
14,ISIC_0000021,train_0000103,Diffuse Reticular,"Yes, clear and dominant",No,"Yes, clear",No,No,No,No,No,"Yes, subtle",No,"Yes, subtle",No,No,benign
17,ISIC_0000026,train_0000033,Multicomponent (mix of 3 or more above),"Yes, subtle",No,"Yes, clear",No,No,No,"Yes, clear and dominant","Yes, clear and dominant","Yes, clear",No,"Yes, clear",No,No,malignant
18,ISIC_0000028,train_0000439,Diffuse Reticular,"Yes, clear and dominant",No,No,No,No,No,"Yes, subtle",No,No,No,No,No,No,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,ISIC_0011387,train_0000338,Undefined,No,No,No,No,No,No,No,No,"Yes, clear and dominant","Yes, subtle",No,No,No,malignant
894,ISIC_0011390,train_0000310,Undefined,"Yes, subtle",No,"Yes, clear",No,No,No,No,No,No,"Yes, subtle",No,No,No,malignant
896,ISIC_0011397,train_0000246,Undefined,No,No,"Yes, clear",No,"Yes, subtle",No,No,No,No,"Yes, clear",No,No,No,benign
897,ISIC_0011398,train_0000179,Diffuse Reticular,"Yes, clear and dominant",No,"Yes, subtle",No,No,No,No,No,No,No,No,No,No,benign


In [None]:
df_fusion_mapped.to_csv('../../data/processed/df_fusion_clean.csv', index=False)