## 0. Libraries 📚

In [None]:
import pandas as pd
import ast
from utils import read_cie10_file

## 1. Load data 📥

In [None]:
# Load the ground-truth table. In the two columns below, every cell that is a
# string is parsed with ast.literal_eval; any other cell becomes an empty list.
diagnoses_df = pd.read_csv("data/ground_truth_df.csv")
for list_column in ('Codigos_diagnosticos', 'Diagnosticos_estandar'):
    diagnoses_df[list_column] = diagnoses_df[list_column].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else []
    )
diagnoses_df

In [None]:
# Load the CIE-10 mapping through the project helper and keep its keys as a set.
# NOTE(review): read_cie10_file presumably returns a dict-like object — confirm.
# NOTE(review): despite its name, cie10_codes_list is a set, not a list.
cie10_map = read_cie10_file("data/diagnosticos_tipos.csv")
cie10_codes_list = set(cie10_map.keys())

## 2. CIE-10 and CIE-9 filter methods

In [None]:
# Pattern for CIE-10 style tokens: a word boundary, one or more 'F'/'f'
# letters, digits, an optional ".digits" suffix, and a closing word boundary.
# It matches "F32.1" and "f10", and also "FF10" because of the `+` on [Ff].
CIE10_FILTER = r'\b[Ff]+\d+(?:\.\d+)?\b'
# Pattern for CIE-9 style tokens: a run of digits with an optional ".digits"
# suffix. It has no word boundaries, so it matches any number in a text
# ("23", "3.14", and the "12" inside "abc12").
CIE9_FILTER  = r'\d+(?:\.\d+)?'

In [None]:
import re

def cie10_filter(text):
    """Return True when *text* contains at least one CIE-10 style code.

    A code is whatever the module-level ``CIE10_FILTER`` pattern matches.

    Args:
        text: Value to inspect, e.g. one cell of a DataFrame column.

    Returns:
        bool: True if the pattern is found. False otherwise, including when
        *text* is not a string (for example the NaN pandas produces for an
        empty CSV cell).
    """
    # re.search raises TypeError on non-string input; treat such cells as
    # "no code" so a single missing value cannot abort a whole Series.apply.
    if not isinstance(text, str):
        return False
    return re.search(CIE10_FILTER, text) is not None

# Sample strings used to eyeball the behaviour of cie10_filter.
tests = [
    "I am 23 years old",
    "Code F32.1",
    "Code F2.1",
    "Something else f10",
    "The grade was 3.14",
    "FF10",
    "abc12",
]

# Print each sample (repr, padded to 20 columns) next to the filter result.
for sample in tests:
    has_cie10 = cie10_filter(sample)
    print(f"{sample!r:20} → {has_cie10}")

In [None]:
import re

def cie9_filter(text):
    """Return True when *text* contains a CIE-9 style code outside any CIE-10 code.

    Every ``CIE10_FILTER`` match is blanked out first; the remaining text is
    then searched with ``CIE9_FILTER``.

    Args:
        text: Value to inspect, e.g. one cell of a DataFrame column.

    Returns:
        bool: True if a CIE-9 style number remains after removing CIE-10
        codes. False otherwise, including when *text* is not a string (for
        example the NaN pandas produces for an empty CSV cell).
    """
    # Non-string cells cannot contain a code and would make `re` raise TypeError.
    if not isinstance(text, str):
        return False
    # Replace CIE-10 codes with a space so their digits are not counted as
    # CIE-9 codes and neighbouring tokens are not glued together.
    text_without_cie10 = re.sub(CIE10_FILTER, ' ', text)
    return re.search(CIE9_FILTER, text_without_cie10) is not None

# Sample strings used to eyeball the behaviour of cie9_filter.
tests = [
    "I am 23 years old",
    "Code F32.1",
    "Code F2.1",
    "Something else f10",
    "The grade was 3.14",
    "FF10",
    "abc12",
]

# Print each sample (repr, padded to 20 columns) next to the filter result.
for sample in tests:
    has_cie9 = cie9_filter(sample)
    print(f"{sample!r:20} → {has_cie9}")

## 3. CIE-10 and CIE-9 code extraction methods

In [None]:
import re
from utils import clean_cie10_code

def extract_cie10_codes(text: str) -> list[str]:
    """Collect the CIE-10 style codes found in *text*.

    Every substring matched by ``CIE10_FILTER`` is passed through
    ``clean_cie10_code``; the results are returned in order of appearance.
    """
    return [
        clean_cie10_code(found.group(0))
        for found in re.finditer(CIE10_FILTER, text)
    ]


def extract_cie9_codes(text: str) -> list[str]:
    """Collect the CIE-9 style codes found in *text*.

    CIE-10 codes (``CIE10_FILTER`` matches) are replaced by a space first, so
    only numbers outside them are returned, in order of appearance.
    """
    masked_text = re.sub(CIE10_FILTER, ' ', text)
    return re.findall(CIE9_FILTER, masked_text)

## 4. Process CIE-10 codes

In [None]:
# Build a {code: description} lookup from the semicolon-separated CIE-10 list.
cie10_list = pd.read_csv('data/cie10_list.csv', sep=';')
cie10_descriptions_by_code = cie10_list.set_index('Código')['Descripción']
cie10_dict = cie10_descriptions_by_code.to_dict()
cie10_dict

In [None]:
# Rows whose description contains an F-type code (F followed by digits and an
# optional decimal part), taken as an independent copy of the main table.
has_cie10_code = diagnoses_df['Descripcion_diagnosticos'].apply(cie10_filter)
rows_with_cie10 = diagnoses_df[has_cie10_code].copy()
rows_with_cie10

In [None]:
# Store, for every selected row, the list of CIE-10 codes found in its description.
cie10_descriptions = rows_with_cie10['Descripcion_diagnosticos']
rows_with_cie10['Extracted_codes'] = cie10_descriptions.map(extract_cie10_codes)
rows_with_cie10

In [None]:
import re


def _replace_cie10_codes(text):
    """Return *text* with each known CIE-10 code replaced by its description.

    A single regex pass rewrites each matched token exactly once. Chained
    ``str.replace`` calls work on substrings, so a short code such as "F32"
    would also rewrite the start of "F32.1", and a later code could rewrite
    text inserted by an earlier replacement.
    """

    def _describe(match):
        raw_code = match.group(0)
        # Look up the cleaned code, but replace the text exactly as written.
        # NOTE(review): assumes clean_cie10_code returns a str — confirm.
        code = clean_cie10_code(raw_code)
        code_key = code[:-2] if code.endswith('.0') else code  # Remove .0 from FX.0 codes
        code_description = cie10_dict.get(code_key)
        # Keep the original token when the code is unknown or its description
        # is not text (e.g. NaN from an empty CSV cell).
        return code_description if isinstance(code_description, str) else raw_code

    return re.sub(CIE10_FILTER, _describe, text)


rows_with_cie10["Descripcion_diagnosticos"] = (
    rows_with_cie10["Descripcion_diagnosticos"].apply(_replace_cie10_codes)
)

rows_with_cie10

In [None]:
# Copy the rewritten descriptions back into the main table, aligned on index.
# DataFrame.update is called on the frame itself: updating the Series returned
# by diagnoses_df[...] is a chained in-place operation, which leaves the
# DataFrame unchanged when pandas Copy-on-Write is active.
diagnoses_df.update(rows_with_cie10[['Descripcion_diagnosticos']])
diagnoses_df

## 5. Process CIE-9 codes

In [None]:
# Build a {code: literal} lookup from the semicolon-separated CIE-9 list.
# The CIE9 column is read as text because the lookups use string keys: left to
# type inference, all-digit codes would become integers (losing leading zeros)
# and would never match those keys.
cie9_list = pd.read_csv('data/cie9_list.csv', delimiter=';', dtype={'CIE9': str})
cie9_dict = cie9_list.set_index('CIE9')['LITERAL9'].to_dict()
cie9_dict

In [None]:
# Rows whose description contains an X.X-type code that is not preceded by
# 'F', taken as an independent copy of the main table.
has_cie9_code = diagnoses_df['Descripcion_diagnosticos'].apply(cie9_filter)
rows_with_cie9 = diagnoses_df[has_cie9_code].copy()
rows_with_cie9

In [None]:
# Store, for every selected row, the list of CIE-9 codes found in its description.
cie9_descriptions = rows_with_cie9['Descripcion_diagnosticos']
rows_with_cie9['Extracted_codes'] = cie9_descriptions.map(extract_cie9_codes)
rows_with_cie9

In [None]:
import re

# CIE-10 tokens are listed first so that the scan consumes them whole and
# their digits are never offered to the CIE-9 alternative.
_CIE10_OR_CIE9_TOKEN = re.compile(f'(?P<cie10>{CIE10_FILTER})|(?P<cie9>{CIE9_FILTER})')


def _replace_cie9_codes(text):
    """Return *text* with each known CIE-9 code replaced by its literal.

    A single regex pass rewrites each number exactly once and leaves CIE-10
    tokens untouched. Chained ``str.replace`` calls work on substrings, so a
    code such as "296" would also rewrite part of "296.20" or of a remaining
    "F29.6", and a later code could rewrite text inserted earlier.
    """

    def _describe(match):
        code = match.group('cie9')
        if code is None:
            # The token is a CIE-10 code: keep it as written.
            return match.group(0)
        # The lookup key is the code without dots, cut to four characters.
        # NOTE(review): presumably cie9_dict keys are dot-less strings of at
        # most four characters — confirm against data/cie9_list.csv.
        code_key = code.replace('.', '')[:4]
        code_description = cie9_dict.get(code_key)
        # Keep the original number when the code is unknown or its literal is
        # not text (e.g. NaN from an empty CSV cell).
        return code_description if isinstance(code_description, str) else code

    return _CIE10_OR_CIE9_TOKEN.sub(_describe, text)


rows_with_cie9["Descripcion_diagnosticos"] = (
    rows_with_cie9["Descripcion_diagnosticos"].apply(_replace_cie9_codes)
)

rows_with_cie9

In [None]:
# Copy the rewritten descriptions back into the main table, aligned on index.
# DataFrame.update is called on the frame itself: updating the Series returned
# by diagnoses_df[...] is a chained in-place operation, which leaves the
# DataFrame unchanged when pandas Copy-on-Write is active.
diagnoses_df.update(rows_with_cie9[['Descripcion_diagnosticos']])
diagnoses_df

## 6. Clean 'Descripcion_diagnosticos'

In [None]:
import pandas as pd
import re
import unicodedata

def clean_text(text):
    """Normalise a free-text description for comparison.

    Steps: convert to ``str`` and lowercase, strip accents (keeping the
    letter 'ñ'), turn every whitespace run into one space, drop every
    character that is not ``a-z``, ``ñ``, a digit, a period or a space,
    collapse the spaces left behind, and trim both ends.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        str: The cleaned text.
    """
    # Lowercase and compose, so 'Ñ' and 'n' + combining tilde both become 'ñ'
    text = unicodedata.normalize('NFC', str(text).lower())

    # Remove accents and diacritics. The ASCII fold is applied to the pieces
    # between 'ñ' characters: folding the whole string would turn 'ñ' into
    # 'n', and the 'ñ' allowed by the whitelist below would never survive.
    text = 'ñ'.join(
        unicodedata.normalize('NFKD', piece).encode('ASCII', 'ignore').decode('utf-8')
        for piece in text.split('ñ')
    )

    # Replace newlines, tabs, and multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Remove special characters but keep letters, numbers, spaces, and periods
    text = re.sub(r'[^a-zñ0-9. ]', '', text)

    # Removing a symbol that stood between two spaces leaves a double space
    # ("a - b" -> "a  b"), so collapse spaces once more
    text = re.sub(r' {2,}', ' ', text)

    # Trim leading and trailing whitespace
    return text.strip()

# Store the cleaned description in its own column next to the original one.
cleaned_descriptions = diagnoses_df['Descripcion_diagnosticos'].map(clean_text)
diagnoses_df['Descripcion_diagnosticos_limpio'] = cleaned_descriptions
diagnoses_df

## 7. Save results

In [None]:
# Write the processed table to CSV without the index column.
# NOTE(review): the target path is the same file that section 1 reads, so the
# input is overwritten in place — confirm this is intended.
diagnoses_df.to_csv("data/ground_truth_df.csv", index=False)