In [None]:
import json
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **NLP basic / section_impression**

In [None]:
# Load datasets
train = pd.read_csv("/content/drive/MyDrive/Diagnovision/filtered_data/train_filtered.csv")
val = pd.read_csv("/content/drive/MyDrive/Diagnovision/filtered_data/val_filtered.csv")
test= pd.read_csv("/content/drive/MyDrive/Diagnovision/filtered_data/test_filtered.csv")

In [None]:
# valores nulos en cada DataFrame
def check_nans(df, name):
    print(f"\nValores nulos en {name}:")
    print(df.isna().sum())

check_nans(train, "Train")
check_nans(val, "Validation")
check_nans(test, "Test")



Valores nulos en Train:
path_to_image                  0
deid_patient_id                0
section_impression            29
Enlarged Cardiomediastinum     0
Cardiomegaly                   0
Lung Opacity                   0
Lung Lesion                    0
Edema                          0
Consolidation                  0
Pneumonia                      0
Atelectasis                    0
Pneumothorax                   0
Pleural Effusion               0
Pleural Other                  0
Fracture                       0
dtype: int64

Valores nulos en Validation:
path_to_image                 0
deid_patient_id               0
section_impression            5
Enlarged Cardiomediastinum    0
Cardiomegaly                  0
Lung Opacity                  0
Lung Lesion                   0
Edema                         0
Consolidation                 0
Pneumonia                     0
Atelectasis                   0
Pneumothorax                  0
Pleural Effusion              0
Pleural Other        

## **section_impression findings:**

-Lowercase all text to ensure uniformity.

-Remove enumerations (1., 2.) explicitly

-Remove newline characters/formatting (\n) and extra spaces.

-Convert isolated numbers into words.

-Keep numbers relevant to clinical data (e.g., "2 cm" → "two cm").

-Remove punctuation and irrelevant text (phone numbers, timestamps)



## **2. Text cleaning**

We need to preserve the puntuaction (.) on phrases since we are going to split the text into sentences in order to analize independently to determine if diseases were present, negated, or uncertain.

In [None]:
pip install num2words


Collecting num2words
  Downloading num2words-0.5.14-py3-none-any.whl.metadata (13 kB)
Collecting docopt>=0.6.2 (from num2words)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading num2words-0.5.14-py3-none-any.whl (163 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.5/163.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=92ad2884b78dd94c45453c197cf02e06e19fcc297eb4271fef4c12e5cc770283
  Stored in directory: /root/.cache/pip/wheels/1a/b0/8c/4b75c4116c31f83c8f9f047231251e13cc74481cca4a78a9ce
Successfully built docopt
Installing collected packages: docopt, num2words
Successfully installed docopt-0.6.2 num2words-0.5.14


In [None]:
import pandas as pd
import re
from num2words import num2words

# Función para convertir números a palabras
def numbers_to_words(text):
    return re.sub(r'\b\d+\b', lambda x: num2words(int(x.group())), text)

# Función de limpieza de texto
def clean_text(text):
    if pd.isna(text):
        return text

    text = text.lower()

    # Eliminar enumeraciones numéricas de listas
    text = re.sub(r'\n+\s*\d+\.', ' ', text)
    text = re.sub(r'\n', ' ', text)

    # Eliminar números de teléfono y fechas explícitamente
    text = re.sub(r'\(?\b\d{3}\)?[-.\s]*\d{3}[-.\s]*\d{4}\b', ' ', text)
    text = re.sub(r'\b\d{1,2}-\d{1,2}-\d{4}\b', ' ', text)

    # Convertir valores numéricos independientes a palabras
    text = numbers_to_words(text)

    # Eliminar puntuación excepto puntos finales para preservar la segmentación de frases
    text = re.sub(r'[^\w\s\.]', ' ', text)  # <-- Mantener los puntos (.)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Aplicar la limpieza a cada conjunto de datos
for df in [train, val, test]:
    df['clean_impression'] = df['section_impression'].apply(clean_text)


print(train[['section_impression', 'clean_impression']].head())
print(val[['section_impression', 'clean_impression']].head())
print(test[['section_impression', 'clean_impression']].head())


                                  section_impression  \
0  \n1. STATUS POST EXTUBATION.\n2. INTERVAL DEVE...   
1  \nSTATUS POST PLACEMENT OF ENDOTRACHEAL TUBE. ...   
2   \n \n 1.   INTERVAL RETRACTION OF THE PICC LI...   
3  \n1. KNEE VIEWS DEMONSTRATE NO EVIDENCE FOR FR...   
4  \n1. SWAN-GANZ CATHETER PLACEMENT WITH DEVELOP...   

                                    clean_impression  
0  status post extubation. interval development o...  
1  status post placement of endotracheal tube. no...  
2  interval retraction of the picc line with tip ...  
3  knee views demonstrate no evidence for fractur...  
4  swan ganz catheter placement with development ...  
                                  section_impression  \
0  \n1. PORTABLE AP SUPINE VIEW OF THE CHEST DEMO...   
1  \n1. LUNGS ARE CLEAR BILATERALLY. NO EVIDENCE ...   
2  \nAP and lateral views of the lumbar spine rev...   
3  \n1. LEFT PICC LINE TIP AT DISTAL SUPERIOR VEN...   
4  \n1. SINGLE VIEW OF THE CHEST IS LIMITED SECON... 

In [None]:

print("Train DataFrame:")
print(train.info())
print(train.head())

print("\nValidation DataFrame:")
print(val.info())
print(val.head())

print("\nTest DataFrame:")
print(test.info())
print(test.head())


Train DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34144 entries, 0 to 34143
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   path_to_image               34144 non-null  object 
 1   deid_patient_id             34144 non-null  object 
 2   section_impression          34115 non-null  object 
 3   Enlarged Cardiomediastinum  34144 non-null  float64
 4   Cardiomegaly                34144 non-null  float64
 5   Lung Opacity                34144 non-null  float64
 6   Lung Lesion                 34144 non-null  float64
 7   Edema                       34144 non-null  float64
 8   Consolidation               34144 non-null  float64
 9   Pneumonia                   34144 non-null  float64
 10  Atelectasis                 34144 non-null  float64
 11  Pneumothorax                34144 non-null  float64
 12  Pleural Effusion            34144 non-null  float64
 13  Pleural Other 

In [None]:
# Guardar los DataFrames limpios para NLP SpaCY(tienen las dos columnas clean y original)
train.to_csv("/content/drive/MyDrive/train_filtered_paranlp.csv", index=False)
val.to_csv("/content/drive/MyDrive/val_filtered_paranlp.csv", index=False)
test.to_csv("/content/drive/MyDrive/test_filtered_paranlp.csv", index=False)

print("Archivos guardados correctamente:")
print("- train_filtered_cleaned.csv")
print("- val_filtered_cleaned.csv")
print("- test_filtered_cleaned.csv")


Archivos guardados correctamente:
- train_filtered_cleaned.csv
- val_filtered_cleaned.csv
- test_filtered_cleaned.csv


In [None]:
# Eliminar la columna 'section_impression' de cada DataFrame
for df in [train, val, test]:
    df.drop(columns=['section_impression'], inplace=True)

# Confirmar que la columna ha sido eliminada
print("Columnas en Train:", train.columns)
print("Columnas en Validation:", val.columns)
print("Columnas en Test:", test.columns)


Columnas en Train: Index(['path_to_image', 'deid_patient_id', 'Enlarged Cardiomediastinum',
       'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
       'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
       'Pleural Other', 'Fracture', 'clean_impression'],
      dtype='object')
Columnas en Validation: Index(['path_to_image', 'deid_patient_id', 'Enlarged Cardiomediastinum',
       'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
       'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
       'Pleural Other', 'Fracture', 'clean_impression'],
      dtype='object')
Columnas en Test: Index(['path_to_image', 'deid_patient_id', 'Enlarged Cardiomediastinum',
       'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
       'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
       'Pleural Other', 'Fracture', 'clean_impression'],
      dtype='object')


In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34144 entries, 0 to 34143
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   path_to_image               34144 non-null  object 
 1   deid_patient_id             34144 non-null  object 
 2   Enlarged Cardiomediastinum  34144 non-null  float64
 3   Cardiomegaly                34144 non-null  float64
 4   Lung Opacity                34144 non-null  float64
 5   Lung Lesion                 34144 non-null  float64
 6   Edema                       34144 non-null  float64
 7   Consolidation               34144 non-null  float64
 8   Pneumonia                   34144 non-null  float64
 9   Atelectasis                 34144 non-null  float64
 10  Pneumothorax                34144 non-null  float64
 11  Pleural Effusion            34144 non-null  float64
 12  Pleural Other               34144 non-null  float64
 13  Fracture                    341

In [None]:
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7317 entries, 0 to 7316
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   path_to_image               7317 non-null   object 
 1   deid_patient_id             7317 non-null   object 
 2   Enlarged Cardiomediastinum  7317 non-null   float64
 3   Cardiomegaly                7317 non-null   float64
 4   Lung Opacity                7317 non-null   float64
 5   Lung Lesion                 7317 non-null   float64
 6   Edema                       7317 non-null   float64
 7   Consolidation               7317 non-null   float64
 8   Pneumonia                   7317 non-null   float64
 9   Atelectasis                 7317 non-null   float64
 10  Pneumothorax                7317 non-null   float64
 11  Pleural Effusion            7317 non-null   float64
 12  Pleural Other               7317 non-null   float64
 13  Fracture                    7317 

In [None]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7317 entries, 0 to 7316
Data columns (total 15 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   path_to_image               7317 non-null   object 
 1   deid_patient_id             7317 non-null   object 
 2   Enlarged Cardiomediastinum  7317 non-null   float64
 3   Cardiomegaly                7317 non-null   float64
 4   Lung Opacity                7317 non-null   float64
 5   Lung Lesion                 7317 non-null   float64
 6   Edema                       7317 non-null   float64
 7   Consolidation               7317 non-null   float64
 8   Pneumonia                   7317 non-null   float64
 9   Atelectasis                 7317 non-null   float64
 10  Pneumothorax                7317 non-null   float64
 11  Pleural Effusion            7317 non-null   float64
 12  Pleural Other               7317 non-null   float64
 13  Fracture                    7317 

In [None]:
# Guardar los DataFrames limpios para MeDViT (tienen nans)
train.to_csv("/content/drive/MyDrive/Diagnovision/filtered_data/train_filtered_clean.csv", index=False)
val.to_csv("/content/drive/MyDrive/Diagnovision/filtered_data/val_filtered_clean.csv", index=False)
test.to_csv("/content/drive/MyDrive/Diagnovision/filtered_data/test_filtered_clean.csv", index=False)

print("Archivos guardados correctamente:")
print("- train_filtered.csv")
print("- val_filtered.csv")
print("- test_filtered.csv")

Archivos guardados correctamente:
- train_filtered.csv
- val_filtered.csv
- test_filtered.csv


## **Corpus analysis** - word count

In [None]:
# Calcular la cantidad de palabras y mostrar estadísticas
def word_count_stats(df, name):
    df['word_count'] = df['clean_impression'].apply(lambda x: len(str(x).split()))  # Evitar errores con NaN
    print(f"\nEstadísticas de word_count en {name}:")
    print(df['word_count'].describe())

word_count_stats(train, "Train")
word_count_stats(val, "Validation")
word_count_stats(test, "Test")




Estadísticas de word_count en Train:
count    34144.000000
mean        40.282978
std         23.619437
min          0.000000
25%         24.000000
50%         36.000000
75%         51.000000
max        317.000000
Name: word_count, dtype: float64

Estadísticas de word_count en Validation:
count    7317.000000
mean       40.420391
std        24.702181
min         0.000000
25%        24.000000
50%        36.000000
75%        51.000000
max       402.000000
Name: word_count, dtype: float64

Estadísticas de word_count en Test:
count    7317.000000
mean       39.738281
std        23.026935
min         0.000000
25%        24.000000
50%        36.000000
75%        51.000000
max       263.000000
Name: word_count, dtype: float64


## Frequent word analysis

In [None]:
from collections import Counter

# Calcular y mostrar las palabras más frecuentes en cada conjunto de datos
def word_frequency(df, name):
    # Unir todas las impresiones limpias en un solo texto
    all_text = ' '.join(df['clean_impression'].dropna())

    # Tokenizar en palabras
    all_words = all_text.split()

    # Contar la frecuencia de las palabras
    word_freq = Counter(all_words)

    # Obtener las 20 palabras más comunes
    common_words = word_freq.most_common(20)

    print(f"\nPalabras más frecuentes en {name}:")
    for word, freq in common_words:
        print(f"{word}: {freq}")


word_frequency(train, "Train")
word_frequency(val, "Validation")
word_frequency(test, "Test")



Palabras más frecuentes en Train:
the: 72207
of: 50367
and: 31452
with: 30544
left: 27780
right: 27223
no: 25032
is: 23254
in: 22172
lung: 17148
pleural: 16780
a: 16181
chest: 15537
pulmonary: 15514
or: 13544
interval: 13090
are: 11273
pneumothorax.: 10302
to: 10185
tube: 9218

Palabras más frecuentes en Validation:
the: 15723
of: 10896
and: 6707
with: 6598
right: 5922
left: 5858
no: 5255
is: 5002
in: 4755
lung: 3643
pleural: 3595
a: 3519
chest: 3392
pulmonary: 3275
or: 2815
interval: 2685
are: 2408
pneumothorax.: 2220
to: 2119
there: 2061

Palabras más frecuentes en Test:
the: 15171
of: 10508
and: 6759
with: 6665
right: 5761
left: 5744
no: 5226
is: 4831
in: 4683
lung: 3537
pleural: 3529
pulmonary: 3427
a: 3313
chest: 3240
or: 2887
interval: 2738
are: 2269
pneumothorax.: 2177
to: 2151
there: 1883


  Most frequent words are function words ("the", "of", "and", "with", "no", etc.). These are typical in English texts, clearly indicating we should use stop-word removal or embedding methods that naturally ignore these.

  Clinically relevant frequent terms such as "left", "right", "pleural", "lung", "pulmonary", "pneumothorax" confirm this corpus is heavily medical and specifically radiological.

  The presence of "no" suggests negations are frequent, strongly supporting the need for explicit negation detection in the NLP pipeline.

## vocabulary cardinality

In [None]:
# Calcular y mostrar # palabras únicas en cada conjunto de datos
def vocabulary_size(df, name):
    all_text = ' '.join(df['clean_impression'].dropna())
    all_words = all_text.split()
    unique_words = len(set(all_words))
    print(f"\nTamaño del vocabulario en {name}: {unique_words}")


vocabulary_size(train, "Train")
vocabulary_size(val, "Validation")
vocabulary_size(test, "Test")




Tamaño del vocabulario en Train: 9180

Tamaño del vocabulario en Validation: 5001

Tamaño del vocabulario en Test: 4916
