In [None]:
# clone RAPIDS AI rapidsai-csp-utils scripts repo
import os
if not os.path.exists('rapidsai-csp-utils'):
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 586, done.[K
remote: Counting objects: 100% (152/152), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 586 (delta 122), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (586/586), 191.99 KiB | 923.00 KiB/s, done.
Resolving deltas: 100% (296/296), done.


In [None]:
# install RAPIDS
# Runs the Colab install script from the cloned rapidsai-csp-utils repo, passing "0.15" as its argument.
# NOTE(review): the output captured under this cell is a "PLEASE READ FOR 21.06" notice that
# describes a revised multi-cell install flow (clone + env-check.py + further cells), so this
# invocation may no longer perform the install by itself — confirm and migrate if needed.
!bash rapidsai-csp-utils/colab/rapids-colab.sh 0.15

PLEASE READ FOR 21.06
********************************************************************************************************
Another release, another script change.  We had to revise the script, which now:
1. Does a more comprehensive install
2. Includes BlazingSQL
3. is far easier for everyone to understand and maintain

The script will require you to add these 5 cells to your notebook.  We have also created a new startup template: 
https://colab.research.google.com/drive/1TAAi_szMfWqRfHVfjGSqnGVLr_ztzUM9?usp=sharing

CHANGES T
CELL 1:
    # This get the RAPIDS-Colab install files and test check your GPU.  Run cells 1 and 2 only.
    # Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
    !python rapidsai-csp-utils/colab/env-check.py

CELL 2:
    # This will update the Colab environment and restart the kernel.
    !bash rapidsai-csp-

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSVs loaded in the next cell live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Read the train / validation / test splits from the mounted Drive folder.
splits_dir = "/content/drive/MyDrive/Diagnovision/filtered_df"
train, val, test = (
    pd.read_csv(f"{splits_dir}/{split}_df.csv")
    for split in ("train", "val", "test")
)


In [None]:
import torch
# Pick the CUDA device when PyTorch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f" Using device: {device}")

 Using device: cuda


In [None]:
# Write each split to a CSV in the working directory (without the index column);
# the training cell reads these files back by name.
for split_df, csv_name in (
    (train, 'train_final.csv'),
    (val, 'validation_final.csv'),
    (test, 'test_final.csv'),
):
    split_df.to_csv(csv_name, index=False)


In [None]:
import pandas as pd
import numpy as np
import cudf
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm.auto import tqdm
import sys

# 1. Load the already-split datasets written by the previous cell
print("Cargando datos...")
train_df, test_df, validation_df = (
    pd.read_csv(ruta)
    for ruta in ('train_final.csv', 'test_final.csv', 'validation_final.csv')
)

# 2. Extract features and labels
# Training text goes into a cuDF Series so the GPU vectorizer can consume it.
X_train = cudf.Series(train_df['clean_impression'].values)

# Names of the 12 pathology label columns
patologia_cols = [
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
]

# Label matrices for each split; columns follow the order of patologia_cols
y_train, y_test, y_validation = (
    split[patologia_cols].values
    for split in (train_df, test_df, validation_df)
)

# 3. TF-IDF vectorization of the report text on the GPU
print("Vectorizando textos en GPU...")
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# The vocabulary is fitted on the training text only; test and validation are just transformed.
X_train_tfidf = vectorizer.fit_transform(X_train)
test_text = cudf.Series(test_df['clean_impression'].values)
X_test_tfidf = vectorizer.transform(test_text)
validation_text = cudf.Series(validation_df['clean_impression'].values)
X_validation_tfidf = vectorizer.transform(validation_text)

# Convert the sparse matrices to dense arrays; the dense versions are what the
# training loop fits and predicts on.
X_train_dense, X_test_dense, X_validation_dense = (
    sparse_matrix.todense()
    for sparse_matrix in (X_train_tfidf, X_test_tfidf, X_validation_tfidf)
)

# 4. Train the multilabel classifier on the GPU: one independent random forest
#    per pathology column, each fitted on the same dense TF-IDF features.
print("Entrenando modelo en GPU (esto puede tardar)...")
models = {}            # pathology name -> fitted RandomForestClassifier
predictions_test = {}  # pathology name -> predictions on the test split
predictions_val = {}   # pathology name -> predictions on the validation split

print("Iniciando entrenamiento en GPU...")
for i, col in enumerate(tqdm(patologia_cols, desc="Entrenando modelos por patología")):
    # n_streams=1 is required for random_state to give reproducible forests:
    # cuML documents that with several build streams the result can vary from
    # run to run even when a seed is set.
    rf = RandomForestClassifier(n_estimators=100, random_state=42, n_streams=1)
    rf.fit(X_train_dense, y_train[:, i])
    models[col] = rf

    # Predict on the held-out splits right away so the evaluation cell only
    # has to read these dictionaries.
    predictions_test[col] = rf.predict(X_test_dense)
    predictions_val[col] = rf.predict(X_validation_dense)

print("Entrenamiento completado!")





Cargando datos...
Vectorizando textos en GPU...
Entrenando modelo en GPU (esto puede tardar)...
Iniciando entrenamiento en GPU...


Entrenando modelos por patología:   0%|          | 0/12 [00:00<?, ?it/s]

  return func(**kwargs)


Entrenamiento completado!


In [None]:
# 5. Model evaluation
print("Evaluando el modelo...")

# Matrices (same shape/dtype as the label matrices) holding one prediction
# column per pathology.
y_pred_test = np.zeros_like(y_test)
y_pred_val = np.zeros_like(y_validation)

for i, col in enumerate(patologia_cols):
    # presumably device (GPU) arrays: .get() is used to obtain a host copy — confirm
    y_pred_test[:, i] = predictions_test[col].get()
    y_pred_val[:, i] = predictions_val[col].get()

# Make sure the ground-truth matrices are integer-typed before scoring
y_validation = y_validation.astype(int)
y_test = y_test.astype(int)

# Same report for both held-out splits, validation first and then test.
for nombre, y_true, y_pred in (
    ("validación", y_validation, y_pred_val),
    ("prueba", y_test, y_pred_test),
):
    print(f"\nResultados en conjunto de {nombre}:")
    # Accuracy over the flattened matrices is the fraction of individual
    # (sample, pathology) cells predicted correctly, i.e. 1 - Hamming loss.
    print(f"Accuracy global: {accuracy_score(y_true.flatten(), y_pred.flatten()):.4f}")

    print(f"\nMétricas por patología ({nombre}):")
    for i, col in enumerate(patologia_cols):
        # zero_division=0 yields the same numbers as the default 'warn' but
        # without one UndefinedMetricWarning per class that is never predicted.
        report = classification_report(
            y_true[:, i], y_pred[:, i], output_dict=True, zero_division=0
        )
        promedio = report['weighted avg']
        print(f"{col}:")
        print(f"  Precision: {promedio['precision']:.4f}")
        print(f"  Recall: {promedio['recall']:.4f}")
        print(f"  F1-score: {promedio['f1-score']:.4f}")

# 6. Additional multilabel metrics
from sklearn.metrics import hamming_loss, jaccard_score

# Hamming loss (lower is better)
hl_val = hamming_loss(y_validation, y_pred_val)
hl_test = hamming_loss(y_test, y_pred_test)
print(f"\nHamming Loss (validación): {hl_val:.4f}")
print(f"Hamming Loss (prueba): {hl_test:.4f}")

# Jaccard score (higher is better). Samples with no true and no predicted
# labels score 0, exactly as under the default, just without the warning.
js_val = jaccard_score(y_validation, y_pred_val, average='samples', zero_division=0)
js_test = jaccard_score(y_test, y_pred_test, average='samples', zero_division=0)
print(f"Jaccard Score (validación): {js_val:.4f}")
print(f"Jaccard Score (prueba): {js_test:.4f}")

# 7. Visualización de resultados
import matplotlib.pyplot as plt
import seaborn as sns

# Crear matriz de confusión para cada patología (solo para visualización)
def plot_confusion_matrices(y_true, y_pred, set_name):
    """Save a 3x4 grid of per-pathology confusion-matrix heatmaps as a PNG.

    Args:
        y_true: 2-D label matrix; column ``i`` is compared for ``patologia_cols[i]``.
        y_pred: 2-D prediction matrix indexed the same way as ``y_true``.
        set_name: Suffix for the output file ``confusion_matrices_<set_name>.png``.

    The figure is written to disk and then closed; nothing is returned.
    """
    fig, grid = plt.subplots(3, 4, figsize=(20, 15))
    panels = grid.ravel()

    for idx, nombre in enumerate(patologia_cols):
        matriz = confusion_matrix(y_true[:, idx], y_pred[:, idx])
        panel = panels[idx]
        sns.heatmap(matriz, annot=True, fmt='d', cmap='Blues', ax=panel)
        panel.set_title(f'{nombre}')
        panel.set_xlabel('Predicted')
        panel.set_ylabel('True')

    fig.tight_layout()
    fig.savefig(f'confusion_matrices_{set_name}.png')
    plt.close(fig)

# confusion_matrix is imported after the function definition; this works because
# the name is only looked up when plot_confusion_matrices is actually called below.
from sklearn.metrics import confusion_matrix

# Generate and save the confusion-matrix grids for both held-out splits
for etiquetas, predicciones, nombre_conjunto in (
    (y_validation, y_pred_val, 'validation'),
    (y_test, y_pred_test, 'test'),
):
    plot_confusion_matrices(etiquetas, predicciones, nombre_conjunto)

# 8. Save the evaluation results to a UTF-8 text file
with open('model_evaluation_results.txt', 'w', encoding='utf-8') as f:
    f.write("RESULTADOS DE EVALUACIÓN DEL MODELO\n")
    f.write("==================================\n\n")

    # Same section layout for both held-out splits, validation first and then test.
    for nombre, y_true, y_pred in (
        ("validación", y_validation, y_pred_val),
        ("prueba", y_test, y_pred_test),
    ):
        f.write(f"Resultados en conjunto de {nombre}:\n")
        f.write(f"Accuracy global: {accuracy_score(y_true.flatten(), y_pred.flatten()):.4f}\n\n")

        f.write(f"Métricas por patología ({nombre}):\n")
        for i, col in enumerate(patologia_cols):
            # zero_division=0 gives the same values as the default 'warn'
            # without emitting an UndefinedMetricWarning for every pathology.
            report = classification_report(
                y_true[:, i], y_pred[:, i], output_dict=True, zero_division=0
            )
            promedio = report['weighted avg']
            f.write(f"{col}:\n")
            f.write(f"  Precision: {promedio['precision']:.4f}\n")
            f.write(f"  Recall: {promedio['recall']:.4f}\n")
            f.write(f"  F1-score: {promedio['f1-score']:.4f}\n\n")

    # Multilabel summary metrics computed earlier in this cell
    f.write(f"Hamming Loss (validación): {hl_val:.4f}\n")
    f.write(f"Hamming Loss (prueba): {hl_test:.4f}\n")
    f.write(f"Jaccard Score (validación): {js_val:.4f}\n")
    f.write(f"Jaccard Score (prueba): {js_test:.4f}\n")

print("Evaluación completa. Resultados guardados en 'model_evaluation_results.txt'")


Evaluando el modelo...

Resultados en conjunto de validación:
Accuracy global: 0.8965

Métricas por patología (validación):
Enlarged Cardiomediastinum:
  Precision: 0.9462
  Recall: 0.9430
  F1-score: 0.9170
Cardiomegaly:
  Precision: 0.7971
  Recall: 0.8928
  F1-score: 0.8423
Lung Opacity:
  Precision: 0.7778
  Recall: 0.7762
  F1-score: 0.7726
Lung Lesion:
  Precision: 0.9150
  Recall: 0.9566
  F1-score: 0.9353
Edema:
  Precision: 0.7838
  Recall: 0.7896
  F1-score: 0.6996
Consolidation:
  Precision: 0.8947
  Recall: 0.9459
  F1-score: 0.9195
Pneumonia:
  Precision: 0.9545
  Recall: 0.9770
  F1-score: 0.9656
Atelectasis:
  Precision: 0.7064
  Recall: 0.8405
  F1-score: 0.7676
Pneumothorax:
  Precision: 0.8779
  Recall: 0.9370
  F1-score: 0.9065
Pleural Effusion:
  Precision: 0.7717
  Recall: 0.7630
  F1-score: 0.7237
Pleural Other:
  Precision: 0.9741
  Recall: 0.9870
  F1-score: 0.9805
Fracture:
  Precision: 0.9452
  Recall: 0.9496
  F1-score: 0.9285

Resultados en conjunto de prueb

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Enlarged Cardiomediastinum:
  Precision: 0.9386
  Recall: 0.9442
  F1-score: 0.9195
Cardiomegaly:
  Precision: 0.8003
  Recall: 0.8946
  F1-score: 0.8449
Lung Opacity:
  Precision: 0.7889
  Recall: 0.7872
  F1-score: 0.7840
Lung Lesion:
  Precision: 0.9337
  Recall: 0.9663
  F1-score: 0.9497
Edema:
  Precision: 0.7834
  Recall: 0.7859
  F1-score: 0.6958
Consolidation:
  Precision: 0.8895
  Recall: 0.9431
  F1-score: 0.9155
Pneumonia:
  Precision: 0.9561
  Recall: 0.9778
  F1-score: 0.9668
Atelectasis:
  Precision: 0.7191
  Recall: 0.8480
  F1-score: 0.7783
Pneumothorax:
  Precision: 0.8723
  Recall: 0.9339
  F1-score: 0.9020
Pleural Effusion:
  Precision: 0.7705
  Recall: 0.7657
  F1-score: 0.7278
Pleural Other:
  Precision: 0.9739
  Recall: 0.9868
  F1-score: 0.9803
Fracture:
  Precision: 0.9406
  Recall: 0.9478
  F1-score: 0.9256

Hamming Loss (validación): 0.1035
Hamming Loss (prueba): 0.1015


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Jaccard Score (validación): 0.1939
Jaccard Score (prueba): 0.2000
Evaluación completa. Resultados guardados en 'model_evaluation_results.txt'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [None]:
def clasificar_patologias(texto_clinico, vectorizer=vectorizer, models=models, patologia_cols=patologia_cols, umbral=0.5, umbral_incierto=0.35):
    """Classify one clinical text against every pathology model.

    Args:
        texto_clinico: Report text to classify (a single string).
        vectorizer: Fitted TF-IDF vectorizer used to encode the text.
        models: Mapping from pathology name to its fitted classifier.
        patologia_cols: Pathology names to evaluate; each must be a key of ``models``.
        umbral: Probability at or above which a pathology is "Positivo".
        umbral_incierto: Lower bound of the "Incierto" band; probabilities in
            ``[umbral_incierto, umbral)`` are reported as "Incierto".

    Returns:
        dict mapping each pathology name to a dict with keys:
        ``"estado"`` ("Positivo", "Incierto" or "Negativo"), ``"valor"`` (the
        probability rounded to 4 decimals, or the hard 0/1 prediction when
        probabilities are unavailable) and ``"probabilidad"`` (True when
        ``"valor"`` is a probability).
    """
    # Encode the single text exactly as the training data was encoded
    texto_serie = cudf.Series([texto_clinico])
    texto_dense = vectorizer.transform(texto_serie).todense()

    resultados = {}

    for col in patologia_cols:
        modelo = models[col]

        try:
            # Column 1 is taken as the probability of the positive class
            prob = float(modelo.predict_proba(texto_dense)[:, 1].item())
        except Exception:
            # Best-effort fallback when probabilities cannot be obtained: use the
            # hard prediction. `Exception` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            pred = modelo.predict(texto_dense).item()
            resultados[col] = {
                "estado": "Positivo" if pred == 1 else "Negativo",
                "valor": int(pred),
                "probabilidad": False
            }
            continue

        if prob >= umbral:
            estado = "Positivo"
        elif umbral_incierto <= prob < umbral:
            estado = "Incierto"
        else:
            estado = "Negativo"

        resultados[col] = {
            "estado": estado,
            "valor": round(prob, 4),
            "probabilidad": True
        }

    return resultados

# Usage example: classify one radiology impression and print a line per pathology
nuevo_caso = "stable tracheostomy tube and redemonstration of feeding tube coursing into the upper abdomen. surgical drains overlie the neck. new right pleural pigtail catheter. stable abnormal mediastinal contour. mild improvement in right pleural effusion. increased left pleural effusion. mildly improved right basilar opacities."
resultados = clasificar_patologias(nuevo_caso)

print("Texto radiólogo:", nuevo_caso)
print("\nResultados de la predicción:")
print("__________________________________________________________")
for patologia, info in resultados.items():
    if not info['probabilidad']:
        # Hard 0/1 prediction: shown as a plain value
        print(f"{patologia}: {info['estado']} (Valor: {info['valor']})")
        continue
    # Probability available: shown with four decimals
    print(f"{patologia}: {info['estado']} (Prob: {info['valor']:.4f})")


Texto radiólogo: stable tracheostomy tube and redemonstration of feeding tube coursing into the upper abdomen. surgical drains overlie the neck. new right pleural pigtail catheter. stable abnormal mediastinal contour. mild improvement in right pleural effusion. increased left pleural effusion. mildly improved right basilar opacities.

Resultados de la predicción:
__________________________________________________________
Enlarged Cardiomediastinum: Negativo (Prob: 0.1412)
Cardiomegaly: Negativo (Prob: 0.0919)
Lung Opacity: Positivo (Prob: 0.5736)
Lung Lesion: Negativo (Prob: 0.0326)
Edema: Negativo (Prob: 0.2209)
Consolidation: Negativo (Prob: 0.0679)
Pneumonia: Negativo (Prob: 0.0197)
Atelectasis: Negativo (Prob: 0.1764)
Pneumothorax: Negativo (Prob: 0.0656)
Pleural Effusion: Incierto (Prob: 0.4574)
Pleural Other: Negativo (Prob: 0.0093)
Fracture: Negativo (Prob: 0.0322)


In [None]:

import pickle

# Bundle the per-pathology models and the fitted vectorizer and pickle them together
artefactos = {'models': models, 'vectorizer': vectorizer}
with open('trained_models.pkl', 'wb') as salida:
    pickle.dump(artefactos, salida)

print("Modelos guardados en 'trained_models.pkl'")


Modelos guardados en 'trained_models.pkl'


In [None]:
# Writes the models/vectorizer bundle to 'trained_models.pkl' (overwriting any existing file).
# This cell has no import of its own: it relies on `pickle` having been imported by an earlier cell.
with open('trained_models.pkl', 'wb') as f:
    pickle.dump(dict(models=models, vectorizer=vectorizer), f)

print("Modelos guardados en 'trained_models.pkl'")


Modelos guardados en 'trained_models.pkl'
