## **Multimodal: Text and Image using GloVe and VGG16**

In [131]:
import tensorflow as tf
import os

# FORZAR usar solo GPU 0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

print("✅ Usando solo GPU 0")
print("✅ GPUs visibles:", tf.config.list_physical_devices('GPU'))

✅ Usando solo GPU 0
✅ GPUs visibles: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]


In [132]:
import os

# Configurar TensorFlow para usar menos memoria de GPU
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [109]:
import tensorflow as tf
import gc
import os

# Limpiar memoria sin modificar configuraciones de GPU existentes
tf.keras.backend.clear_session()
gc.collect()

# Verificar estado actual de GPU
print("=== Estado GPU ===")
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    print(f"✅ GPUs detectadas: {len(gpus)}")
    for gpu in gpus:
        print(f"  - {gpu}")
        
    # Verificar uso de memoria
    try:
        from tensorflow.python.client import device_lib
        print(f"✅ GPU disponible para uso")
    except:
        print("⚠️  No se puede verificar detalles de GPU")
else:
    print("❌ No se detectaron GPUs")

=== Estado GPU ===
✅ GPUs detectadas: 2
  - PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
  - PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')
✅ GPU disponible para uso


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import random
from tensorflow.keras.utils import to_categorical
from PIL import Image
import numpy as np

### **Preprocesamiento**

#### **Text**
"we preprocess them by removing stop words, non-ASCII charachters, numbers, URLs and hashtag signs. we also replace all punctuation marks with white spaces"

In [111]:
df_text = pd.read_csv('/home/jacruz/crisis/data/crisis_texts_dataset.csv')
df_text.head()

Unnamed: 0,tweet_id,text_info,text_info_conf,tweet_text
0,917791044158185473,informative,1.0,RT @Gizmodo: Wildfires raging through Northern...
1,917791130590183424,informative,1.0,PHOTOS: Deadly wildfires rage in California ht...
2,917791291823591425,informative,0.6813,RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...
3,917792092100988929,informative,0.6727,RT @TIME: California's raging wildfires as you...
4,917792147700465664,informative,0.7143,Wildfires Threaten Californiaâ€™s First Legal ...


In [112]:
# Para el manejo de stopwords

#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.update(['rt', 'RT'])

print(stop_words)


{'does', "doesn't", 'further', 'him', 'very', 'i', "you'll", 'not', 'mightn', 'me', 'we', 'd', 'are', 'is', 'at', 'isn', 'too', 'between', 'hadn', 'doing', 'needn', 'ma', 'any', 'few', 'you', "they've", 'hasn', 'will', 'against', "needn't", 'while', "you've", 'on', 'ain', 'he', 'each', 'RT', "mightn't", 'by', 'yourselves', "it's", "won't", "shouldn't", 'such', 'own', 'as', 'both', "i've", 'm', 'o', 'this', 'myself', 'mustn', "i'm", 'his', 'hers', 'did', 'again', 'an', 'whom', 'itself', 'that', 'down', 'weren', 'off', 'haven', 'no', "wouldn't", "i'd", 'of', 'my', 'their', 'had', 'she', 'was', 'being', "hasn't", 'into', 'nor', 'now', "haven't", 'won', 'here', 'rt', 'ours', "should've", "they'd", "he'd", "wasn't", 'why', 'who', "it'd", 'with', 'our', 'more', "she'll", 'a', 'shouldn', "weren't", 'because', 'through', 'himself', "that'll", 'after', 'just', 'most', 'the', 'has', 'were', 'been', 'don', "she'd", 'when', 'until', "couldn't", 've', "we'll", 'once', 'ourselves', 'these', "we'd", 

In [113]:
def clean_text(text):
    # Eliminar URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Eliminar caracteres no ASCII
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Eliminar números
    text = re.sub(r'\d+', '', text)
    # Reemplazar signos de puntuación con espacios
    text = re.sub(r'[^\w\s]', ' ', text)
    # Convertir a minúsculas y eliminar stopwords
    text = ' '.join([word.lower() for word in text.split() if word.lower() not in stop_words])
    return text

In [114]:
df_text['clean_text'] = df_text['tweet_text'].apply(clean_text)
df_text.head(100)

Unnamed: 0,tweet_id,text_info,text_info_conf,tweet_text,clean_text
0,917791044158185473,informative,1.0000,RT @Gizmodo: Wildfires raging through Northern...,gizmodo wildfires raging northern california t...
1,917791130590183424,informative,1.0000,PHOTOS: Deadly wildfires rage in California ht...,photos deadly wildfires rage california
2,917791291823591425,informative,0.6813,RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...,cal_oes pls share capturing wildfire response ...
3,917792092100988929,informative,0.6727,RT @TIME: California's raging wildfires as you...,time california raging wildfires never seen
4,917792147700465664,informative,0.7143,Wildfires Threaten Californiaâ€™s First Legal ...,wildfires threaten californias first legal can...
...,...,...,...,...,...
95,917833987824979968,informative,0.6783,Satellite image of California last night. Thos...,satellite image california last night arent li...
96,917834588814155776,informative,0.6854,RT @AuroraWorldView: Deadly #California #wildf...,auroraworldview deadly california wildfires fo...
97,917834628290920448,informative,0.6667,Fire swept through Northern California early M...,fire swept northern california early monday mo...
98,917835067069788162,informative,1.0000,"Couple, ages 100 and 98, die in CaliforniaÂ wi...",couple ages die california wildfire


In [115]:
# Muestra aleatoria de textos limpios
random.sample(list(df_text['clean_text']), 10)

['vatf yesterday eastern puerto rico red squad ks hurricanemaria rescuedogs',
 'lebron cavs may sounding like jim mora soon hope win game nbafinals',
 'gophers basketball pav benefit puerto rico',
 'mommy dont cry year old girl surviving hurricane maria unicef connect',
 'emily swick reports stuco efforts perrysburg help harvey victims',
 'proud pdalesd student council ss making difference hurricaneharvey relief achieves tigerready',
 'hometown jayuya pr destroyed hurricane maria',
 'early evening wednesday forecast track hurricane maria wsav wsavkrisa kyledenniswx wsavariellas',
 'bid twofux makeup case auction benef glamberts affected hurricanes harvey irma']

#### **Image**
- Redimensionar y normalizar (224x224)
- Valores de los pixeles entre 0 y 1
- Normalizar los canales de color con respecto al dataset ImageNet

In [116]:
# Los datos ya fueron preprocesados y guardados como archivos .npy
# Falta cargar los datos
default_path = "/home/jacruz/crisis/data/preprocessed_images/"
df = pd.read_csv("/home/jacruz/crisis/data/crisis_images_dataset.csv")

image_names = df['image_name'].tolist()
labels = df['image_info'].tolist()

In [117]:
image_paths = [default_path + name + '.npy' for name in image_names]
image_paths[:5]

['/home/jacruz/crisis/data/preprocessed_images/917791044158185473_0.npy',
 '/home/jacruz/crisis/data/preprocessed_images/917791130590183424_0.npy',
 '/home/jacruz/crisis/data/preprocessed_images/917791291823591425_0.npy',
 '/home/jacruz/crisis/data/preprocessed_images/917791291823591425_1.npy',
 '/home/jacruz/crisis/data/preprocessed_images/917792092100988929_0.npy']

In [118]:
image_path = image_paths[0]
if not os.path.exists(image_path):
                print(f"Archivo no encontrado: {image_path}")

In [119]:
def load_images_from_paths(image_paths, target_size=(224, 224)):
    """
    Carga imágenes desde rutas de archivo
    """
    images = []
    valid_indices = []
    
    print(f"Cargando {len(image_paths)} imágenes desde rutas...")
    
    for i, npy_path in enumerate(image_paths):
        try:
            # Verificar si el archivo existe
            if not os.path.exists(npy_path):
                print(f"Archivo no encontrado: {npy_path}")
                continue
                
            # Cargar array numpy
            img_array = np.load(npy_path)
            
            images.append(img_array)
            valid_indices.append(i)
            
        except Exception as e:
            print(f"Error cargando imagen {npy_path}: {e}")
            print(f"Forma del array: {img_array.shape}" if 'img_array' in locals() else "No se pudo cargar el array.")
            continue
            
        # Mostrar progreso cada 1000 imágenes
        if (i + 1) % 1000 == 0:
            print(f"Procesadas {i + 1}/{len(image_paths)} imágenes")
    
    print(f"Imágenes cargadas exitosamente: {len(images)}/{len(image_paths)}")
    
    return np.array(images), valid_indices

In [120]:

def load_and_prepare_data():
    """
    Carga los arrays numpy preprocesados (.npy) y prepara para entrenamiento
    """
    # Cargar los arrays
    X_paths = np.array(image_paths)
    y_labels = np.array(labels)
    
    # Verificar formas
    print(f"Número de rutas de imágenes: {len(X_paths)}")
    print(f"Número de etiquetas: {len(y_labels)}")

    # Cargar imágenes desde las rutas
    X_images, valid_indices = load_images_from_paths(X_paths)

    # Filtrar etiquetas para mantener solo las correspondientes a imágenes cargadas exitosamente
    y_labels_filtered = y_labels[valid_indices]
    
    # Mapeo manual de labels a números
    # Según el paper: 0 = Not-informative, 1 = Informative
    label_mapping = {'not_informative': 0, 'informative': 1}
    
    # Convertir etiquetas de texto a numéricas
    y_numeric = np.array([label_mapping[label] for label in y_labels_filtered])
    
    print(f"Distribución de etiquetas:")
    print(f"Non-informative (0): {np.sum(y_numeric == 0)}")
    print(f"Informative (1): {np.sum(y_numeric == 1)}")

    # Se asegura de tener el formato correcto
    X_images = X_images.astype('float32')

    y_categorical = to_categorical(y_numeric, num_classes=2)

    print(f"Rango de valores en X: min {X_images.min():.3f}, max {X_images.max():.3f}")

    print(f"Forma final de X: {X_images.shape}")
    print(f"Forma final de y: {y_categorical.shape}")
    return X_images, y_categorical

In [121]:
# Cargar dataset de imágenes
df_image = pd.read_csv('/home/jacruz/crisis/data/crisis_images_dataset.csv')
print(f"Imágenes cargadas: {len(df_image)}")

Imágenes cargadas: 18082


#### **Multimodal**

In [122]:
def align_multimodal_pairs_paper(df_text, df_image, image_base_path):
    """
    Alinea pares texto-imagen con misma etiqueta
    """
    # Asegurar que tenemos las columnas necesarias
    print("Alineando pares texto-imagen con misma etiqueta...")
    
    aligned_pairs = []
    
    # Contadores para debugging
    total_texts = len(df_text)
    matches_found = 0
    
    for idx, text_row in df_text.iterrows():
        tweet_id = text_row['tweet_id']
        text_label = text_row['text_info']  # 'informative' o 'not_informative'
        clean_text = text_row['clean_text']
        
        # Buscar imagen con mismo tweet_id y MISMA etiqueta
        matching_images = df_image[
            (df_image['tweet_id'] == tweet_id) & 
            (df_image['image_info'] == text_label)
        ]
        
        if len(matching_images) > 0:
            # Tomar la primera imagen que coincide
            image_row = matching_images.iloc[0]
            image_name = image_row['image_name']
            image_path = f"{image_base_path}/{image_name}.npy"
            
            # Verificar que el archivo de imagen existe
            if os.path.exists(image_path):
                aligned_pairs.append({
                    'tweet_id': tweet_id,
                    'text': clean_text,
                    'image_path': image_path,
                    'label': text_label,
                    'label_numeric': 1 if text_label == 'informative' else 0
                })
                matches_found += 1
        
        # Mostrar progreso
        if (idx + 1) % 1000 == 0:
            print(f"Procesados {idx + 1}/{total_texts} textos, encontrados {matches_found} pares")
    
    print(f"Pares alineados encontrados: {matches_found}/{total_texts}")
    
    # Convertir a DataFrame
    aligned_df = pd.DataFrame(aligned_pairs)
    
    # Mostrar distribución
    if len(aligned_df) > 0:
        informative_count = sum(aligned_df['label_numeric'] == 1)
        non_informative_count = sum(aligned_df['label_numeric'] == 0)
        
        print(f"Distribución de pares alineados:")
        print(f"  Informative: {informative_count} ({informative_count/len(aligned_df)*100:.1f}%)")
        print(f"  Non-informative: {non_informative_count} ({non_informative_count/len(aligned_df)*100:.1f}%)")
    else:
        print("⚠️  No se encontraron pares alineados. Revisa los datos.")
    
    return aligned_df

In [123]:
# Alinear pares
aligned_df = align_multimodal_pairs_paper(
    df_text, 
    df_image, 
    image_base_path="/home/jacruz/crisis/data/preprocessed_images"
)

Alineando pares texto-imagen con misma etiqueta...
Procesados 1000/16058 textos, encontrados 783 pares
Procesados 2000/16058 textos, encontrados 1547 pares
Procesados 3000/16058 textos, encontrados 2380 pares
Procesados 4000/16058 textos, encontrados 3080 pares
Procesados 5000/16058 textos, encontrados 3793 pares
Procesados 6000/16058 textos, encontrados 4499 pares
Procesados 7000/16058 textos, encontrados 5199 pares
Procesados 8000/16058 textos, encontrados 5829 pares
Procesados 9000/16058 textos, encontrados 6428 pares
Procesados 10000/16058 textos, encontrados 7042 pares
Procesados 11000/16058 textos, encontrados 7752 pares
Procesados 12000/16058 textos, encontrados 8443 pares
Procesados 13000/16058 textos, encontrados 9103 pares
Procesados 14000/16058 textos, encontrados 9750 pares
Procesados 15000/16058 textos, encontrados 10540 pares
Procesados 16000/16058 textos, encontrados 11354 pares
Pares alineados encontrados: 11398/16058
Distribución de pares alineados:
  Informative: 7624

In [124]:
aligned_df

Unnamed: 0,tweet_id,text,image_path,label,label_numeric
0,917791044158185473,gizmodo wildfires raging northern california t...,/home/jacruz/crisis/data/preprocessed_images/9...,informative,1
1,917791130590183424,photos deadly wildfires rage california,/home/jacruz/crisis/data/preprocessed_images/9...,informative,1
2,917791291823591425,cal_oes pls share capturing wildfire response ...,/home/jacruz/crisis/data/preprocessed_images/9...,informative,1
3,917792092100988929,time california raging wildfires never seen,/home/jacruz/crisis/data/preprocessed_images/9...,informative,1
4,917792147700465664,wildfires threaten californias first legal can...,/home/jacruz/crisis/data/preprocessed_images/9...,informative,1
...,...,...,...,...,...
11393,916027579882029056,sun earthquake model matches mexico,/home/jacruz/crisis/data/preprocessed_images/9...,informative,1
11394,916075589131436032,wave natural disasters strike mexico,/home/jacruz/crisis/data/preprocessed_images/9...,informative,1
11395,916099461444710400,podcast shines light volunteers rushed help me...,/home/jacruz/crisis/data/preprocessed_images/9...,informative,1
11396,916112796194021376,entercom san francisco stations raise funds me...,/home/jacruz/crisis/data/preprocessed_images/9...,informative,1


#### **Split de Datos**

In [125]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def create_multimodal_splits_paper(aligned_df, test_size=0.15, val_size=0.15, max_seq_len=50):
    """
    Crea splits 70-15-15
    """
    if len(aligned_df) == 0:
        raise ValueError("No hay datos alineados para crear splits")
    
    print("Creando splits...")
    
    # 1. Split estratificado (70% train, 30% temp)
    train_df, temp_df = train_test_split(
        aligned_df, 
        test_size=test_size + val_size, 
        random_state=42,
        stratify=aligned_df['label_numeric']
    )
    
    # 2. Split temp en validation y test (15% cada uno)
    val_df, test_df = train_test_split(
        temp_df,
        test_size=0.5,  # 15%/(15%+15%) = 0.5
        random_state=42,
        stratify=temp_df['label_numeric']
    )
    
    print(f"Split final (70-15-15):")
    print(f"  Train: {len(train_df)} samples")
    train_informative = sum(train_df['label_numeric'] == 1)
    train_non_informative = sum(train_df['label_numeric'] == 0)
    print(f"    Informative: {train_informative} ({train_informative/len(train_df)*100:.1f}%)")
    print(f"    Non-informative: {train_non_informative} ({train_non_informative/len(train_df)*100:.1f}%)")

    val_informative = sum(val_df['label_numeric'] == 1)
    val_non_informative = sum(val_df['label_numeric'] == 0)
    print(f"  Validation: {len(val_df)} samples")
    print(f"    Informative: {val_informative} ({val_informative/len(val_df)*100:.1f}%)")
    print(f"    Non-informative: {val_non_informative} ({val_non_informative/len(val_df)*100:.1f}%)")
    
    test_informative = sum(test_df['label_numeric'] == 1)
    test_non_informative = sum(test_df['label_numeric'] == 0)
    print(f"  Test: {len(test_df)} samples")
    print(f"    Informative: {test_informative} ({test_informative/len(test_df)*100:.1f}%)")
    print(f"    Non-informative: {test_non_informative} ({test_non_informative/len(test_df)*100:.1f}%)")
    
    # 3. Tokenización de textos
    print("Tokenizando textos...")
    tokenizer = Tokenizer(num_words=50000, oov_token="<OOV>")
    tokenizer.fit_on_texts(train_df['text'])
    
    # Convertir textos a secuencias
    X_text_train = tokenizer.texts_to_sequences(train_df['text'])
    X_text_val = tokenizer.texts_to_sequences(val_df['text'])
    X_text_test = tokenizer.texts_to_sequences(test_df['text'])
    
    # Padding (max_seq_len=50 como en tu implementación)
    X_text_train = pad_sequences(X_text_train, maxlen=max_seq_len, padding='post')
    X_text_val = pad_sequences(X_text_val, maxlen=max_seq_len, padding='post')
    X_text_test = pad_sequences(X_text_test, maxlen=max_seq_len, padding='post')
    
    # 4. Cargar imágenes
    print("Cargando imágenes...")
    X_image_train, train_img_indices = load_images_from_paths(train_df['image_path'].tolist())
    X_image_val, val_img_indices = load_images_from_paths(val_df['image_path'].tolist())
    X_image_test, test_img_indices = load_images_from_paths(test_df['image_path'].tolist())
    
    # 5. Filtrar textos para coincidir con imágenes cargadas exitosamente
    X_text_train = X_text_train[train_img_indices]
    X_text_val = X_text_val[val_img_indices]
    X_text_test = X_text_test[test_img_indices]
    
    # 6. Filtrar labels
    y_train = train_df['label_numeric'].values[train_img_indices]
    y_val = val_df['label_numeric'].values[val_img_indices]
    y_test = test_df['label_numeric'].values[test_img_indices]
    
    print(f"\nDatasets finales multimodales:")
    print(f"  X_text_train: {X_text_train.shape}, X_image_train: {X_image_train.shape}, y_train: {y_train.shape}")
    print(f"  X_text_val: {X_text_val.shape}, X_image_val: {X_image_val.shape}, y_val: {y_val.shape}")
    print(f"  X_text_test: {X_text_test.shape}, X_image_test: {X_image_test.shape}, y_test: {y_test.shape}")
    
    return (X_text_train, X_image_train, y_train,
            X_text_val, X_image_val, y_val, 
            X_text_test, X_image_test, y_test, tokenizer)


In [126]:
# Crear splits multimodales
(X_text_train, X_image_train, y_train,
 X_text_val, X_image_val, y_val,
 X_text_test, X_image_test, y_test, tokenizer) = create_multimodal_splits_paper(aligned_df)

Creando splits...
Split final (70-15-15):
  Train: 7978 samples
    Informative: 5336 (66.9%)
    Non-informative: 2642 (33.1%)
  Validation: 1710 samples
    Informative: 1144 (66.9%)
    Non-informative: 566 (33.1%)
  Test: 1710 samples
    Informative: 1144 (66.9%)
    Non-informative: 566 (33.1%)
Tokenizando textos...
Cargando imágenes...
Cargando 7978 imágenes desde rutas...
Procesadas 1000/7978 imágenes
Procesadas 2000/7978 imágenes
Procesadas 3000/7978 imágenes
Procesadas 4000/7978 imágenes
Procesadas 5000/7978 imágenes
Procesadas 6000/7978 imágenes
Procesadas 7000/7978 imágenes
Imágenes cargadas exitosamente: 7978/7978
Cargando 1710 imágenes desde rutas...
Procesadas 1000/1710 imágenes
Imágenes cargadas exitosamente: 1710/1710
Cargando 1710 imágenes desde rutas...
Procesadas 1000/1710 imágenes
Imágenes cargadas exitosamente: 1710/1710

Datasets finales multimodales:
  X_text_train: (7978, 50), X_image_train: (7978, 224, 224, 3), y_train: (7978,)
  X_text_val: (1710, 50), X_imag

### **Modelo Word2Vec**

In [None]:
# Jugando con Word2Vec
sentences = X_text_train['text'].tolist()  # Se pasa a lista de listas de tokens
w2v = Word2Vec(sentences, vector_size=300, window=5, min_count=5, workers=4)

vocab = list(w2v.wv.key_to_index.keys())

palabra_a_buscar = 'mexico'
print(f"El vocabulario tiene {len(vocab)} palabras")
print(f"Algunas palabras aleatorias del vocabulario: {sample(vocab, 10)}")

# Buscar palabras similares
w2v.wv.most_similar(palabra_a_buscar)
print(f"Palabras más similares a '{palabra_a_buscar}': {w2v.wv.most_similar(palabra_a_buscar)}")

# Similitud entre dos palabras
palabra1 = 'help'
palabra2 = 'support'
similarity = w2v.wv.similarity(palabra1, palabra2)
print(f"La similitud entre '{palabra1}' y '{palabra2}' es: {similarity}")

def create_embedding_matrix(word_index, w2v_model, embedding_dim=300):
    vocab_size = len(word_index) + 1
    # matriz de embedding inicializada en ceros
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in word_index.items():
        if word in w2v_model.wv:
            # si la palabra está en el vocabulario de word2vec
            embedding_matrix[i] = w2v_model.wv[word]
        else:
            # si la palabra no está, vector aleatorio
            embedding_matrix[i] = np.random.normal(0, 0.1, embedding_dim)
    return embedding_matrix

In [None]:
# Crear matriz de embedding
vocab_size = min(len(tokenizer.word_index) + 1, 50000)
embedding_matrix = create_embedding_matrix(tokenizer.word_index, w2v, embedding_dim=300)

Cargando embeddings GloVe...
Se cargaron 400000 vectores de palabras.
Palabras encontradas en GloVe: 10470/14333
Palabras no encontradas: 3862


### **Arquitectura Multimodal**

In [129]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import VGG16

def create_multimodal_paper_exact(vocab_size, embedding_matrix, embedding_dim=300, max_seq_len=50):
    """
    Implementación EXACTA del paper - entrenamiento desde cero
    """
    print("Creando arquitectura multimodal exacta del paper...")

    # Usar mixed precision para ahorrar memoria
    policy = tf.keras.mixed_precision.Policy('mixed_float16')
    tf.keras.mixed_precision.set_global_policy(policy)

    with tf.device('/GPU:0'):
        # ==================== RAMA TEXTO ====================
        text_input = layers.Input(shape=(max_seq_len,), name='text_input')
        
        # Embedding layer (igual que paper)
        x_text = layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            weights=[embedding_matrix],
            input_length=max_seq_len,
            trainable=False  # Como en el paper - embeddings congelados
        )(text_input)
        
        # CNN con múltiples filtros (EXACTO como paper)
        conv_blocks = []
        
        # Filter 1: 100 filters, size 2
        conv1 = layers.Conv1D(filters=100, kernel_size=2, activation='relu', padding='same')(x_text)
        pool1 = layers.GlobalMaxPooling1D()(conv1)
        conv_blocks.append(pool1)
        
        # Filter 2: 150 filters, size 3  
        conv2 = layers.Conv1D(filters=150, kernel_size=3, activation='relu', padding='same')(x_text)
        pool2 = layers.GlobalMaxPooling1D()(conv2)
        conv_blocks.append(pool2)
        
        # Filter 3: 200 filters, size 4
        conv3 = layers.Conv1D(filters=200, kernel_size=4, activation='relu', padding='same')(x_text)
        pool3 = layers.GlobalMaxPooling1D()(conv3)
        conv_blocks.append(pool3)
        
        # Concatenar salidas de convoluciones
        concat_text = layers.Concatenate()(conv_blocks)
        
        # Dropout (0.02 como en paper)
        x_text = layers.Dropout(0.02)(concat_text)
        
        # Capa densa de 1000 unidades antes de fusión
        text_features = layers.Dense(1000, activation='relu', name='text_features')(x_text)
        
        # ==================== RAMA IMAGEN ====================
        image_input = layers.Input(shape=(224, 224, 3), name='image_input')
        
        # VGG16 desde CERO (NO pre-entrenado) - como en paper
        base_model = VGG16(
            weights=None,  # ⚠️ IMPORTANTE: Random initialization
            include_top=False,
            input_shape=(224, 224, 3)
        )
        
        # Todas las capas entrenables (paper no menciona freezing)
        for layer in base_model.layers:
            layer.trainable = True
        
        x_image = base_model(image_input)
        x_image = layers.GlobalAveragePooling2D()(x_image)
        
        # Capa fc2 de 1000 unidades (penultimate layer)
        image_features = layers.Dense(1000, activation='relu', name='image_features')(x_image)
        
        # ==================== FUSIÓN MULTIMODAL ====================
        # Concatenación (Early Fusion)
        concatenated = layers.Concatenate(name='fusion')([text_features, image_features])
        
        # Capa oculta adicional después de fusión
        hidden = layers.Dense(512, activation='relu')(concatenated)
        hidden = layers.Dropout(0.02)(hidden)
        
        # Capa de salida final
        output = layers.Dense(2, activation='softmax', name='output')(hidden)
        
        # Modelo final
        model = models.Model(
            inputs=[text_input, image_input], 
            outputs=output,
            name='multimodal_paper_exact'
        )
    
    return model

# Crear modelo multimodal
multimodal_model = create_multimodal_paper_exact(
    vocab_size=vocab_size,
    embedding_matrix=embedding_matrix,
    max_seq_len=50
)

print("✅ Modelo multimodal creado (configuración exacta paper)")
multimodal_model.summary()

Creando arquitectura multimodal exacta del paper...
✅ Modelo multimodal creado (configuración exacta paper)




### **Entrenamiento**

In [133]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

def compile_and_train_multimodal_paper(model, X_text_train, X_image_train, y_train, X_text_val, X_image_val, y_val):
    """
    Compila y entrena el modelo EXACTO como paper
    """
    # Compilar (Adam con learning rate por defecto como paper)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy', 'precision', 'recall']
    )
    
    # Callbacks como en paper
    callbacks = [
        # Early stopping con patience=10 (página 5 del paper)
        EarlyStopping(
            monitor='val_accuracy',
            patience=10,
            restore_best_weights=True,
            verbose=1
        ),
        # Paper menciona reducir learning rate pero no especifica parámetros
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-7,
            verbose=1
        )
    ]
    
    # Entrenar (50 épocas máximo como paper)
    print("Iniciando entrenamiento multimodal según paper...")
    history = model.fit(
        x=[X_text_train, X_image_train],
        y=y_train,
        batch_size=8,  # Minibatch size del paper: 32
        epochs=50,      # Máximo según paper
        validation_data=([X_text_val, X_image_val], y_val),
        callbacks=callbacks,
        verbose=1
    )
    
    return history

# Entrenar modelo
history = compile_and_train_multimodal_paper(
    multimodal_model,
    X_text_train, X_image_train, y_train,
    X_text_val, X_image_val, y_val
)

Iniciando entrenamiento multimodal según paper...


2025-11-26 06:05:14.711577: E external/local_xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc:361] gpu_async_0 cuMemAllocAsync failed to allocate 4803649536 bytes: RESOURCE_EXHAUSTED: : CUDA_ERROR_OUT_OF_MEMORY: out of memory
 Reported by CUDA: Free memory/Total memory: 243007488/51041271808
2025-11-26 06:05:14.711604: E external/local_xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc:366] Stats: Limit:                       478937088
InUse:                       171518048
MaxInUse:                    269994416
NumAllocs:                        1269
MaxAllocSize:                 17199600
Reserved:                            0
PeakReserved:                        0
LargestFreeBlock:                    0

2025-11-26 06:05:14.711619: E external/local_xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc:70] Histogram of current allocation: (allocation_size_in_bytes, nb_allocation_of_that_sizes), ...;
2025-11-26 06:05:14.711622: E external/local_xla/xla/stream

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

ernal/local_xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc:106] CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: 469762048
2025-11-26 06:05:14.711676: E external/local_xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc:108] CU_MEMPOOL_ATTR_USED_MEM_CURRENT: 171518048
2025-11-26 06:05:14.711678: E external/local_xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc:109] CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: 704643072
2025-11-26 06:05:14.711680: E external/local_xla/xla/stream_executor/gpu/gpu_cudamallocasync_allocator.cc:110] CU_MEMPOOL_ATTR_USED_MEM_HIGH: 269994416
