## **Multimodal: Text and Image using GloVe and VGG16**

In [None]:
import os

# Configure TensorFlow's GPU memory behaviour through environment variables
# (intended to lower GPU memory usage). This runs in the first cell, ahead of
# the cell that imports tensorflow.
os.environ.update({
    'TF_GPU_ALLOCATOR': 'cuda_malloc_async',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import random
from tensorflow.keras.utils import to_categorical
from PIL import Image
import numpy as np

### **Preprocesamiento**

#### **Text**
"We preprocess them by removing stop words, non-ASCII characters, numbers, URLs and hashtag signs. We also replace all punctuation marks with white spaces."

In [None]:
# Load the tweet text dataset from CSV into df_text and preview the first rows.
# NOTE(review): the path is absolute and user-specific; the notebook will not
# run on another machine without editing it.
df_text = pd.read_csv('/home/jacruz/crisis/data/crisis_texts_dataset.csv')
df_text.head()

Unnamed: 0,tweet_id,text_info,text_info_conf,tweet_text
0,917791044158185473,informative,1.0,RT @Gizmodo: Wildfires raging through Northern...
1,917791130590183424,informative,1.0,PHOTOS: Deadly wildfires rage in California ht...
2,917791291823591425,informative,0.6813,RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...
3,917792092100988929,informative,0.6727,RT @TIME: California's raging wildfires as you...
4,917792147700465664,informative,0.7143,Wildfires Threaten Californiaâ€™s First Legal ...


In [None]:
# Stop-word handling: NLTK's English stop-word list plus the retweet marker.
# The stop-word corpus is a separate NLTK download; on a fresh environment the
# lookup raises LookupError, so fetch the corpus on demand and retry instead of
# relying on a manually run (commented-out) download line.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
stop_words.update(['rt', 'RT'])

print(stop_words)


{'until', 's', 'those', "wasn't", 'will', 'which', 'haven', "i've", 'an', 'on', "shouldn't", "i'm", "won't", 'yourself', 'itself', 'aren', "he'll", "hadn't", 'didn', 'himself', 'doesn', 'here', "aren't", 'this', 'me', 'hadn', 'because', 'couldn', 'mustn', 'a', 'down', 'with', 'as', 'once', 'such', "weren't", "we'll", "we're", "haven't", 'ma', 'and', 'theirs', "we've", 'it', 'our', 'been', 'while', 'so', 'more', "needn't", 'or', "it'd", 'if', 'them', "doesn't", "didn't", 'from', 'but', 'wouldn', 'does', 'too', 'who', 'how', 'other', "you've", "mightn't", 'over', 'is', 'has', 'out', "hasn't", 'the', 'these', 'o', 'needn', 'again', 'don', 'before', 'just', "you're", 'are', 'for', 'i', 'during', 'm', 'y', 'shouldn', 'under', 'off', 'being', 'isn', 'there', 'won', 'each', 'ours', 'why', 'herself', 'myself', 'hers', "she's", 'my', 'all', 'nor', 'RT', 'only', 'into', 've', 'yours', 'now', 'him', 'further', 'his', 'hasn', "they've", 'of', 'by', 'be', "she'll", 'their', 'than', "they'd", 'throu

In [None]:
def clean_text(text, stopword_set=None):
    """
    Normalise a tweet for downstream text processing.

    Steps, in order: remove URLs, drop non-ASCII characters, remove digits,
    replace punctuation (including underscores) with spaces, lowercase, and
    drop stop words.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN from a missing CSV
            cell) are treated as empty text.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        str: The remaining lowercase tokens joined by single spaces
        (``''`` when nothing remains or the input is not a string).
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        stopword_set = stop_words

    # Remove URLs ("http\S+" already covers https links).
    text = re.sub(r'http\S+|www\S+', '', text)
    # Drop non-ASCII characters.
    text = text.encode('ascii', 'ignore').decode('ascii')
    # Remove digits.
    text = re.sub(r'\d+', '', text)
    # Replace punctuation with spaces. "_" counts as a word character for \w,
    # so it has to be matched explicitly or handles like "cal_oes" stay glued.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Lowercase each token once, then filter out stop words.
    tokens = (word.lower() for word in text.split())
    return ' '.join(word for word in tokens if word not in stopword_set)

In [None]:
# Run clean_text on every value of the 'tweet_text' column and store the result
# in a new 'clean_text' column (df_text is modified in place), then preview the
# first 100 rows to compare raw and cleaned text side by side.
df_text['clean_text'] = df_text['tweet_text'].apply(clean_text)
df_text.head(100)

Unnamed: 0,tweet_id,text_info,text_info_conf,tweet_text,clean_text
0,917791044158185473,informative,1.0000,RT @Gizmodo: Wildfires raging through Northern...,gizmodo wildfires raging northern california t...
1,917791130590183424,informative,1.0000,PHOTOS: Deadly wildfires rage in California ht...,photos deadly wildfires rage california
2,917791291823591425,informative,0.6813,RT @Cal_OES: PLS SHARE: Weâ€™re capturing wild...,cal_oes pls share capturing wildfire response ...
3,917792092100988929,informative,0.6727,RT @TIME: California's raging wildfires as you...,time california raging wildfires never seen
4,917792147700465664,informative,0.7143,Wildfires Threaten Californiaâ€™s First Legal ...,wildfires threaten californias first legal can...
...,...,...,...,...,...
95,917833987824979968,informative,0.6783,Satellite image of California last night. Thos...,satellite image california last night arent li...
96,917834588814155776,informative,0.6854,RT @AuroraWorldView: Deadly #California #wildf...,auroraworldview deadly california wildfires fo...
97,917834628290920448,informative,0.6667,Fire swept through Northern California early M...,fire swept northern california early monday mo...
98,917835067069788162,informative,1.0000,"Couple, ages 100 and 98, die in CaliforniaÂ wi...",couple ages die california wildfire


In [None]:
# Random sample of 10 cleaned texts for a quick visual check.
# A local, seeded generator keeps the sample identical across
# "Restart & Run All" without touching the global `random` state.
random.Random(42).sample(list(df_text['clean_text']), 10)

['houston hobby airport shutdown notice hurricaneharvey',
 'deck republikkklans rnc gop voted disasters harvey irma katrina thank democrat',
 'fire destroy couple married years die side side',
 'utilities delayed effort map power line wildfire risk facing california probe',
 'numbers explain hurricane maria devastating toll puerto rico',
 'come iga nhs selling baked goods support hurricane relief puerto rico',
 'relocate young kids pre disaster prep hysteria period house feed clothe clean safe harvey irma fema',
 'lets rebuild better smarter cuomo says thats puerto rico',
 'ianbremmer little different yesterday irma',
 'jennifer lopez personally affected hurricane maria']

#### **Image**
- Redimensionar y normalizar (224x224)
- Valores de los pixeles entre 0 y 1
- Normalizar los canales de color con respecto al dataset ImageNet

In [None]:
# The images were already preprocessed and saved as .npy files;
# what remains is to load them.
# NOTE(review): both paths are absolute and user-specific.
default_path = "/home/jacruz/crisis/data/preprocessed_images/"
df = pd.read_csv("/home/jacruz/crisis/data/crisis_images_dataset.csv")

# Image names and their labels as parallel lists, in the CSV's row order.
image_names = df['image_name'].tolist()
labels = df['image_info'].tolist()

In [None]:
# Build the file path of each preprocessed image: <default_path><image_name>.npy
image_paths = [default_path + name + '.npy' for name in image_names]
# Preview the first five paths.
image_paths[:5]

['/home/jacruz/crisis/data/preprocessed_images/917791044158185473_0.npy',
 '/home/jacruz/crisis/data/preprocessed_images/917791130590183424_0.npy',
 '/home/jacruz/crisis/data/preprocessed_images/917791291823591425_0.npy',
 '/home/jacruz/crisis/data/preprocessed_images/917791291823591425_1.npy',
 '/home/jacruz/crisis/data/preprocessed_images/917792092100988929_0.npy']

In [None]:
# Quick sanity check: report if the first image file is missing on disk.
image_path = image_paths[0]
if not os.path.exists(image_path):
    print(f"Archivo no encontrado: {image_path}")

In [None]:
def load_images_from_paths(image_paths, target_size=(224, 224)):
    """
    Load preprocessed image arrays from ``.npy`` files.

    Paths that do not exist, or whose file cannot be loaded, are reported on
    stdout and skipped, so the result may contain fewer items than
    ``image_paths``.

    Args:
        image_paths: Sequence of paths to ``.npy`` files.
        target_size: Currently unused; kept so existing callers keep working.
            Arrays are returned as stored on disk (no resizing is done here).

    Returns:
        tuple: ``(images, valid_indices)``. ``images`` is ``np.array`` built
        from the successfully loaded arrays; ``valid_indices`` lists the
        positions in ``image_paths`` those arrays came from, in the same
        order, so callers can filter parallel data such as labels.
    """
    images = []
    valid_indices = []
    total = len(image_paths)

    print(f"Cargando {total} imágenes desde rutas...")

    for i, npy_path in enumerate(image_paths):
        try:
            if not os.path.exists(npy_path):
                print(f"Archivo no encontrado: {npy_path}")
            else:
                images.append(np.load(npy_path))
                valid_indices.append(i)
        except Exception as e:
            # Best-effort loading: report the failure and move on. No array is
            # available here, so never print a shape (the previous version
            # could print the shape of the *previous* image, left over in the
            # local scope from an earlier iteration).
            print(f"Error cargando imagen {npy_path}: {e}")
            print("No se pudo cargar el array.")

        # Progress every 1000 paths, including skipped ones, so the counter
        # does not silently miss a milestone when that item failed.
        if (i + 1) % 1000 == 0:
            print(f"Procesadas {i + 1}/{total} imágenes")

    print(f"Imágenes cargadas exitosamente: {len(images)}/{total}")

    return np.array(images), valid_indices

In [None]:

def load_and_prepare_data():
    """
    Load the preprocessed image arrays (.npy) and prepare them for training.

    Reads the notebook-level ``image_paths`` and ``labels`` lists, loads the
    images with ``load_images_from_paths``, keeps only the labels of images
    that loaded, encodes the labels, and prints summary statistics on the way.

    Returns:
        tuple: ``(X_images, y_categorical)`` — the loaded images cast to
        float32, and the labels passed through ``to_categorical`` with two
        classes.
    """
    # Wrap the notebook-level lists as arrays so labels can be fancy-indexed.
    path_array = np.array(image_paths)
    label_array = np.array(labels)

    # Report input sizes.
    print(f"Número de rutas de imágenes: {len(path_array)}")
    print(f"Número de etiquetas: {len(label_array)}")

    # Load the images; kept_positions records which paths actually loaded.
    loaded_images, kept_positions = load_images_from_paths(path_array)

    # Keep only the labels that belong to successfully loaded images.
    kept_labels = label_array[kept_positions]

    # Manual label-to-integer mapping.
    # Per the paper: 0 = Not-informative, 1 = Informative.
    class_ids = {'not_informative': 0, 'informative': 1}
    encoded_labels = np.array([class_ids[name] for name in kept_labels])

    print("Distribución de etiquetas:")
    print(f"Non-informative (0): {np.sum(encoded_labels == 0)}")
    print(f"Informative (1): {np.sum(encoded_labels == 1)}")

    # Make sure the image data has the expected dtype.
    loaded_images = loaded_images.astype('float32')

    one_hot_labels = to_categorical(encoded_labels, num_classes=2)

    print(f"Rango de valores en X: min {loaded_images.min():.3f}, max {loaded_images.max():.3f}")

    print(f"Forma final de X: {loaded_images.shape}")
    print(f"Forma final de y: {one_hot_labels.shape}")
    return loaded_images, one_hot_labels