In [106]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

In [107]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') is read further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mafit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [108]:
# Fetch the WordNet resources; a WordNetLemmatizer is instantiated in the next cell.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mafit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mafit\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [109]:
# Single shared lemmatizer instance, used by clean_text and clean_kw below.
lemmatizer = WordNetLemmatizer()

In [110]:
# Collapse elongated character runs (nooooo -> noo)
def reduce_repeated_chars(text):
    """Shorten every run of three or more identical characters to exactly two.

    Runs of one or two characters are left untouched, e.g. "nooooo" -> "noo"
    and "aab" -> "aab".
    """
    run_of_three_or_more = r'(.)\1{2,}'
    first_char_twice = r'\1\1'
    return re.sub(run_of_three_or_more, first_char_twice, text)

In [111]:
# Load the training data; the path is relative to the working directory.
ruta= "train.csv"
df= pd.read_csv(ruta)

In [112]:
# Drop 'id' and 'location'; only keyword, text and target are kept.
# NOTE: not idempotent - re-running this cell without reloading df raises KeyError.
df = df.drop(columns=['id','location'])

In [113]:
print("Información general:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() - doing so emits a stray "None" line.
df.info()

# Show the first rows
print("\nPrimeras filas:")
print(df.head())

# Count missing values per column
print("\nNulos por columna:")
print(df.isnull().sum())

# Show the class distribution of the target variable
print("\nDistribución de target:")
print(df['target'].value_counts())



Información general:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   keyword  7552 non-null   object
 1   text     7613 non-null   object
 2   target   7613 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 178.6+ KB
None

Primeras filas:
  keyword                                               text  target
0     NaN  Our Deeds are the Reason of this #earthquake M...       1
1     NaN             Forest fire near La Ronge Sask. Canada       1
2     NaN  All residents asked to 'shelter in place' are ...       1
3     NaN  13,000 people receive #wildfires evacuation or...       1
4     NaN  Just got sent this photo from Ruby #Alaska as ...       1

Nulos por columna:
keyword    61
text        0
target      0
dtype: int64

Distribución de target:
0    4342
1    3271
Name: target, dtype: int64


In [114]:
# Fill missing keywords with a placeholder value.
# ('location' needs no handling here: that column was dropped earlier.)
df['keyword'] = df['keyword'].fillna('unknown')

In [115]:
# Preprocess tweet text (general cleaning)
def clean_text(text):
    """Normalise a raw tweet into a space-separated string of lemmatised words.

    Steps, in order: lowercase, strip URLs, collapse elongated character runs,
    strip @mentions, drop '#' signs (the hashtag word itself is kept), keep
    only ASCII letters and whitespace, collapse whitespace, lemmatise each word.

    Args:
        text: Raw tweet text (str).

    Returns:
        The cleaned text as a single string; may be empty.
    """
    text = text.lower()
    # URLs must be removed BEFORE reduce_repeated_chars: that helper rewrites
    # "www" as "ww", after which the "www." alternative can no longer match and
    # the URL would leak into the text. The dot is escaped so that only a
    # literal "www." prefix is treated as a URL.
    text = re.sub(r'http\S+|www\.\S+', '', text)  # Remove URLs
    text = reduce_repeated_chars(text)  # Example: nooooo -> noo
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#', '', text)  # Remove the hashtag sign only
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Letters and whitespace only
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse extra whitespace

    # Lemmatisation (reduce each word to its dictionary form)
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())

# Apply the cleaning to the 'text' column (overwrites the raw text in place)
df['text'] = df['text'].apply(clean_text)

# Preprocess keyword (general cleaning)
def clean_kw(keyword):
    """Normalise a keyword into a space-separated string of lemmatised words.

    Keywords may contain URL-encoded spaces ("%20"). These are decoded first,
    because the letters-only filter further down would otherwise delete the
    '%' and the digits and glue the words together
    ("airplane%20accident" -> "airplaneaccident").

    Args:
        keyword: Raw keyword (str).

    Returns:
        The cleaned keyword as a single string; may be empty.
    """
    keyword = keyword.lower()
    # Decode URL-encoded spaces before any character filtering.
    keyword = keyword.replace('%20', ' ')
    # Strip URLs before collapsing repeated characters ("www" -> "ww" would
    # otherwise prevent the "www." alternative from matching).
    keyword = re.sub(r'http\S+|www\.\S+', '', keyword)
    keyword = reduce_repeated_chars(keyword)  # Example: goooooal -> gooal
    keyword = re.sub(r'@\w+', '', keyword)
    keyword = re.sub(r'#', '', keyword)
    keyword = re.sub(r'[^a-zA-Z\s]', '', keyword)
    keyword = re.sub(r'\s+', ' ', keyword).strip()

    # Lemmatise the keyword words as well
    return ' '.join(lemmatizer.lemmatize(word) for word in keyword.split())

# Apply the cleaning to the 'keyword' column (overwrites the raw keyword in place)
df['keyword'] = df['keyword'].apply(clean_kw)

In [116]:

# Custom stop words (NLTK English list minus disaster-related terms)
default_stopwords = set(stopwords.words('english'))
# Terms that must never be removed as stop words. This is a set literal, so the
# few repeated entries below are harmless.
# NOTE(review): presumably few or none of these terms appear in NLTK's English
# stop-word list, in which case the subtraction below changes nothing - verify.
disaster_terms = {
    'fire', 'flood', 'earthquake', 'hurricane', 'storm', 'tornado', 'tsunami', 
    'explosion', 'disaster', 'emergency', 'evacuate', 'help', 'rain', 'smoke', 
    'wind', 'eruption', 'ablaze', 'floods', 'flooding', 'quake', 'aftershock',
    'eruption', 'wildfire', 'landslide', 'mudslide', 'volcano', 'tremor',
    'typhoon', 'cyclone', 'avalanche', 'blizzard', 'heatwave', 'drought', 
    'sinkhole', 'hailstorm', 'sandstorm', 'firestorm', 'catastrophe', 
    'calamity', 'collapse', 'rescue', 'survivor', 'emergency', 'alert',
    'crisis', 'evacuation', 'shelter', 'stormsurge', 'firenado', 'flashflood',
    'aftershocks', 'earthquacke', 'hurricance', 'huricane', 'wildfires', 
    'landslides', 'firenados', 'cyclones', 'typhoons', 'tsunamis', 'avalanches',
    'disaters', 'eruption', 'erthquake', 'emergncy', 'emerjency', 'disasterous',
    'volcanic', 'epicenter', 'shockwave', 'tremblor', 'blackout', 'poweroutage',
    'hazmat', 'biohazard', 'collapse', 'derailment', 'hazard', 'sinkholes',
    'tsnuami', 'firefighters', 'firefighter', 'firstresponders', 'firstresponder',
    'searchandrescue', 'relief', 'aftermath', 'emergencyresponse', 
    'hazardous', 'catastrophic', 'prevention', 'preparedness', 'evacuated',
    'fatalities', 'casualties', 'displaced', 'sos', 'mayday', 'helpme', 'needhelp',
    'emergancy', 'volcan', 'smokecloud', 'ashfall', 'stormwatch', 'disasterrelief',
    'calamities', 'seismic', 'quakes', 'weatheralert', 'weatherwarning', 
    'damage', 'destroyed', 'devastation', 'wreckage', 'firenation', 
    'explosions', 'explosiv', 'earthquakes', 'floodwaters'
}

# Kept as a set: it is only used for membership tests, once per token of every
# tweet, and a set makes each test a hash lookup instead of a list scan.
custom_stopwords = default_stopwords - disaster_terms

def remove_custom_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in custom_stopwords.

    The remaining tokens keep their order and are re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in custom_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Strip stop words from the already cleaned and lemmatised text.
df['text'] = df['text'].apply(remove_custom_stopwords)

In [117]:
# Drop rows that have no label.
df = df.dropna(subset=['target'])

In [119]:
# Export the cleaned frame to a spreadsheet.
# NOTE(review): this cell is placed above the cell that creates 'full_text', so on
# a top-to-bottom run the exported file will not contain that column.
df.to_excel("prueba.xlsx")

In [118]:
# Combine keyword and text into a single field (for vectorisation or embeddings).
# 'location' is not part of it: that column was dropped earlier.
df['full_text'] = df['keyword'] + ' ' + df['text']

In [120]:
# Vectorise the text (TF-IDF)
# NOTE(review): the vectorizer is fitted on every row before the train/validation
# split further down, so validation rows contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=5000, min_df=2, max_df=0.95)
X = vectorizer.fit_transform(df['full_text'])

y = df['target']

In [121]:
# Clean the test set for later prediction
df_test = pd.read_csv('test.csv')

# The test set must go through exactly the same functions as the training set
# (clean_kw, clean_text, remove_custom_stopwords). clean_text is deliberately
# NOT redefined here: a second, simplified definition would shadow the original
# one and produce tokens that do not match the vocabulary the vectorizer learned.
df_test['keyword'] = df_test['keyword'].fillna('unknown').apply(clean_kw)

# 'location' is still normalised so the column stays usable for inspection, but
# it is not added to full_text because the training features do not include it.
df_test['location'] = df_test['location'].fillna('unknown')
df_test['location'] = df_test['location'].str.lower().str.strip()

df_test['text'] = df_test['text'].apply(clean_text).apply(remove_custom_stopwords)
df_test['full_text'] = df_test['keyword'] + ' ' + df_test['text']

# Vectorise with the vectorizer already fitted on the training data
X_test = vectorizer.transform(df_test['full_text'])

In [122]:
# Split the TF-IDF features into train and validation sets for the logistic regression
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [123]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [124]:
# Logistic regression model: fit on the train split, report metrics on the validation split
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_val)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       874
           1       0.80      0.71      0.75       649

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



In [125]:
# Data preparation for BERT
# NOTE: X, y, X_train, X_val, y_train and y_val are rebound here. From this cell on
# they hold raw text lists and label arrays, no longer the TF-IDF matrix and its splits.

X = df['full_text'].tolist()
y = df['target'].values

# Split train/val
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [128]:
# List the physical devices TensorFlow can see.
import tensorflow as tf
print("Dispositivos físicos:", tf.config.list_physical_devices())


Dispositivos físicos: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [127]:
import tensorflow as tf
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import AdamWeightDecay
from sklearn.metrics import classification_report
from tf_keras import layers, models

# Use the GPU if one is available
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth on every GPU, then make only the first one visible.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        print(f"✅ Usando GPU: {gpus[0].name}")
    except RuntimeError as e:
        # A RuntimeError raised by the configuration calls is printed, not re-raised.
        print(e)
else:
    print("⚠️ No se detectó GPU, corriendo en CPU.")
# Load the BERT tokenizer (swap the model name for another language or version if preferred)
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Tokenise texts for BERT (the model needs input_ids and attention_mask)
def encode_texts(texts, tokenizer, max_len=128):
    """Tokenise `texts` and return the two tensors fed to the model.

    Every sequence is truncated and padded to `max_len` tokens.

    Args:
        texts: Texts to tokenise, passed to `tokenizer` unchanged.
        tokenizer: Callable tokenizer that returns a mapping containing
            'input_ids' and 'attention_mask'.
        max_len: Fixed sequence length used for truncation and padding.

    Returns:
        dict with the keys 'input_ids' and 'attention_mask'.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=max_len,
        return_tensors='tf',
    )
    encoded = tokenizer(texts, **tokenizer_options)
    wanted_keys = ('input_ids', 'attention_mask')
    return {key: encoded[key] for key in wanted_keys}

# Tokenise the train and validation texts
X_train_enc = encode_texts(X_train, tokenizer)
X_val_enc = encode_texts(X_val, tokenizer)

# Convert the labels to tensors
y_train_enc = tf.convert_to_tensor(y_train, dtype=tf.int32)
y_val_enc = tf.convert_to_tensor(y_val, dtype=tf.int32)

# Load the pre-trained BERT model with a 2-label classification head (labels 0 and 1)
model = TFBertForSequenceClassification.from_pretrained(bert_model_name, num_labels=2)

# Define the optimizer (AdamWeightDecay from the transformers package)
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

# Compile with sparse categorical cross-entropy computed on the raw logits
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train_enc,
    y_train_enc,
    validation_data=(X_val_enc, y_val_enc),
    epochs=3,
    batch_size=16
)

# Predict on the validation set; the predicted class is the argmax over the logits
y_val_pred_logits = model.predict(X_val_enc).logits
y_val_pred = np.argmax(y_val_pred_logits, axis=1)

# Validation metrics
print(classification_report(y_val, y_val_pred))


⚠️ No se detectó GPU, corriendo en CPU.


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


KeyboardInterrupt: 

In [72]:
# Predict the test set with the logistic regression (uses the TF-IDF matrix X_test)
y_test_pred = logreg.predict(X_test)

# Save the predictions
df_test['target'] = y_test_pred
df_test[['id', 'target']].to_excel('predictions_logreg.xlsx', index=False)

print("Predicciones guardadas en predictions_logreg.xlsx")

Predicciones guardadas en predictions_logreg.xlsx


In [77]:
# Step 1: tokenise the test set
X_test_texts = df_test['full_text'].tolist()

# Tokenise with the same function and tokenizer used for training
X_test_enc = encode_texts(X_test_texts, tokenizer)

# Step 2: predict with the BERT model; the predicted class is the argmax over the logits
# NOTE: y_test_pred is rebound here and no longer holds the logistic-regression predictions.
y_test_pred_logits = model.predict(X_test_enc).logits
y_test_pred = np.argmax(y_test_pred_logits, axis=1)

# Step 3: store the predictions in the DataFrame
df_test['bert_pred'] = y_test_pred

# Show a preview of the results
print(df_test[['id', 'bert_pred']].head())

# (Optional) Save the predictions to a CSV file, renaming the column to 'target'
df_test[['id', 'bert_pred']].rename(columns={'bert_pred': 'target'}).to_csv('bert_predictions.csv', index=False)

print("Predicciones guardadas en 'bert_predictions.csv'")


   id  bert_pred
0   0          1
1   2          1
2   3          1
3   9          1
4  11          1
Predicciones guardadas en 'bert_predictions.csv'
