In [2]:
import pandas as pd
import numpy as np

import re

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Carga y limpieza de los datos

In [3]:
def load_data(
    url: str = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv",
) -> pd.DataFrame:
    """Load the tweet dataset into a DataFrame.

    Args:
        url: Location of the CSV file. Anything ``pandas.read_csv`` accepts
            works (HTTP(S) URL or local path). Defaults to the ``train.csv``
            file of the dD2405/Twitter_Sentiment_Analysis GitHub repository,
            so calling ``load_data()`` with no arguments behaves as before.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for ``url``.
    """
    return pd.read_csv(url)

In [4]:
def clean_text(text):
    """Remove Twitter-specific noise from a single tweet.

    The substitutions run in this order:
      1. drop @mentions,
      2. drop #hashtags,
      3. drop URLs (anything starting with ``http`` up to the next whitespace),
      4. drop every character that is not an ASCII letter, digit or whitespace,
      5. collapse each whitespace run into one space.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned tweet with leading and trailing whitespace removed.
    """
    # (pattern, replacement) pairs; order matters because later patterns
    # operate on the output of the earlier ones.
    substitutions = (
        (r'@\w+', ''),
        (r'#\w+', ''),
        (r'http\S+', ''),
        (r'[^A-Za-z0-9\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [5]:
# Fetch the raw dataset and preview its first rows.
df = load_data()
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [6]:
# Work on a copy so the raw frame `df` stays untouched and this cell can be
# re-run safely.
df_clean = df.copy()

# Normalise the tweet text in a single chain:
#   clean_text -> lowercase -> drop digits -> collapse whitespace -> strip.
# The final strip is required because the digits are removed *after*
# clean_text has already stripped the text; a tweet that starts or ends with a
# number would otherwise keep a leading/trailing space.
df_clean['tweet'] = (
    df_clean['tweet']
    .apply(clean_text)
    .str.lower()
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [7]:
# Preview the first rows of the cleaned frame to check the text normalisation.
df_clean.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so selfi...
1,2,0,thanks for credit i cant use cause they dont o...
2,3,0,bihday your majesty
3,4,0,i love u take with u all the time in ur
4,5,0,factsguide society now


In [None]:
import sys
import os

# Make the project's `src` directory importable. The membership check keeps
# sys.path from growing every time this cell is re-run.
_src_dir = os.path.abspath("../src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

import data_loader
import model_rnn
import evaluate
import utils  # provides clean_text, used below
from evaluate import evaluate_model

import numpy as np
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Hyperparameters, defined once so the tokenizer/padding settings and the
# model's embedding layer cannot drift apart.
vocab_size = 10000    # tokenizer vocabulary cap, also passed to build_rnn
embedding_dim = 64    # passed to build_rnn
input_length = 50     # padded sequence length, also passed to build_rnn

# Load data
df = data_loader.load_data()

# Clean text
df['tweet'] = df['tweet'].apply(utils.clean_text)

# Train/test partition, computed on row positions *before* tokenization so the
# tokenizer vocabulary can be built from training tweets only (no test-set
# leakage). With the same test_size and random_state this selects the same
# rows as splitting the padded matrix directly.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Tokenization: fit on the training tweets only, then encode every tweet.
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['tweet'].iloc[train_idx])

sequences = tokenizer.texts_to_sequences(df['tweet'])
padded = pad_sequences(sequences, maxlen=input_length, padding='post')

# Features and labels
X = padded
y = np.array(df['label'])

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Build and compile the model
model = model_rnn.build_rnn(vocab_size, embedding_dim, input_length)
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
model.summary()

# Train; Keras holds out the last 20% of the training data for validation.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=5,
    batch_size=32
)

# Persist the trained model, creating the target directory if it is missing
# so the save does not fail on a fresh checkout.
model_path = "../models/rnn_model.h5"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

# Evaluate on the held-out test set: threshold the predicted scores at 0.5.
y_pred = model.predict(X_test)
y_pred_labels = (y_pred > 0.5).astype("int32")

evaluate_model(y_test, y_pred_labels)
