In [27]:
# Manipulación de datos
import numpy as np
import pandas as pd
import re

# Machine Learning - Scikit-learn
from sklearn.model_selection import train_test_split

# TensorFlow / Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


# Carga y pre-procesamiento de los datos

In [5]:
def load_data():
    url = "https://raw.githubusercontent.com/dD2405/Twitter_Sentiment_Analysis/master/train.csv"
    df = pd.read_csv(url)
    return df

In [6]:
# Función para equilibrar el dataset
def balance_dataset(df):
    # Contar la cantidad de muestras por clase
    counts = df['label'].value_counts()
    
    # Encontrar la clase mayoritaria
    major_class = counts.idxmax()
    
    # Encontrar la cantidad de muestras de la clase mayoritaria
    major_count = counts.max()
    
    # Crear un nuevo dataframe vacío para almacenar las muestras equilibradas
    balanced_df = pd.DataFrame(columns=df.columns)
    
    # Iterar sobre cada clase y agregar muestras al nuevo dataframe
    for label, count in counts.items():
        if label == major_class:
            balanced_df = pd.concat([balanced_df, df[df['label'] == label]])
        else:
            # Calcular el número de muestras a agregar para equilibrar
            num_samples_to_add = major_count - count
            
            # Seleccionar aleatoriamente muestras de la clase minoritaria
            samples_to_add = df[df['label'] == label].sample(num_samples_to_add, replace=True)
            
            # Agregar las muestras al nuevo dataframe
            balanced_df = pd.concat([balanced_df, samples_to_add])
    
    return balanced_df.reset_index(drop=True)

In [7]:
def clean_text(text):
    text = re.sub(r'@\w+', '', text)  # eliminar menciones
    text = re.sub(r'#\w+', '', text)  # eliminar hashtags
    text = re.sub(r'http\S+', '', text)  # eliminar URLs
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # eliminar caracteres especiales
    text = re.sub(r'\s+', ' ', text)  # eliminar espacios múltiples
    return text.strip()

In [8]:
df = load_data()
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [9]:
df_clean = df.copy()
# Limpiar el tweet
df_clean['tweet'] = df_clean['tweet'].apply(clean_text)
df_clean['tweet'] = df_clean['tweet'].str.lower()  # convertir a minúsculas
df_clean['tweet'] = df_clean['tweet'].str.replace(r'\d+', '', regex=True)  # eliminar números
df_clean['tweet'] = df_clean['tweet'].str.replace(r'\s+', ' ', regex=True)  # eliminar espacios múltiples

In [10]:
df_clean.head()

Unnamed: 0,id,label,tweet
0,1,0,when a father is dysfunctional and is so selfi...
1,2,0,thanks for credit i cant use cause they dont o...
2,3,0,bihday your majesty
3,4,0,i love u take with u all the time in ur
4,5,0,factsguide society now


In [11]:
df_clean = balance_dataset(df_clean)

# Tokenización

In [28]:
# Tokenización
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df_clean['tweet'])

sequences = tokenizer.texts_to_sequences(df_clean['tweet'])
padded = pad_sequences(sequences, maxlen=50, padding='post')

# División del conjunto de datos: Train, test, validate

In [29]:
# Label a formato binario
df_clean['label'] = df_clean['label'].astype('int32')

In [30]:
# Datos y etiquetas
X = padded
y = np.array(df_clean['label'])

In [31]:
# Dividir entre train y temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Dividir X_temp entre validación y test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)


In [32]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape , X_val.shape, y_val.shape

((40038, 50), (8580, 50), (40038,), (8580,), (8580, 50), (8580,))

# Creación del modelo RNN

In [None]:
vocab_size = 10000
embedding_dim = 64
input_length = 50

In [34]:
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=input_length),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Para clasificación binaria
])



# Copilación y entrenamiento

In [35]:
y_train = y_train.reshape(-1, 1)

In [36]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=32,
)


Epoch 1/5
[1m 995/1252[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m3s[0m 12ms/step - accuracy: 0.5110 - loss: 0.6931

KeyboardInterrupt: 

# Evaluación

In [None]:
# Medición de la precisión en el conjunto de test
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy en test: {accuracy:.4f}')


[1m269/269[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9797 - loss: 0.1029
Accuracy en test: 0.9796


# Guardado del modelo

In [None]:
# Se guarda el modelo
model.save("../models/rnn_model.keras")