In [1]:
! pip install --upgrade pip
! pip install -r requirements.txt -q



In [2]:
from pathlib import Path
import re
import unicodedata

import contractions
from kaggle.api.kaggle_api_extended import KaggleApi
from keras.models import Sequential
from keras.layers import (
    Input,
    Dense,
    Dropout,
    LSTM,
    TextVectorization,
    Embedding,
    Bidirectional,
    GRU
)
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import numpy as np
import num2words
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from wordcloud import WordCloud
from ydata_profiling import ProfileReport

% matplotlib inline
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

2024-10-29 01:28:24.006858: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-29 01:28:24.010275: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-29 01:28:24.020355: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730183304.037392   59709 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730183304.042510   59709 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-29 01:28:24.059316: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [3]:
# Constants
MY_SEED = 42                                 # random_state for the train/val/test splits
TEXT_FIELD = "Tweet"                         # column holding the tweet text
SENTIMENT_FIELD = "sentiment"                # column holding the class label
DATA_PATH = Path("tweet-gpt/tweet_gpt.csv")  # CSV read by pd.read_csv

# English stop words, materialised once as a set for the preprocessing helpers.
STOPWORDS = set(stopwords.words("english"))

In [4]:
# Helper functions
def remove_stopwords(words, stop_words=None):
    """Remove stop words from a whitespace-separated string.

    The text is split on whitespace and every token that is a stop word is
    dropped. Unlike a ``' word '`` regex substitution, this also removes stop
    words at the very start or end of the string and consecutive repetitions
    of the same stop word.

    Args:
        words: Input text; tokens are separated by whitespace.
        stop_words: Optional collection of tokens to drop. Defaults to the
            module-level ``STOPWORDS`` set.

    Returns:
        The remaining tokens joined by single spaces (whitespace is
        normalised as a consequence of tokenising).
    """
    if stop_words is None:
        stop_words = STOPWORDS
    return " ".join(token for token in words.split() if token not in stop_words)

def stem_and_lemmatize(words, apply_stem=True, apply_lemmatize=True):
    """Lemmatize and/or stem every whitespace-separated token of a string.

    Lemmatization (WordNetLemmatizer) runs first, then stemming
    (PorterStemmer). Each tool is instantiated once per call rather than once
    per token.

    Args:
        words: Input text; tokens are separated by whitespace.
        apply_stem: When true, stem each token.
        apply_lemmatize: When true, lemmatize each token.

    Returns:
        The processed tokens joined by single spaces. When both flags are
        false the input string is returned unchanged.
    """
    if not (apply_stem or apply_lemmatize):
        # Nothing to do: hand the text back untouched (no whitespace changes).
        return words

    tokens = words.split()

    if apply_lemmatize:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]

    if apply_stem:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    return " ".join(tokens)


def preproccesing(words, apply_stem=True, apply_lemmatize=True):
    """Normalise one piece of text for the model.

    Steps, in order: lowercase; spell out digit runs with num2words; strip a
    fixed set of punctuation characters; NFKD-normalise and drop non-ASCII
    characters; remove stop words; lemmatize/stem; drop one-character tokens.

    Args:
        words: Raw text.
        apply_stem: Forwarded to ``stem_and_lemmatize``.
        apply_lemmatize: Forwarded to ``stem_and_lemmatize``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    def _spell_out(match):
        # Replace a run of digits with its written-out form.
        return num2words.num2words(int(match.group(0)))

    text = words.lower()
    text = re.sub(r"(\d+)", _spell_out, text)
    text = re.sub(r"[\"(),¡!¿?:;'>]", "", text)

    # Decompose accented characters, then discard whatever is not ASCII.
    ascii_bytes = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')
    text = ascii_bytes.decode('utf-8', 'ignore')

    text = remove_stopwords(text)
    text = stem_and_lemmatize(text, apply_stem, apply_lemmatize)

    kept_tokens = [token for token in text.split() if len(token) > 1]
    return " ".join(kept_tokens)

In [5]:
# Authenticate against Kaggle and download the dataset only when the CSV is
# not already on disk.
api = KaggleApi()
api.authenticate()
# `Path.exists` is a method and must be called: the bare attribute is a bound
# method object, which is always truthy, so `not DATA_PATH.exists` was always
# False and the download never ran.
if not DATA_PATH.exists():
    api.dataset_download_files(
        'evilspirit05/tweet-gpt',
        path=str(DATA_PATH.parent),  # same directory the CSV is read from
        unzip=True
    )

In [None]:
# Load the tweets, preview the first rows and show the class balance.
data = pd.read_csv(DATA_PATH)
display(data.head(3))
print()
print(data[SENTIMENT_FIELD].value_counts())
print()

# Stratified splits: 20% of all rows are held out as the test set, then 20% of
# the remaining rows become the validation set.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data[SENTIMENT_FIELD],
    random_state=MY_SEED,
    shuffle=True,
)
train, val = train_test_split(
    train,
    test_size=0.2,
    stratify=train[SENTIMENT_FIELD],
    random_state=MY_SEED,
    shuffle=True,
)
print("Tamaño de datos de entrenamiento:", train.shape)
print("Tamaño de datos de validación:", val.shape)
print("Tamaño de datos de prueba:", test.shape)
print()

# Separate the text column (features) from the sentiment column (labels).
X_train, y_train = train[TEXT_FIELD], train[SENTIMENT_FIELD]
X_val, y_val = val[TEXT_FIELD], val[SENTIMENT_FIELD]
X_test, y_test = test[TEXT_FIELD], test[SENTIMENT_FIELD]
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_val:", X_val.shape, "y_val:", y_val.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)
print()
display(X_train.head(3))
print()

In [None]:
# List the distinct sentiment labels found in the training split and print one
# example tweet per label (the first matching row of the full dataset).
unique_sentiments = y_train.unique()
print(unique_sentiments, "\n")

for sentiment in unique_sentiments:
    row = data.loc[data[SENTIMENT_FIELD] == sentiment].iloc[0]
    print("Sentiment:", sentiment)
    print("Sentence:", row[TEXT_FIELD])

In [8]:
# Apply contractions.fix and the preprocessing pipeline to every split and
# write the cleaned text back into the matching rows of `data`. The column is
# addressed through TEXT_FIELD, the same constant the splits were built from.
for split_text in (X_train, X_val, X_test):
    data.loc[split_text.index, TEXT_FIELD] = (
        split_text.apply(contractions.fix).apply(preproccesing)
    )

In [9]:
# Clean the training, validation and test text: expand contractions first,
# then run the preprocessing pipeline on each tweet.
# NOTE(review): this rebinds X_train/X_val/X_test to their cleaned versions,
# so running the cell a second time would preprocess already-cleaned text.
X_train, X_val, X_test = (
    split_text.apply(contractions.fix).apply(preproccesing)
    for split_text in (X_train, X_val, X_test)
)

In [None]:
# Generate an automatic exploratory report for the text column. The column is
# addressed through TEXT_FIELD instead of a repeated string literal.
profile = ProfileReport(data[[TEXT_FIELD]], title='Reporte EDA', html={'style':{'full_width':True}})
profile.to_notebook_iframe()

# Concatenate every tweet into one string and draw a word cloud from it.
text_data = ' '.join(data[TEXT_FIELD].astype(str))
print(f"Tamaño del texto para WordCloud: {len(text_data)}")
wordcloud = WordCloud(width=1600, height=800).generate(text_data)
plt.figure(figsize=(16, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Drop repeated tweets from the training text and keep the labels of the rows
# that survived (matched through the index).
# NOTE(review): X_train_clean / y_train_clean are only reported here; the
# encoder below and the cells visible further down still use X_train / y_train.
X_train_clean = X_train.drop_duplicates()
kept_rows = X_train.index.isin(X_train_clean.index)
y_train_clean = y_train[kept_rows]
print(f"Shape of X_train after removing duplicates: {X_train_clean.shape}")
print(f"Shape of y_train after removing duplicates: {y_train_clean.shape}")

# Learn the label -> integer mapping from the training labels only, then apply
# that same mapping to every split.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
y_test_encoded = label_encoder.transform(y_test)

# Show which integer code corresponds to which original label.
unique_labels = label_encoder.classes_
for valor_numerico, etiqueta_original in enumerate(unique_labels):
    print(f'Valor numérico: {valor_numerico}, Etiqueta original: {etiqueta_original}')

In [None]:
# Build the TextVectorization layer that turns text into integer token ids.
vectorizer_options = {
    "standardize": None,             # no additional normalisation
    "split": "whitespace",           # tokens are separated by whitespace
    "max_tokens": None,              # vocabulary size is not capped
    "output_mode": 'int',            # emit sequences of integer ids
    "output_sequence_length": None,  # no fixed output length is set
}
vectorize_layer = TextVectorization(**vectorizer_options)

# Learn the vocabulary from the training text only.
vectorize_layer.adapt(X_train)

In [None]:
# Apply vectorize_layer to the training, validation and test text.
X_train_vectorized = vectorize_layer(X_train)
X_val_vectorized = vectorize_layer(X_val)
X_test_vectorized = vectorize_layer(X_test)

# Show one example before and after tokenisation. The index is clamped to the
# last available row so the preview does not raise IndexError when the
# training split has fewer than 8011 rows.
row_index = min(8010, len(X_train) - 1)
print("Texto original en X_train:", X_train.iloc[row_index], end="\n")
print("Texto tokenizado en X_train_vectorized:", X_train_vectorized[row_index])

In [None]:
# Inspect the fitted vectorizer: the first 100 vocabulary entries and its
# full configuration.
vocabulary = vectorize_layer.get_vocabulary()
layer_config = vectorize_layer.get_config()
print("Vocabulario:", vocabulary[:100])
print("Configuración:", layer_config)

In [None]:
# Histogram of tweet lengths in the training split, measured in
# whitespace-separated tokens.
sequence_lengths = list(map(lambda text: len(text.split()), X_train))
plt.hist(sequence_lengths, bins=50)
plt.xlabel('Longitud de Secuencia')
plt.ylabel('Frecuencia')
plt.show()

In [None]:
# Training callbacks: early stopping with a patience of 4 epochs.
# (TensorBoard logging is currently disabled:
#  tf.keras.callbacks.TensorBoard(log_dir='/content/logs'))
early_stopping = tf.keras.callbacks.EarlyStopping(patience=4)
my_callbacks = [early_stopping]

In [None]:
# Text classifier: raw strings -> token ids -> embeddings -> LSTM -> dense head.
model = Sequential()

# The vectorizer is part of the model, so the model consumes raw text.
model.add(vectorize_layer)

# The embedding table must cover every id the vectorizer can emit. The
# vectorizer was built with max_tokens=None, so its vocabulary is not capped
# and a fixed size of 10000 could be exceeded; size the table from the adapted
# vocabulary instead (get_vocabulary() includes the reserved entries).
vocabulary_size = len(vectorize_layer.get_vocabulary())
model.add(Embedding(vocabulary_size, 300, name="Capa_Embedding"))

model.add(LSTM(300, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(1024, activation='relu', name='Capa_Oculta'))
model.add(Dropout(0.8))

# Three output units, one per sentiment class.
model.add(Dense(3, activation='softmax', name='Capa_Salida'))

# Sparse categorical cross-entropy expects integer class ids as targets.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model.
# - The inputs are the cleaned text as string tensors: the model's first layer
#   is `vectorize_layer`, so it must be fed text rather than token ids.
#   `X_train_tf` / `X_val_tf` were not defined in any earlier cell, so they are
#   created here.
# - The targets are the integer codes produced by `label_encoder`; the raw
#   string labels are not valid targets for sparse_categorical_crossentropy.
X_train_tf = tf.constant(X_train.tolist())
X_val_tf = tf.constant(X_val.tolist())

history = model.fit(
      X_train_tf, y_train_encoded,
      validation_data = (X_val_tf, y_val_encoded),
      epochs=20,
      callbacks=my_callbacks)