In [3]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from gensim.models import Word2Vec

In [4]:
# Load the dataset
df = pd.read_csv("../../data/cleaned/out.csv")

# Text preprocessing
def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

df['text'] = df['text'].apply(preprocess_text)
df['text'].head()

0     tiffanylue i know i was listenin to bad habit...
1    layin n bed with a headache ughhhh waitin on y...
2                      funeral ceremony gloomy friday 
3                 wants to hang out with friends soon 
4     dannycastillo we want to trade with someone w...
Name: text, dtype: object

In [5]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

df['text'] = df['text'].apply(remove_stopwords)
df['text'].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yonosoysantiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    tiffanylue know listenin bad habit earlier sta...
1              layin n bed headache ughhhh waitin call
2                       funeral ceremony gloomy friday
3                              wants hang friends soon
4    dannycastillo want trade someone houston ticke...
Name: text, dtype: object

In [6]:
tokenized_texts = [text.split() for text in df['text']]
word2vec = Word2Vec(tokenized_texts, vector_size=100, window=5, min_count=1)

In [7]:
def text_to_vector(text):
    words = text.split()
    word_vectors = [word2vec.wv[word] for word in words if word in word2vec.wv]
    if not word_vectors:
        return np.zeros(word2vec.vector_size).tolist()
    return np.mean(word_vectors, axis=0).tolist()


df['vector'] = df['text'].apply(text_to_vector)

In [8]:
encoder = OneHotEncoder(sparse=False)
encoded_labels = encoder.fit_transform(df['label'].values.reshape(-1, 1))




In [9]:
encoded_labels.shape

(65989, 7)

In [10]:
encoded_labels[0]

array([1., 0., 0., 0., 0., 0., 0.])

In [11]:
df.head()

Unnamed: 0,label,text,vector
0,0,tiffanylue know listenin bad habit earlier sta...,"[-0.13640040159225464, 0.4457484781742096, -0...."
1,0,layin n bed headache ughhhh waitin call,"[-0.3171183466911316, 0.411883145570755, -0.14..."
2,0,funeral ceremony gloomy friday,"[-0.24589645862579346, 0.1969228982925415, -0...."
3,1,wants hang friends soon,"[-0.43576765060424805, 0.8001293540000916, -0...."
4,6,dannycastillo want trade someone houston ticke...,"[-0.17103679478168488, 0.7238341569900513, -0...."


In [12]:
def convert_to_tensor(arg):
    arg = tf.convert_to_tensor(arg, dtype=tf.float32)
    return arg

In [13]:
X = np.array([np.array(vec) for vec in df['vector'].to_numpy()], dtype=np.float32)
y = encoded_labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
type(X_train[0])

numpy.ndarray

In [15]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


X_train shape: (52791, 100)
y_train shape: (52791, 7)


In [16]:
# Model parameters
input_dim = 100  # Word2Vec vector size
output_dim = 7   # Number of sentiment labels
hidden_units = 32  # LSTM hidden units

# Build the LSTM model
model = Sequential()
model.add(Embedding(input_dim=input_dim+1, output_dim=input_dim, input_length=input_dim))
model.add(LSTM(hidden_units))
model.add(Dense(output_dim, activation='softmax'))
model.summary()
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

2023-05-01 11:39:33.997089: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-01 11:39:33.997287: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-01 11:39:34.001095: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-01 11:39:34.001319: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-01 11:39:34.001472: I tensorflow/compiler/xla/stream_executo

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          10100     
                                                                 
 lstm (LSTM)                 (None, 32)                17024     
                                                                 
 dense (Dense)               (None, 7)                 231       
                                                                 
Total params: 27,355
Trainable params: 27,355
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10


2023-05-01 11:39:37.150613: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:433] Could not create cudnn handle: CUDNN_STATUS_INTERNAL_ERROR
2023-05-01 11:39:37.150657: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at cudnn_rnn_ops.cc:1554 : UNKNOWN: Fail to find the dnn implementation.


UnknownError: Graph execution error:

Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[sequential/lstm/PartitionedCall]] [Op:__inference_train_function_3308]