# Recurrent Neural Networks


In [1]:
import torch
import numpy as np
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Prefer a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu


## Extract the data

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# On Colab, point `path` at the shared drive instead:
# path = "/content/drive/Shareddrives/G5/project-4-sentiment-classification/"
path = "./"

# Read both CSV files from the same directory, training file first.
train_data, test_data = (pd.read_csv(path + name) for name in ("train.csv", "test.csv"))

for frame in (train_data, test_data):
    print(frame.shape)
train_data.head()

(25000, 2)
(25000, 1)


Unnamed: 0,message,label
0,I saw this movie in NEW York city. I was waiti...,neg
1,This is a German film from 1974 that is someth...,neg
2,I attempted watching this movie twice and even...,neg
3,On his birthday a small boys tells his mother ...,neg
4,"The person who wrote the review ""enough with t...",pos


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the preprocessing step below.
for resource in ('stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jeffr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# NLTK resources shared by every call to `preprocess_text`; filled on first use.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it once."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are not purely alphabetic or that appear in NLTK's English stop-word list
    are dropped, and the remaining tokens are lemmatised.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces (``''`` if none survive).
    """
    # The lemmatizer and stop-word set are built once and reused, instead of
    # being re-created (and the stop-word list re-read) for every row.
    lemmatizer, stop_words = _get_preprocess_resources()

    # The text is lower-cased before tokenising, so tokens need no second `.lower()`.
    tokens = word_tokenize(text.lower())
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_words
    )

In [6]:
# Add a cleaned `tokens` column to both frames, training frame first.
for frame in (train_data, test_data):
    frame['tokens'] = frame['message'].apply(preprocess_text)

In [13]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,message,label,tokens,vectors
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...,"[[-0.21996641, 0.17705941, -0.0033930233, -0.6..."
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...,"[[-0.46728015, 0.27693665, -0.29466736, -0.378..."
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...,"[[-0.12267326, -0.110086754, 0.07383823, 0.421..."
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...,"[[0.0073563126, 0.34156787, -0.5304753, -0.314..."
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...,"[[-0.0344274, 0.1957269, -0.32643205, 0.307864..."


In [7]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the label column, then overwrite that column with the
# integer codes it produces.
label_encoder = LabelEncoder().fit(train_data['label'])
train_data['label'] = label_encoder.transform(train_data['label'])

In [8]:
from gensim.models import Word2Vec

# One list of tokens per review.
sentences = train_data['tokens'].apply(lambda x: x.split()).to_list()
sentences_test = test_data['tokens'].apply(lambda x: x.split()).to_list()

# Skip-gram (sg=1) embeddings fitted on the training reviews only.
word2vec = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, sg=1)

# Reuse the training embeddings for the test reviews. A second Word2Vec model
# fitted on the test set would place words in an unrelated vector space, so a
# classifier trained on `word2vec` vectors could not interpret them. Test words
# missing from the training vocabulary fall back to the zero vector in
# `sentence_to_vectors`.
word2vec_test = word2vec

In [16]:
# Sanity-check the embeddings: show the five words most similar to "like".
word2vec.wv.most_similar("like", topn=5)

[('creepier', 0.7893841862678528),
 ('hmmmm', 0.7754266262054443),
 ('preteen', 0.7654963135719299),
 ('alot', 0.7647897005081177),
 ('anyways', 0.7632929086685181)]

In [9]:
def sentence_to_vectors(sentence, model, vector_size=100):
    """Map a whitespace-separated sentence to a matrix of word embeddings.

    Args:
        sentence: String of tokens separated by whitespace.
        model: Object exposing ``model.wv`` that supports ``word in model.wv``
            and ``model.wv[word]`` (e.g. a trained gensim ``Word2Vec``).
        vector_size: Width of one embedding; used for the zero vector given to
            out-of-vocabulary words and for the shape of an empty result.

    Returns:
        A ``float32`` array of shape ``(n_tokens, vector_size)``. A sentence
        with no tokens yields shape ``(0, vector_size)`` rather than ``(0,)``,
        so the result can always be stacked with padding rows.
    """
    # Single zero row reused for every unknown word; np.array below copies it.
    unknown = np.zeros(vector_size, dtype=np.float32)
    vectors = [
        model.wv[word] if word in model.wv else unknown
        for word in sentence.split()
    ]
    if not vectors:
        return np.zeros((0, vector_size), dtype=np.float32)
    return np.array(vectors, dtype=np.float32)

In [10]:
# Pair each frame with the embedding model it is encoded with, then store one
# embedding matrix per review in a `vectors` column.
for frame, embedding in ((train_data, word2vec), (test_data, word2vec_test)):
    frame['vectors'] = frame['tokens'].apply(
        lambda text, emb=embedding: sentence_to_vectors(text, emb)
    )

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF capped at 5000 features; fitted on the training
# tokens and then applied unchanged to the test tokens.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1, 2), lowercase=True)
tfidf_train = vectorizer.fit_transform(train_data['tokens'])
tfidf_test = vectorizer.transform(test_data['tokens'])

# Store one dense row per review.
train_data['tfidf'] = list(tfidf_train.toarray())
test_data['tfidf'] = list(tfidf_test.toarray())

In [28]:
# Number of tokens in each training review.
token_counts = train_data['tokens'].apply(lambda text: len(text.split()))

max_len = max(token_counts)
mean_len = int(round(np.mean(token_counts)))

print(f'Max length: {max_len}')
print(f'Mean length: {mean_len}')

Max length: 1421
Mean length: 119


In [None]:
def pad_sentences(vectors, max_len, vector_size=100):
    """Truncate or zero-pad an embedding matrix to exactly ``max_len`` rows.

    Args:
        vectors: Array-like of shape ``(n_tokens, vector_size)``. An empty
            input of any shape (for example ``np.array([])``) is treated as
            zero tokens.
        max_len: Number of rows in the result.
        vector_size: Width of the zero rows appended as padding.

    Returns:
        A ``torch.float32`` tensor with ``max_len`` rows: the first
        ``max_len`` rows of ``vectors`` when it is long enough, otherwise
        ``vectors`` followed by zero rows.
    """
    vectors = np.asarray(vectors, dtype=np.float32)
    if vectors.size == 0:
        # An empty review may arrive as shape (0,); give it the right width so
        # it can be stacked with the padding rows instead of raising.
        vectors = vectors.reshape(0, vector_size)

    if len(vectors) >= max_len:
        vectors = vectors[:max_len]
    else:
        padding = np.zeros((max_len - len(vectors), vector_size), dtype=np.float32)
        vectors = np.vstack([vectors, padding])
    return torch.tensor(vectors, dtype=torch.float32)

In [30]:
# Pad or truncate every review to `mean_len` rows, using the vector width of
# the embedding model that produced its vectors.
for frame, embedding in ((train_data, word2vec), (test_data, word2vec_test)):
    width = embedding.vector_size
    frame['tensor'] = frame['vectors'].apply(
        lambda matrix, w=width: pad_sentences(matrix, mean_len, w)
    )

In [113]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,message,label,tokens,vectors,tfidf,tensor
0,I saw this movie in NEW York city. I was waiti...,0,saw movie new york city waiting bus next morni...,"[[0.17053513, 0.044531778, -0.5490574, -0.4508...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[tensor(0.1705), tensor(0.0445), tensor(-0.54..."
1,This is a German film from 1974 that is someth...,0,german film something woman come castle beyond...,"[[-0.2837934, 0.1716008, -0.071153946, -0.3870...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[tensor(-0.2838), tensor(0.1716), tensor(-0.0..."
2,I attempted watching this movie twice and even...,0,attempted watching movie twice even fast forwa...,"[[-0.015660213, 0.0361225, 0.23330258, 0.34803...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[tensor(-0.0157), tensor(0.0361), tensor(0.23..."
3,On his birthday a small boys tells his mother ...,0,birthday small boy tell mother son want go hom...,"[[0.16865695, 0.36055747, -0.49575418, -0.3732...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[tensor(0.1687), tensor(0.3606), tensor(-0.49..."
4,"The person who wrote the review ""enough with t...",1,person wrote review enough sweating spitting a...,"[[-0.079325594, 0.27384898, 0.07582834, 0.3497...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[tensor(-0.0793), tensor(0.2738), tensor(0.07..."


In [36]:
# Stack the per-review tensors along a new leading axis and collect the labels.
X = torch.stack(list(train_data['tensor']))
y = torch.tensor(train_data['label'].to_numpy())

for tensor in (X, y):
    print(tensor.shape)

torch.Size([25000, 119, 100])
torch.Size([25000])


In [37]:
from torch.utils.data import DataLoader, TensorDataset, random_split

dataset = TensorDataset(X, y)

# 80% of the labelled examples for training, the remainder held out.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split so the same examples are held out on every run; without a
# generator `random_split` draws a different partition each time.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Number of batches per loader.
print(len(train_loader))
print(len(test_loader))

625
157


## Models

In [None]:
class LSTM(nn.Module):
    """LSTM encoder followed by a linear classification layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of output scores (one per class).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        # No softmax layer: `forward` returns raw scores, which is what
        # nn.CrossEntropyLoss expects (it applies log-softmax internally).

    def forward(self, x):
        """Return one row of ``output_size`` raw scores per sequence in ``x``."""
        _, (hidden, _) = self.lstm(x)
        # Keep only the final hidden state of the last stacked layer.
        last_hidden = hidden[-1]
        return self.fc(last_hidden)

# class LSTM(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size, num_layers=1):
#         super(LSTM, self).__init__()
#         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, output_size)

#     def forward(self, window):
#         _, (h_out, _) = self.lstm(window)
#         h_out = h_out.view(-1, self.hidden_size)
#         out = self.fc(h_out)
#         return out

In [48]:
class RNN(nn.Module):
    """Plain (Elman) RNN encoder followed by a linear classification layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
        output_size: Number of output scores (one per class).
        num_layers: Number of stacked recurrent layers. Defaults to 1, matching
            the ``LSTM`` class in this notebook.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one row of ``output_size`` raw scores per sequence in ``x``."""
        _, hidden = self.rnn(x)
        # Keep only the final hidden state of the last stacked layer.
        last_hidden = hidden[-1]
        return self.fc(last_hidden)

In [None]:
class GRU(nn.Module):
    """GRU encoder followed by a linear classification layer.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the GRU hidden state.
        output_size: Number of output scores (one per class).
        num_layers: Number of stacked GRU layers. Defaults to 1, matching the
            ``LSTM`` class in this notebook.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one row of ``output_size`` raw scores per sequence in ``x``."""
        _, hidden = self.gru(x)
        # Keep only the final hidden state of the last stacked layer.
        last_hidden = hidden[-1]
        return self.fc(last_hidden)

## Training Function
A single training loop shared by the LSTM, RNN, and GRU models: for each epoch it makes one pass over the training loader and prints the average batch loss.

In [39]:
def train(model, optimizer, loss_f, num_epochs, train_loader):
    """Train ``model`` in place and report the average loss of every epoch.

    Args:
        model: ``nn.Module`` to optimise.
        optimizer: Optimiser built over ``model.parameters()``.
        loss_f: Callable ``loss_f(outputs, labels)`` returning a scalar tensor.
        num_epochs: Number of full passes over ``train_loader``.
        train_loader: Sized iterable yielding ``(inputs, labels)`` batches.

    Returns:
        List with one float per epoch: the mean batch loss of that epoch.
    """
    # Batches are moved to wherever the model's weights live, so the loop also
    # works after `model.to(device)`. A model without parameters leaves the
    # batches where they are.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    losses = []

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for inputs, labels in train_loader:
            if model_device is not None:
                inputs = inputs.to(model_device)
                labels = labels.to(model_device)
            labels = labels.long()  # integer class indices for the loss

            outputs = model(inputs)
            loss = loss_f(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        avg_loss = epoch_loss / len(train_loader)
        losses.append(avg_loss)

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")

    # Hand the loss history back so callers can plot or compare runs.
    return losses

## Hyperparameters
Hyperparameters shared by all three models (LSTM, RNN, and GRU).

In [None]:
# Optimisation settings
learning_rate = 0.001
num_epochs = 10

# Network shape
input_size = 100  # word2vec.vector_size
hidden_size = 128
num_layers = 1
output_size = 2

## Create and train the models

### LSTM

In [41]:
# Build the LSTM classifier with the shared hyperparameters.
lstm = LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
)
loss_function_LSTM = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

train(
    model=lstm,
    optimizer=optimizer,
    loss_f=loss_function_LSTM,
    num_epochs=num_epochs,
    train_loader=train_loader,
)

Epoch 1/30, Loss: 0.67310631275177
Epoch 2/30, Loss: 0.6694829176902771
Epoch 3/30, Loss: 0.5659720916748047
Epoch 4/30, Loss: 0.347309819483757
Epoch 5/30, Loss: 0.3212477997779846
Epoch 6/30, Loss: 0.31245408944487574
Epoch 7/30, Loss: 0.30451204755306244
Epoch 8/30, Loss: 0.29581692311763763
Epoch 9/30, Loss: 0.29111148651838303
Epoch 10/30, Loss: 0.2821523575127125
Epoch 11/30, Loss: 0.277583460944891
Epoch 12/30, Loss: 0.2668392973065376
Epoch 13/30, Loss: 0.25670862380862236
Epoch 14/30, Loss: 0.2463035105586052
Epoch 15/30, Loss: 0.23160683302283286
Epoch 16/30, Loss: 0.2173728201806545
Epoch 17/30, Loss: 0.2012839603126049
Epoch 18/30, Loss: 0.1823293737858534
Epoch 19/30, Loss: 0.16659079930782317
Epoch 20/30, Loss: 0.14284013803899287
Epoch 21/30, Loss: 0.12361272724866867
Epoch 22/30, Loss: 0.10804368139356375
Epoch 23/30, Loss: 0.09039054093137383
Epoch 24/30, Loss: 0.07517291503101588
Epoch 25/30, Loss: 0.06458085372969508
Epoch 26/30, Loss: 0.055900429471954706
Epoch 27/3

In [52]:
def evaluate(model, test_loader):
    """Compute the classification accuracy of ``model`` over ``test_loader``.

    Args:
        model: ``nn.Module`` returning one row of class scores per example.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        Fraction of examples whose highest-scoring class equals the label.

    Raises:
        ValueError: If ``test_loader`` yields no examples.
    """
    # Batches are moved to wherever the model's weights live, so evaluation
    # also works after `model.to(device)`. A model without parameters leaves
    # the batches where they are.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            if model_device is not None:
                inputs = inputs.to(model_device)
                labels = labels.to(model_device)
            predicted = model(inputs).argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Accuracy is undefined without examples; say so instead of dividing by zero.
        raise ValueError("test_loader yielded no examples to evaluate")
    return correct / total

In [50]:
def grid_search(param_grid, model_cls=None):
    """Train and score one model per hyperparameter combination.

    Uses the notebook-level ``train_dataset`` and ``test_dataset`` splits and
    the ``train`` / ``evaluate`` helpers.

    Args:
        param_grid: Mapping with the keys ``'hidden_size'``, ``'num_layers'``,
            ``'learning_rate'``, ``'batch_size'`` and ``'num_epochs'``, each
            holding the candidate values to try.
        model_cls: Model class to search over, called as
            ``model_cls(input_size=100, hidden_size=..., output_size=2,
            num_layers=...)``. Defaults to ``LSTM`` when omitted.

    Returns:
        ``(best_accuracy, best_params)``: the highest accuracy measured on
        ``test_dataset`` and the combination that produced it. For an empty
        grid this is ``(0, {})``.
    """
    from itertools import product

    if model_cls is None:
        model_cls = LSTM

    best_accuracy = 0
    best_params = {}

    # Same visiting order as nesting one loop per key, outermost first.
    search_axes = ('hidden_size', 'num_layers', 'learning_rate', 'batch_size', 'num_epochs')
    combinations = product(*(param_grid[axis] for axis in search_axes))

    for hidden_size, num_layers, learning_rate, batch_size, num_epochs in combinations:
        # Loaders are rebuilt because the batch size is part of the search.
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # Fresh model, loss and optimiser for this combination.
        model = model_cls(input_size=100, hidden_size=hidden_size, output_size=2, num_layers=num_layers)
        loss_function_LSTM = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        print(f"Entrenando con hidden_size={hidden_size}, num_layers={num_layers}, "
              f"learning_rate={learning_rate}, batch_size={batch_size}, num_epochs={num_epochs}")
        train(model, optimizer, loss_function_LSTM, num_epochs, train_loader)

        accuracy = evaluate(model, test_loader)
        print(f"Accuracy con hidden_size={hidden_size}, num_layers={num_layers}, "
              f"learning_rate={learning_rate}, batch_size={batch_size}, num_epochs={num_epochs}: {accuracy * 100:.2f}%")

        # Keep the best combination so far. The first combination is always
        # recorded, so `best_params` is filled even if every accuracy is 0.
        if accuracy > best_accuracy or not best_params:
            best_accuracy = accuracy
            best_params = {
                'hidden_size': hidden_size,
                'num_layers': num_layers,
                'learning_rate': learning_rate,
                'batch_size': batch_size,
                'num_epochs': num_epochs
            }

    return best_accuracy, best_params

In [None]:
# Candidate values for every hyperparameter explored by the grid search.
param_grid = dict(
    hidden_size=[64, 128, 256],             # hidden-state sizes
    num_layers=[1, 2, 3],                   # number of stacked LSTM layers
    learning_rate=[0.001, 0.0005, 0.0001],  # learning rates
    batch_size=[32, 64, 128],               # batch sizes
    num_epochs=[10, 20],                    # training epochs
)

# Run the search over every combination.
best_accuracy, best_params = grid_search(param_grid)

# Report the winning configuration and its accuracy.
print(f"\nMejores parámetros: {best_params}")
print(f"Mejor Accuracy: {best_accuracy * 100:.2f}%")

Entrenando con hidden_size=64, num_layers=1, learning_rate=0.001, batch_size=32, num_epochs=10
Epoch 1/10, Loss: 0.6768471588134766
Epoch 2/10, Loss: 0.6823424056053161
Epoch 3/10, Loss: 0.6500057030677795
Epoch 4/10, Loss: 0.6567579681396485
Epoch 5/10, Loss: 0.6770723893642425
Epoch 6/10, Loss: 0.547683597946167
Epoch 7/10, Loss: 0.3490979295015335
Epoch 8/10, Loss: 0.3291783967256546
Epoch 9/10, Loss: 0.3170449034690857
Epoch 10/10, Loss: 0.3127932381272316
Accuracy con hidden_size=64, num_layers=1, learning_rate=0.001, batch_size=32, num_epochs=10: 86.82%
Entrenando con hidden_size=64, num_layers=1, learning_rate=0.001, batch_size=32, num_epochs=20
Epoch 1/20, Loss: 0.6727185891628266
Epoch 2/20, Loss: 0.6612150899410247
Epoch 3/20, Loss: 0.6191956439495087
Epoch 4/20, Loss: 0.6384363409042358
Epoch 5/20, Loss: 0.6589520169258117
Epoch 6/20, Loss: 0.642452643918991
Epoch 7/20, Loss: 0.5970067672729492
Epoch 8/20, Loss: 0.4220517461776733
Epoch 9/20, Loss: 0.3469949995517731
Epoch 1

### RNN

In [49]:
# Build the plain RNN classifier with the shared hyperparameters.
rnn = RNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
)
loss_function_RNN = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=learning_rate)

train(
    model=rnn,
    optimizer=optimizer,
    loss_f=loss_function_RNN,
    num_epochs=num_epochs,
    train_loader=train_loader,
)

Epoch 1/30, Loss: 0.6794653319835663
Epoch 2/30, Loss: 0.6824531795978546
Epoch 3/30, Loss: 0.6913336690902709
Epoch 4/30, Loss: 0.6706020780563354
Epoch 5/30, Loss: 0.6765170733451843
Epoch 6/30, Loss: 0.6700386498451233
Epoch 7/30, Loss: 0.6504225824832917
Epoch 8/30, Loss: 0.6611987741470337
Epoch 9/30, Loss: 0.6874834663391113
Epoch 10/30, Loss: 0.6852647385597229
Epoch 11/30, Loss: 0.6820053078651428
Epoch 12/30, Loss: 0.6822574971199036
Epoch 13/30, Loss: 0.6760899051666259
Epoch 14/30, Loss: 0.6828932273864746
Epoch 15/30, Loss: 0.6846454354286193
Epoch 16/30, Loss: 0.6752873534202576
Epoch 17/30, Loss: 0.6712932606697083
Epoch 18/30, Loss: 0.6513241497516632
Epoch 19/30, Loss: 0.6773011771202088
Epoch 20/30, Loss: 0.6637817116737366
Epoch 21/30, Loss: 0.6677394290924072
Epoch 22/30, Loss: 0.6562525920391082
Epoch 23/30, Loss: 0.6268605331897735
Epoch 24/30, Loss: 0.6468837797641754
Epoch 25/30, Loss: 0.5980380709171296
Epoch 26/30, Loss: 0.6667880938529969
Epoch 27/30, Loss: 0.

### GRU

In [None]:
# Build the GRU classifier with the shared hyperparameters.
gru = GRU(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
)
loss_function_GRU = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gru.parameters(), lr=learning_rate)

train(
    model=gru,
    optimizer=optimizer,
    loss_f=loss_function_GRU,
    num_epochs=num_epochs,
    train_loader=train_loader,
)