# Exercise 5
## NLP with PyTorch 🔥

Use the PyTorch framework to solve the exercises below.


In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## 5.1 Predict rating of a movie using PyTorch

**Exercise:** Use the PyTorch framework to predict the rating of a movie.

In [2]:
# Download the NLTK resources this notebook needs.
# Recent NLTK releases (3.9+) load 'punkt_tab' for word_tokenize, while older
# releases load 'punkt'; fetching both keeps the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word lists; kept as the last expression so the cell still
# displays the download status.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load the training data: one row per movie, with its plot text and rating.
dataTraining = pd.read_csv(
    'https://github.com/sergiomora03/AdvancedTopicsAnalytics/raw/main/datasets/dataTraining.zip',
    index_col=0,
    encoding='UTF-8',
)
plots = dataTraining['plot']
# Binary target: 1 when the rating is at or above the mean rating, otherwise 0.
y = (dataTraining['rating'].mean() <= dataTraining['rating']).astype(int)

In [4]:
# Preview the raw plot texts.
plots

Unnamed: 0,plot
3107,most is the story of a single father who takes...
900,a serial killer decides to teach the secrets o...
6724,"in sweden , a female blackmailer with a disfi..."
4704,"in a friday afternoon in new york , the presi..."
2582,"in los angeles , the editor of a publishing h..."
...,...
8417,""" our marriage , their wedding . "" it ' s l..."
1592,"the wandering barbarian , conan , alongside ..."
1723,"like a tale spun by scheherazade , kismet fol..."
7605,"mrs . brisby , a widowed mouse , lives in a..."


In [5]:
# Preview the binary target derived from the ratings.
y

Unnamed: 0,rating
3107,1
900,0
6724,1
4704,1
2582,1
...,...
8417,0
1592,0
1723,0
7605,1


## Data Preprocessing

- Remove stopwords
- Lowercase
- Split the text into words
- Pad the sequences to a fixed length (`pad_sequences`)

In [6]:
# Text preprocessing
def preprocess_text(texts):
    """Turn raw texts into space-separated strings of filtered tokens.

    Each text is lowercased and tokenized with NLTK's ``word_tokenize``.
    Tokens that are not purely alphanumeric, or that appear in NLTK's English
    stop-word list, are dropped; the remaining tokens are joined with single
    spaces.

    Args:
        texts: Iterable of strings to clean.

    Returns:
        A list with one cleaned string per input text, in input order.
    """
    english_stops = set(stopwords.words('english'))

    def clean(raw):
        # Lowercase first, then tokenize and filter.
        kept = (
            token
            for token in word_tokenize(raw.lower())
            if token.isalnum() and token not in english_stops
        )
        return " ".join(kept)

    return [clean(raw) for raw in texts]

# Clean every plot; .values passes the Series' underlying array to the function.
plots_processed = preprocess_text(plots.values)

In [7]:
# Tokenization and padded sequences
# Fit the tokenizer on the cleaned plots, with its vocabulary capped via num_words=5000.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(plots_processed)
# Convert each cleaned plot into a list of integer word indices.
X = tokenizer.texts_to_sequences(plots_processed)
# Bring every sequence to length 200. No padding/truncating side is passed, so
# the Keras defaults apply — TODO confirm they are the intended ones.
X_padded = pad_sequences(X, maxlen=200)


In [8]:
# Split the data into training and test sets (20% held out for testing; random_state fixes the split)
X_train, X_test, y_train, y_test = train_test_split(X_padded, y.values, test_size=0.2, random_state=42)


In [9]:
# Convert the splits to tensors: token ids as int64 (what the embedding layer
# is fed), targets as float32 (what the loss is fed).
X_train_tensor, X_test_tensor = (
    torch.tensor(sequences, dtype=torch.long) for sequences in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(targets, dtype=torch.float32) for targets in (y_train, y_test)
)

In [10]:

# Pair features with targets, then serve both splits in mini-batches of 64.
train_data, test_data = (
    TensorDataset(features, targets)
    for features, targets in (
        (X_train_tensor, y_train_tensor),
        (X_test_tensor, y_test_tensor),
    )
)
# Shuffling is requested for the training loader only.
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=64)
test_loader = DataLoader(dataset=test_data, batch_size=64)

## Build Model

Create a neural network to predict the rating of a movie, and calculate its accuracy on the testing set.

In [11]:
# Define the model
class LSTMModel(nn.Module):
    """Embedding -> LSTM -> linear -> sigmoid model over token-id sequences.

    Args:
        vocab_size: Number of rows in the embedding table; every token id fed
            to the model must be smaller than this.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output units of the final linear layer.
        padding_idx: Optional token id reserved for padding. When given, it is
            forwarded to ``nn.Embedding`` so that id's embedding stays at zero
            and receives no gradient updates. Defaults to ``None``, which
            keeps the previous behaviour (no padding id).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, batch dimension first (the LSTM is
                built with ``batch_first=True``).

        Returns:
            Tensor with one row per sequence and ``output_dim`` columns, with
            values in [0, 1].
        """
        embedded = self.embedding(x)
        # Only the final hidden state is used; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-1] selects the last entry along the first axis of the
        # returned hidden state.
        dense_output = self.fc(hidden[-1])
        return self.sigmoid(dense_output)

In [12]:
# Model hyperparameters
vocab_size = 5000  # embedding rows; same value as the Tokenizer's num_words
embedding_dim = 128  # size of each token embedding
hidden_dim = 64  # size of the LSTM hidden state
output_dim = 1  # one output unit for the binary target

In [13]:
# Seed PyTorch's global RNG so that weight initialization (and anything that
# draws from the global RNG afterwards) is repeatable across runs; 42 mirrors
# the random_state used for the train/test split.
torch.manual_seed(42)

# Instantiate the model
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)

# Define the loss and the optimizer: binary cross-entropy on the model's
# sigmoid outputs, optimized with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [14]:
# Training function
def train_model(model, train_loader, criterion, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    After every epoch the mean per-batch loss is printed.

    Args:
        model: Module called as ``model(inputs)``; its output is expected to
            carry a trailing dimension of size 1, which is removed before the
            loss is computed.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the model's parameters.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            # Remove only the trailing singleton dimension. A bare squeeze()
            # would also remove the batch dimension when a batch holds a
            # single sample, leaving a 0-dim tensor whose shape no longer
            # matches the labels.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        # Report NaN rather than dividing by zero when the loader is empty.
        mean_loss = running_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

In [15]:
# Evaluation function
def evaluate_model(model, test_loader):
    """Compute, print and return the accuracy of ``model`` on ``test_loader``.

    An output above 0.5 counts as a prediction of class 1, otherwise class 0.

    Args:
        model: Module called as ``model(inputs)``; its output is expected to
            carry a trailing dimension of size 1.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        The fraction of correctly classified samples, or ``nan`` when the
        loader yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            # Drop only the trailing singleton dimension so a batch with a
            # single sample keeps its batch dimension.
            outputs = model(inputs).squeeze(-1)
            predicted = (outputs > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    # Avoid a ZeroDivisionError when nothing was evaluated.
    accuracy = correct / total if total else float("nan")
    print(f"Test Accuracy: {accuracy * 100:.2f}%")
    return accuracy

In [16]:
# Train the model for 5 epochs, then evaluate it on the test loader
train_model(model, train_loader, criterion, optimizer, epochs=5)
evaluate_model(model, test_loader)

Epoch 1, Loss: 0.6943368159159266
Epoch 2, Loss: 0.6691416399647491
Epoch 3, Loss: 0.6095208579843695
Epoch 4, Loss: 0.49915136411936595
Epoch 5, Loss: 0.3577427573577322
Test Accuracy: 57.88%
