In [2]:
# Bibliotecas requeridas
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

In [4]:
# Leer los datos de ejemplos
reviews = pd.read_csv("tripadvisor_hotel_reviews.csv")
print(reviews.shape)
reviews.head()

(20491, 2)


Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


In [5]:
# Seleccionar las columnas relevantes y calcular la longitud de las oraciones
reviews = reviews[['Review', 'Rating']]
reviews.columns = ['review', 'rating']
reviews['review_length'] = reviews['review'].apply(lambda x: len(x.split()))
reviews.head(20)

Unnamed: 0,review,rating,review_length
0,nice hotel expensive parking got good deal sta...,4,87
1,ok nothing special charge diamond member hilto...,2,250
2,nice rooms not 4* experience hotel monaco seat...,3,217
3,"unique, great stay, wonderful time hotel monac...",5,89
4,"great stay great stay, went seahawk game aweso...",5,191
5,love monaco staff husband stayed hotel crazy w...,5,134
6,"cozy stay rainy city, husband spent 7 nights m...",5,101
7,"excellent staff, housekeeping quality hotel ch...",4,85
8,"hotel stayed hotel monaco cruise, rooms genero...",5,59
9,excellent stayed hotel monaco past w/e delight...,5,35


In [7]:
#Cambio de la numeración de la clasificaciones de 0 a 4
zero_numbering = {1:0, 2:1, 3:2, 4:3, 5:4}
reviews['rating'] = reviews['rating'].apply(lambda x: zero_numbering[x])

In [8]:
# Estadísticas
pd.set_option('display.float_format', lambda x: '%.2f' % x)
reviews.describe()

Unnamed: 0,rating,review_length
count,20491.0,20491.0
mean,2.95,104.38
std,1.23,100.66
min,0.0,7.0
25%,2.0,48.0
50%,3.0,77.0
75%,4.0,124.0
max,4.0,1931.0


In [10]:
# Tokenización: proceso de separar un fragmento de texto en 
#  unidades más pequeñas llamadas tokens. 
#  Los tokens pueden ser palabras, caracteres o sub-palabras.
tok = spacy.blank("en")

def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

# Se cuenta la cantidad de ocurrencias de cada token 
# en el corpus.

#count number of occurences of each word
counts = Counter()
for index, row in reviews.iterrows():
    counts.update(tokenize(row['review']))

# Se eliminan los tokens que no ocurren con mucha frecuencia (menos de dos veces). 
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))


num_words before: 49116
num_words after: 25185


In [12]:
# Se crea el vocabulario
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)



In [13]:
def encode_sentence(text, vocab2index, N=70):
    """
    Codificación de una oración antes de ser utilizada por el modelo. 
    Parámetros:
       text: el texto a procesar
       vocab2index: diccionario con el vocabulario a utilizar. 
       N: largo máximo
    """
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    
    # El get en diccionario permite definir un valor si un item no existe ("UNK").  
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    
    # Largo máximo del resultado.
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

reviews['encoded'] = reviews['review'].apply(lambda x: np.array(encode_sentence(x,vocab2index ), dtype=object))
print(reviews.head())

                                              review  rating  review_length  \
0  nice hotel expensive parking got good deal sta...       3             87   
1  ok nothing special charge diamond member hilto...       1            250   
2  nice rooms not 4* experience hotel monaco seat...       2            217   
3  unique, great stay, wonderful time hotel monac...       4             89   
4  great stay great stay, went seahawk game aweso...       4            191   

                                             encoded  
0  [[2, 3, 4, 5, 6, 7, 8, 9, 3, 10, 11, 12, 13, 1...  
1  [[79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 78, ...  
2  [[2, 234, 39, 77, 74, 3, 235, 90, 7, 3, 236, 2...  
3  [[347, 11, 69, 9, 11, 348, 349, 3, 235, 11, 68...  
4  [[69, 9, 69, 9, 11, 393, 394, 395, 396, 11, 39...  


In [14]:
# Verificación de qué tan bien balanceadas están las clases.
Counter(reviews['rating'])

Counter({3: 6039, 1: 1793, 2: 2184, 4: 9054, 0: 1421})

In [15]:
# Extracción de características y taget.
X = list(reviews['encoded'])
y = list(reviews['rating'])

# División de datos de entrenamiento y validación
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [16]:
# Definción de la clase Dataset para manejo de los datos
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [17]:
# Creación de los datasets de entrenamiento y validación
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)

In [18]:
class LSTM_fixed_len(torch.nn.Module) :
    """
    Clase para realizar la clasificación de las oraciones. 
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, tagset_size=5) :
        """
        Inicialización de la clase.
        Parámetros:
           embedding_dim: dimesionalidad del vector de palabras. 
           hidden_dim: dimensión de la capa oculta de la red. 
           vocab_size: tamaño del vocabulario.  
           tagset_size: número de clases.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, tagset_size)
        
        # Durante el entrenamiento, pone a cero aleatoriamente algunos de los elementos 
        # del tensor de entrada con probabilidad p utilizando muestras de una 
        # distribución de Bernoulli. Esta ha demostrado ser una técnica eficaz para la regularización.
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [19]:
def train_model(model, epochs=10, lr=0.001):
    """
    Entrenamiento del modelo utilizando PyTorch.
    """
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [20]:
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

model_fixed =  LSTM_fixed_len(vocab_size, 50, 50)
train_model(model_fixed, epochs=30, lr=0.01)

train loss 1.378, val loss 1.347, val accuracy 0.448, and val rmse 1.601
train loss 1.285, val loss 1.309, val accuracy 0.458, and val rmse 1.490
train loss 1.306, val loss 1.332, val accuracy 0.455, and val rmse 1.471
train loss 1.164, val loss 1.238, val accuracy 0.450, and val rmse 1.288
train loss 1.082, val loss 1.240, val accuracy 0.462, and val rmse 1.253
train loss 0.993, val loss 1.214, val accuracy 0.488, and val rmse 1.206


In [21]:
# Validación del modelo 
average_loss, accuracy, average_rmse = validation_metrics (model_fixed, val_dl)

print ("Exactitud", accuracy)
print("Raíz del error cuadrático medio", average_rmse)

Exactitud tensor(0.4967)
Raíz del error cuadrático medio 1.146502046730344
