In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import string
import re
from collections import Counter

import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer

from tensorflow import keras
from keras.preprocessing.text import one_hot

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import torch
from torch import nn 
import torch.nn.functional as F
from torch.utils import data as d
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(0)
np.random.seed(0)

In [2]:
data = pd.read_csv('train.csv')
data.head(5)

Unnamed: 0,rating,reviewText,summary
0,3,Plot Storyline: 5 StarsThis novel accomplished...,3 1/4 Stars
1,3,I did not like how EL ended this one. I don't ...,"It was going great, then just.... ended"
2,5,I love how old fashioned this family is - they...,LOVED ALL 4!
3,5,I loved this story - It's about two friends wh...,friends make the best lovers
4,1,"In the Dark Lands, a virus killed all possibil...",Blatantly sexist and homophobic


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   rating      9000 non-null   int64 
 1   reviewText  9000 non-null   object
 2   summary     9000 non-null   object
dtypes: int64(1), object(2)
memory usage: 211.1+ KB


In [4]:
review = []

my_stopwords = set(nltk.corpus.stopwords.words('english'))
my_stopwords.remove('no')
my_stopwords.remove('not')
my_stopwords.remove('very')
my_stopwords.add('book')
my_stopwords.add('story')

lemmatizer = WordNetLemmatizer()

vocabulary = {}

review_size = []

def tokenize(text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return nltk.tokenize.word_tokenize(nopunct)


for i in range(len(data['rating'])):
    review.append(data['summary'][i] +' '+ data['reviewText'][i])

    
counts = Counter()

for i in range(len(review)):
    r = []
    tokens = tokenize(review[i])
    for t in tokens:
        if t not in my_stopwords and len(t) > 1:
            r.append(lemmatizer.lemmatize(t))
            
    review[i] = r
    counts.update(r)
    review_size.append(len(r))
    #print(review[i])
    
 
data['review'] = review
    


print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]


print("num_words after:",len(counts.keys()))

print(counts)

num_words before: 23727
num_words after: 14125


In [9]:
encode_index = {'UNK':0}
N = 500


most_freq = counts.most_common()


for i in range(len(list(counts))):
    encode_index[most_freq[i][0]] = i+1
    
encoded = []   
    
for r in data['review']:
    
    encoded_review = [0] * N
    for i in range(N):
        if i < len(r) and r[i] in encode_index:
            encoded_review[i] = encode_index[r[i]]
    
    encoded.append(encoded_review)

    
encoded_np = np.array(encoded)
    
data['encoded'] = encoded

encoded_np

array([[  56,   34,  259, ...,    0,    0,    0],
       [  70,   19,  359, ...,    0,    0,    0],
       [  59,    7,  123, ...,    0,    0,    0],
       ...,
       [ 230,  201,   58, ...,    0,    0,    0],
       [1453,    3, 2973, ...,    0,    0,    0],
       [ 786,  630,  274, ...,    0,    0,    0]])

In [10]:
X_train, X_valid, y_train, y_valid = train_test_split(encoded_np, data['rating'].values, test_size=0.2, random_state=42)

X_train.dtype

dtype('int32')

In [31]:
X_train = torch.tensor(X_train).to(torch.float64)
y_train = torch.tensor(y_train).to(torch.float64).view(-1,1) 

train_data = d.TensorDataset(X_train, y_train)


  X_train = torch.tensor(X_train).to(torch.int64)
  y_train = torch.tensor(y_train).to(torch.int64).view(-1,1)


In [51]:
batch_size = 8
vocab_size = len(counts)
train_dl = DataLoader(train_data, batch_size=batch_size, shuffle=True)


list(train_dl)[0]

[tensor([[ 192,   47, 3919,  ...,    0,    0,    0],
         [   5,  231,  198,  ...,    0,    0,    0],
         [ 258,    7,   20,  ...,    0,    0,    0],
         ...,
         [ 117,   51,   22,  ...,    0,    0,    0],
         [ 720, 1473,  200,  ...,    0,    0,    0],
         [ 163,  103,  260,  ...,    0,    0,    0]]),
 tensor([[5],
         [4],
         [5],
         [1],
         [5],
         [1],
         [5],
         [4]])]

In [52]:
class LSTM_fixed_len(nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

    
    
    
def train_model(model, epochs=30, lr=0.01):
    # ====== Loss and optimizer =========
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr) 


    # ====== Training =========
    for i in range(epochs):

        # since train_loader is an iterable object, we can use for loop. 
        # Note x_train, y_train contains only 8 sample as per batch_size = 8
        for (x_train, y_train) in train_dl:
            
            x_train = x_train.float()
            y_train = y_train.float()
            # zero the parameter gradients
            optimizer.zero_grad()

            # calulate output and loss 
            output = model(x_train)
            loss = loss_fn(output, y_train)

            # backprop and take a step
            print(loss.dtype)
            loss.backward()
            optimizer.step()

        if i % 10 == 0:
            # Note we need to deactivate training (not compute gradient) and move to validation phase
            model.eval()
            with torch.no_grad():
                output_val = model(X_val)
                loss_val = loss_fn(output_val, y_val)
            model.train() # after you predict on val set, you need to set back to training mode

            print('Epoch {}: {:.4f} (Train) {:.4f} (Val)'.format(i, loss, loss_val))

In [53]:
model_fixed =  LSTM_fixed_len(vocab_size, 50, 50)

In [54]:
train_model(model_fixed)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)