In [1]:
import nltk

In [None]:
nltk.download('all')

In [2]:
import pandas as pd 
import numpy as np 
import torch 
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim

In [3]:
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter='\t')
dataset.head(5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [6]:
ps = PorterStemmer()

In [7]:
corpus = []
import re

for i in range(len(dataset)):
    new_row = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    new_row = new_row.lower()
    new_row = new_row.split()
    clean_row = [ps.stem(word) for word in new_row if word not in (stopwords.words('english'))]
    clean_row = ' '.join(clean_row)
    corpus.append(clean_row)




## Converting text into numeric vectors

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1500, min_df = 3, max_df = 0.6)
text_vectors = vectorizer.fit_transform(corpus).toarray()

In [9]:
y = dataset.iloc[:,1].values

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(text_vectors, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape,y_train.shape,y_test.shape

((800, 467), (200, 467), (800,), (200,))

In [11]:
input_size = 467
epochs = 1000
learning_rate = 0.01
batch_size =32
hidden_size = 500
output_size = 2

In [12]:
class Textclassifier(nn.Module):
    def __init__(self):
        super(Textclassifier, self).__init__()

        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        #print(x.shape)
        output = F.relu(self.layer1(x))
        #print(output.shape)
        output = F.relu(self.layer2(output))
        #print(output.shape)
        output = self.layer3(output)
        #print(output.shape)
        #print(F.log_softmax(output, dim=1).shape)
        return F.log_softmax(output, dim=1)
    
model = Textclassifier()
print(model)

Textclassifier(
  (layer1): Linear(in_features=467, out_features=500, bias=True)
  (layer2): Linear(in_features=500, out_features=500, bias=True)
  (layer3): Linear(in_features=500, out_features=2, bias=True)
)


In [13]:
sgd_optim = torch.optim.Adam(model.parameters(), lr = learning_rate)
criterion = nn.NLLLoss()

In [None]:
#X_train = torch.tensor(np.array(X_train), dtype = torch.float32, requires_grad=True)
#y_train = torch.tensor(np.array(y_train).reshape(-1,1), dtype = torch.float32)

In [14]:
X_train = torch.from_numpy(X_train).float()
X_test = torch.from_numpy(X_test).float()

y_train = torch.from_numpy(y_train)
y_test = torch.from_numpy(y_test)

In [15]:
X_train.shape, y_train.shape

(torch.Size([800, 467]), torch.Size([800]))

In [16]:
bce_loss_list = []
for epoch in range(epochs):

    sgd_optim.zero_grad()

    outputs = model(X_train)
    #print(outputs.shape, y_train.shape)
    bce_loss = criterion(outputs, y_train)

    bce_loss.backward()
    bce_loss_list.append(bce_loss)

    sgd_optim.step()

    print(f"Epoch : {epoch}, Loss : {bce_loss.item()}")

Epoch : 0, Loss : 0.6934576630592346
Epoch : 1, Loss : 0.6844948530197144
Epoch : 2, Loss : 0.5731980204582214
Epoch : 3, Loss : 0.3894008994102478
Epoch : 4, Loss : 0.24066786468029022
Epoch : 5, Loss : 0.17016109824180603
Epoch : 6, Loss : 0.11300583928823471
Epoch : 7, Loss : 0.08748533576726913
Epoch : 8, Loss : 0.06843941658735275
Epoch : 9, Loss : 0.05187882483005524
Epoch : 10, Loss : 0.048570141196250916
Epoch : 11, Loss : 0.04335484653711319
Epoch : 12, Loss : 0.034375496208667755
Epoch : 13, Loss : 0.029017005115747452
Epoch : 14, Loss : 0.026481665670871735
Epoch : 15, Loss : 0.026256414130330086
Epoch : 16, Loss : 0.0251470897346735
Epoch : 17, Loss : 0.021195316687226295
Epoch : 18, Loss : 0.021703390404582024
Epoch : 19, Loss : 0.023493222892284393
Epoch : 20, Loss : 0.023327112197875977
Epoch : 21, Loss : 0.021438205614686012
Epoch : 22, Loss : 0.0194769948720932
Epoch : 23, Loss : 0.021559065207839012
Epoch : 24, Loss : 0.021221280097961426
Epoch : 25, Loss : 0.01966155

In [17]:
sample = ["The movie is awfull"]

In [18]:
sample = vectorizer.transform(sample).toarray()

In [19]:

sentiment = model(torch.from_numpy(sample).float())
sentiment

tensor([[-0.2022, -1.6977]], grad_fn=<LogSoftmaxBackward0>)

In [20]:
sample2 = ["Good tasty and the texture was just great"]

In [21]:
sample2 = vectorizer.transform(sample2).toarray()

In [22]:

sentiment2 = model(torch.from_numpy(sample2).float())
sentiment2

tensor([[-18.2555,   0.0000]], grad_fn=<LogSoftmaxBackward0>)

In [23]:
model.state_dict()

OrderedDict([('layer1.weight',
              tensor([[ 0.1307,  0.0726,  0.0958,  ...,  0.1334,  0.0718,  0.0705],
                      [ 0.0877,  0.0685, -0.0980,  ..., -0.0843,  0.0227,  0.1508],
                      [-0.0370, -0.0433,  0.0319,  ..., -0.0438, -0.0215, -0.0413],
                      ...,
                      [ 0.0398, -0.0607, -0.0142,  ...,  0.0854,  0.0401, -0.1142],
                      [-0.0443,  0.0784, -0.0827,  ..., -0.1570, -0.0266, -0.0497],
                      [ 0.0946, -0.0495,  0.0760,  ...,  0.1187,  0.0729, -0.0185]])),
             ('layer1.bias',
              tensor([ 0.0321,  0.0084, -0.0586,  0.0158, -0.0265, -0.0128, -0.1272,  0.0265,
                      -0.0196, -0.0355, -0.0897, -0.0184, -0.0385,  0.0082, -0.0206,  0.0247,
                      -0.0219,  0.0161,  0.0150,  0.0304, -0.0136, -0.0058, -0.0171,  0.0002,
                       0.0345,  0.0480, -0.0575, -0.0248, -0.0059, -0.0212,  0.0206,  0.0161,
                       0.0228,