In [85]:
import nltk

In [None]:
nltk.download('all')

In [86]:
import pandas as pd 
import numpy as np 
import torch 
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim as optim

In [87]:
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter='\t')
dataset.head(5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [88]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [89]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [90]:
ps = PorterStemmer()

In [91]:
corpus = []
import re

for i in range(len(dataset)):
    new_row = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    new_row = new_row.lower()
    new_row = new_row.split()
    clean_row = [ps.stem(word) for word in new_row if word not in (stopwords.words('english'))]
    clean_row = ' '.join(clean_row)
    corpus.append(clean_row)




## Converting text into numeric vectors

In [187]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1500, min_df = 3, max_df = 0.6)
text_vectors = vectorizer.fit_transform(corpus).toarray()

In [188]:
y = dataset.iloc[:,1].values

In [216]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(text_vectors, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape,y_train.shape,y_test.shape

((800, 467), (200, 467), (800,), (200,))

In [217]:
input_size = 467
epochs = 1000
learning_rate = 0.01
batch_size =32
hidden_size = 500
output_size = 2

In [218]:
class Textclassifier(nn.Module):
    def __init__(self):
        super(Textclassifier, self).__init__()

        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        #print(x.shape)
        output = F.relu(self.layer1(x))
        #print(output.shape)
        output = F.relu(self.layer2(output))
        #print(output.shape)
        output = self.layer3(output)
        #print(output.shape)
        #print(F.log_softmax(output, dim=1).shape)
        return F.log_softmax(output, dim=1)
    
model = Textclassifier()
print(model)

Textclassifier(
  (layer1): Linear(in_features=467, out_features=500, bias=True)
  (layer2): Linear(in_features=500, out_features=500, bias=True)
  (layer3): Linear(in_features=500, out_features=2, bias=True)
)


In [219]:
sgd_optim = torch.optim.Adam(model.parameters(), lr = learning_rate)
criterion = nn.NLLLoss()

In [220]:
#X_train = torch.tensor(np.array(X_train), dtype = torch.float32, requires_grad=True)
#y_train = torch.tensor(np.array(y_train).reshape(-1,1), dtype = torch.float32)

In [221]:
X_train = torch.from_numpy(X_train).float()
X_test = torch.from_numpy(X_test).float()

y_train = torch.from_numpy(y_train)
y_test = torch.from_numpy(y_test)

In [222]:
X_train.shape, y_train.shape

(torch.Size([800, 467]), torch.Size([800]))

In [223]:
bce_loss_list = []
for epoch in range(epochs):

    sgd_optim.zero_grad()

    outputs = model(X_train)
    #print(outputs.shape, y_train.shape)
    bce_loss = criterion(outputs, y_train)

    bce_loss.backward()
    bce_loss_list.append(bce_loss)

    sgd_optim.step()

    print(f"Epoch : {epoch}, Loss : {bce_loss.item()}")

Epoch : 0, Loss : 0.6933846473693848
Epoch : 1, Loss : 0.6666982769966125
Epoch : 2, Loss : 0.5024487972259521
Epoch : 3, Loss : 0.3084805905818939
Epoch : 4, Loss : 0.19375364482402802
Epoch : 5, Loss : 0.15536852180957794
Epoch : 6, Loss : 0.11050688475370407
Epoch : 7, Loss : 0.07044658809900284
Epoch : 8, Loss : 0.06507411599159241
Epoch : 9, Loss : 0.054927147924900055
Epoch : 10, Loss : 0.0403677262365818
Epoch : 11, Loss : 0.03444969654083252
Epoch : 12, Loss : 0.03334742784500122
Epoch : 13, Loss : 0.03017677739262581
Epoch : 14, Loss : 0.027559921145439148
Epoch : 15, Loss : 0.024322323501110077
Epoch : 16, Loss : 0.024343283846974373
Epoch : 17, Loss : 0.02520822547376156
Epoch : 18, Loss : 0.02617214247584343
Epoch : 19, Loss : 0.024289477616548538
Epoch : 20, Loss : 0.02056623063981533
Epoch : 21, Loss : 0.0200934037566185
Epoch : 22, Loss : 0.023264706134796143
Epoch : 23, Loss : 0.022025292739272118
Epoch : 24, Loss : 0.019808899611234665
Epoch : 25, Loss : 0.020084721967

In [224]:
sample = ["The movie is awfull"]

In [225]:
sample = vectorizer.transform(sample).toarray()

In [226]:
torch.from_numpy(sample).float()

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [227]:

sentiment = model(torch.from_numpy(sample).float())
sentiment

tensor([[-0.2015, -1.7011]], grad_fn=<LogSoftmaxBackward0>)

In [249]:
sample2 = ["Good tasty and the texture was just great"]

In [250]:
sample2 = vectorizer.transform(sample2).toarray()

In [251]:

sentiment2 = model(torch.from_numpy(sample2).float())
sentiment2

tensor([[-24.1774,   0.0000]], grad_fn=<LogSoftmaxBackward0>)

In [180]:
model.state_dict()

OrderedDict([('layer1.weight',
              tensor([[-0.0140,  0.0438, -0.0215,  ..., -0.0223, -0.0427, -0.0273],
                      [ 0.0257,  0.0284, -0.0206,  ..., -0.0341, -0.0254,  0.0125],
                      [-0.0207, -0.0313,  0.0252,  ..., -0.0390, -0.0141,  0.0239],
                      ...,
                      [-0.0138, -0.0421,  0.0385,  ...,  0.0416,  0.0229,  0.0187],
                      [-0.0006,  0.0053,  0.0435,  ...,  0.0412, -0.0046, -0.0010],
                      [ 0.0065,  0.0186, -0.0217,  ..., -0.0166, -0.0119,  0.0348]])),
             ('layer1.bias',
              tensor([ 0.0447, -0.0213, -0.0443, -0.0424, -0.0018,  0.0274,  0.0059,  0.0445,
                       0.0135,  0.0178, -0.0021, -0.0185,  0.0354,  0.0260,  0.0241,  0.0336,
                      -0.0451,  0.0270,  0.0330,  0.0310, -0.0261,  0.0157,  0.0026, -0.0302,
                      -0.0195,  0.0439, -0.0316, -0.0039,  0.0402, -0.0097,  0.0191, -0.0244,
                       0.0250,