In [149]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

In [150]:
df = pd.read_excel('./sentiments_analysis.xlsx')

In [151]:
df.head()

Unnamed: 0,text,value
0,اما یک نکته منفی در مورد این گوشی، عدم وجود ی...,-1
1,* کیت تکامل هوشمند - Smart Evolution که به صور...,0
2,IOS :*محيط خشک و ايستا - مشتريان فقط استفاده ک...,-1
3,هر کي نخره ضرر کرده بازي هاي hd رو بدون کوچکتر...,1
4,سلام\nتلويزيون هاي پلاسما عمر کمتري نسبت به le...,-1


In [152]:
LABELS = df['value']
TEXT = df['text']
LABELS.unique()

array([-1,  0,  1,  2, -2], dtype=int64)

In [153]:
from string import punctuation

review = list()

for text in TEXT:
    # remove punctuations
    text = "".join([ch for ch in text if ch not in punctuation])
    review.append(text)

# whole corpus
all_text = " ".join(review)
# whole words
all_words = all_text.split()


# Creating Tokenzer

 Count all the words and sort it based on counts

In [154]:
len(all_words)
print(all_words[:2])

['اما', 'یک']


In [155]:
from collections import Counter
count_words = Counter(all_words)
total_words = len(all_words)

sorted_words = count_words.most_common()
print("Top ten occuring words : ",sorted_words[:10])


Top ten occuring words :  [('و', 821), ('که', 598), ('به', 589), ('از', 496), ('در', 487), ('با', 378), ('اين', 272), ('این', 247), ('است', 238), ('را', 217)]


In [156]:
# we start from 1, because 0 is for the words that is not in our corpus
vocab_to_int = {k[0]:v+1 for v, k in enumerate(sorted_words)}
vocab_to_int

{'و': 1,
 'که': 2,
 'به': 3,
 'از': 4,
 'در': 5,
 'با': 6,
 'اين': 7,
 'این': 8,
 'است': 9,
 'را': 10,
 'هم': 11,
 'گوشي': 12,
 'آن': 13,
 'رو': 14,
 'من': 15,
 'هاي': 16,
 'خيلي': 17,
 'تا': 18,
 'شده': 19,
 'استفاده': 20,
 'صفحه': 21,
 'بسیار': 22,
 'مي': 23,
 'نظر': 24,
 'نیز': 25,
 'تبلت': 26,
 'براي': 27,
 'برای': 28,
 'داره': 29,
 'دوربین': 30,
 'اما': 31,
 'کيفيت': 32,
 'بر': 33,
 'يک': 34,
 'دارد': 35,
 'دوربين': 36,
 'یک': 37,
 'ولي': 38,
 'کار': 39,
 'های': 40,
 'وجود': 41,
 'گوشی': 42,
 'هست': 43,
 'ها': 44,
 'دو': 45,
 'خود': 46,
 'مدل': 47,
 'بازي': 48,
 'بسيار': 49,
 'فقط': 50,
 'نسبت': 51,
 'دستگاه': 52,
 'نداره': 53,
 'کیفیت': 54,
 'بايد': 55,
 'واقعا': 56,
 'خوب': 57,
 'عالي': 58,
 'بود': 59,
 'شما': 60,
 'کردن': 61,
 'کم': 62,
 'نيست': 63,
 'سرعت': 64,
 'ي': 65,
 'قرار': 66,
 'می': 67,
 'يه': 68,
 'عامل': 69,
 'هر': 70,
 'خوبي': 71,
 'ميشه': 72,
 'روی': 73,
 'اي': 74,
 'تو': 75,
 'باشد': 76,
 'کرد': 77,
 'اگر': 78,
 'کردم': 79,
 'عکس': 80,
 'روي': 81,
 'بالا': 82,
 'س

Encode review in to list of Integer by using above dictionary

In [157]:
encoded_reviews = list()

for t in review:
    encoded_review = list()

    for word in t:
        if word not in vocab_to_int.keys():
            encoded_review.append(0)
        else:
            encoded_review.append(vocab_to_int[word])
    encoded_reviews.append(encoded_review)

make all the encoded_review of the same length

In [158]:
sequence_length = 250

features = np.zeros((len(encoded_reviews), sequence_length), dtype=int)

for i, review in enumerate(encoded_reviews):

    review_len = len(review)

    if review_len <= sequence_length:
        padd = list(np.zeros(sequence_length - review_len))

        new = padd + review
    else:
        new = review[:sequence_length]

    features[i, :] = new
    

        




In [159]:
labels = list(map(lambda x: x+2, LABELS))
print(labels[:10])


[1, 2, 1, 3, 1, 3, 1, 3, 3, 1]


In [160]:
#split_dataset into 80% training , 10% test and 10% Validation Dataset
train_x=features[:int(0.8*len(features))]
train_y=labels[:int(0.8*len(features))]
valid_x=features[int(0.8*len(features)):int(0.9*len(features))]
valid_y=labels[int(0.8*len(features)):int(0.9*len(features))]
test_x=features[int(0.9*len(features)):]
test_y=labels[int(0.9*len(features)):]
print(len(train_y), len(valid_y), len(test_y))

832 104 104


In [161]:

import torch
from torch.utils.data import DataLoader, TensorDataset

#create Tensor Dataset
train_data=TensorDataset(torch.LongTensor(train_x), torch.LongTensor(train_y))
valid_data=TensorDataset(torch.LongTensor(valid_x), torch.LongTensor(valid_y))
test_data=TensorDataset(torch.LongTensor(test_x), torch.LongTensor(test_y))
TensorDataset()
#dataloader
batch_size=50
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader=DataLoader(valid_data, batch_size=batch_size, shuffle=True)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [162]:
class RNN(nn.Module):

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):

        super().__init__()
        self.embdedding = nn.Embedding(input_dim, embedding_dim)

        self.rnn = nn.RNN(embedding_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, output_dim)


    def forward(self, text):
        #print('in forward------------')
        text = torch.permute(text,(1, 0))
        #print(text.shape)
        embedded = self.embdedding(text)
        #print(embedded.shape)
        output, hidden = self.rnn(embedded)
        #print(output.shape)
        out = self.fc(hidden.squeeze(0))
        #print(hidden.shape, out.shape)
        return out

        





In [163]:
INPUT_DIM = len(vocab_to_int) + 1
EMBEDDING_DIM = 400
HIDDEN_DIM = 256
OUTPUT_DIM = 5

net = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [186]:
# loss and optimization functions
lr=0.001

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

# check if CUDA is available
train_on_gpu = torch.cuda.is_available()

# training params

epochs = 15 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs=inputs.cuda()
            labels=labels.cuda()
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        # zero accumulated gradients
        net.zero_grad()
        #labels = labels.unsqueeze(1)
        # get the output from the model


        output =  net(inputs)
        #print(labels.shape, output.shape, inputs.shape)

        # calculate the loss and perform backprop
        loss = criterion(output, labels)
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats

        # Get validation loss

        val_losses = []
        net.eval()
    for inputs, labels in valid_loader:
        
        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        if torch.cuda.is_available():
            inputs, labels = inputs.cuda(), labels.cuda() 

        output = net(inputs)
        val_loss = criterion(output, labels)

        val_losses.append(val_loss.item())

    net.train()
    print("Epoch: {}/{}...".format(e+1, epochs),
            "Step: {}...".format(counter),
            "Loss: {:.6f}...".format(loss.item()),
            "Val Loss: {:.6f}".format(np.mean(val_losses)))

Epoch: 1/15... Step: 17... Loss: 1.326637... Val Loss: 1.545336
Epoch: 2/15... Step: 34... Loss: 1.345012... Val Loss: 1.383544
Epoch: 3/15... Step: 51... Loss: 1.448909... Val Loss: 1.461790
Epoch: 4/15... Step: 68... Loss: 1.392342... Val Loss: 1.379290
Epoch: 5/15... Step: 85... Loss: 1.485127... Val Loss: 1.442046
Epoch: 6/15... Step: 102... Loss: 1.409958... Val Loss: 1.569964
Epoch: 7/15... Step: 119... Loss: 1.524013... Val Loss: 1.444223
Epoch: 8/15... Step: 136... Loss: 1.262300... Val Loss: 1.527852
Epoch: 9/15... Step: 153... Loss: 1.307498... Val Loss: 1.488374
Epoch: 10/15... Step: 170... Loss: 1.219935... Val Loss: 1.437290
Epoch: 11/15... Step: 187... Loss: 1.372240... Val Loss: 1.450623
Epoch: 12/15... Step: 204... Loss: 1.152960... Val Loss: 1.413075
Epoch: 13/15... Step: 221... Loss: 1.481505... Val Loss: 1.402921
Epoch: 14/15... Step: 238... Loss: 1.372661... Val Loss: 1.605143
Epoch: 15/15... Step: 255... Loss: 1.430397... Val Loss: 1.457418


In [187]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for x, y in iterator:

            predictions = model(x)
            
            loss = criterion(predictions, y)
            

            epoch_loss += loss.item()

        
    return epoch_loss / len(iterator)

In [188]:
evaluate(net, test_loader, criterion)

1.5186590353647869

In [189]:
#text= 'اصلا فیلم خوبی نبود'
data = next(iter(test_loader))

result = net(data[0])
torch.argmax(result, dim=1)

tensor([4, 1, 2, 1, 4, 2, 3, 4, 4, 4, 4, 2, 1, 1, 3, 2, 1, 4, 1, 1, 1, 2, 1, 3,
        1, 1, 4, 4, 3, 1, 4, 2, 4, 1, 4, 2, 2, 2, 3, 1, 4, 4, 2, 3, 4, 1, 2, 1,
        3, 1])

In [190]:
data[1]

tensor([1, 1, 4, 1, 2, 3, 3, 3, 4, 1, 3, 2, 1, 4, 1, 1, 2, 2, 3, 1, 3, 3, 0, 4,
        3, 4, 1, 2, 1, 1, 0, 1, 3, 3, 3, 2, 3, 3, 4, 1, 4, 3, 4, 3, 2, 0, 4, 3,
        1, 0])