In [22]:
!pip install -q unidecode

In [23]:
import torch
import torch.nn as nn
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import unidecode

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [24]:
 nltk.download('stopwords')  # fetch the NLTK stop-word corpus read later via stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [26]:
# Fix the PyTorch RNG seed for reproducibility; `seed` is also passed as
# `random_state` to train_test_split further down.
seed = 1
torch.manual_seed(seed)

<torch._C.Generator at 0x7e37142cf2f0>

In [27]:
!pip install -q gdown
!gdown --id 1uYXI4O3oWBA6QC8ZJ-r6yaTTfkdAnl_Q
!unzip /content/dataset.zip

Downloading...
From: https://drive.google.com/uc?id=1uYXI4O3oWBA6QC8ZJ-r6yaTTfkdAnl_Q
To: /content/dataset.zip
100% 230k/230k [00:00<00:00, 4.01MB/s]
Archive:  /content/dataset.zip
replace dataset/all-data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: dataset/all-data.csv    


In [28]:
# Location of the extracted CSV (Colab-style absolute path).
data_path = '/content/dataset/all-data.csv'
# Column names are supplied through `names`, so the first CSV row is read as data.
headers = ['sentiment', 'content']

# The file is decoded as ISO-8859-1 (Latin-1).
df = pd.read_csv(data_path, names = headers, encoding = 'ISO-8859-1')
print(df)

     sentiment                                            content
0      neutral  According to Gran , the company has no plans t...
1      neutral  Technopolis plans to develop in stages an area...
2     negative  The international electronic industry company ...
3     positive  With the new production plant the company woul...
4     positive  According to the company 's updated strategy f...
...        ...                                                ...
4841  negative  LONDON MarketWatch -- Share prices ended lower...
4842   neutral  Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843  negative  Operating profit fell to EUR 35.4 mn from EUR ...
4844  negative  Net sales of the Paper segment decreased to EU...
4845  negative  Sales in Finland decreased by 10.5 % in Januar...

[4846 rows x 2 columns]


In [29]:
# Map every distinct sentiment string to an integer id, numbered in order of
# first appearance in the column.
classes = {
    class_name: idx
    for idx, class_name in enumerate(pd.unique(df['sentiment']))
}
print(classes)
# Replace the string labels with their integer ids.
df['sentiment'] = df['sentiment'].map(classes)
print(df)

{'neutral': 0, 'negative': 1, 'positive': 2}
      sentiment                                            content
0             0  According to Gran , the company has no plans t...
1             0  Technopolis plans to develop in stages an area...
2             1  The international electronic industry company ...
3             2  With the new production plant the company woul...
4             2  According to the company 's updated strategy f...
...         ...                                                ...
4841          1  LONDON MarketWatch -- Share prices ended lower...
4842          0  Rinkuskiai 's beer sales fell by 6.5 per cent ...
4843          1  Operating profit fell to EUR 35.4 mn from EUR ...
4844          1  Net sales of the Paper segment decreased to EU...
4845          1  Sales in Finland decreased by 10.5 % in Januar...

[4846 rows x 2 columns]


In [30]:
# preprocessing
# English stop-word list from NLTK and a Porter stemmer, both used by text_normalize.
english_stop_words = stopwords.words('english')
stemmer = PorterStemmer()

def text_normalize(text):
  """Normalize a raw sentence for tokenization.

  Steps, in order: lowercase, transliterate non-ASCII characters to ASCII,
  remove punctuation, drop English stop words, and Porter-stem each
  remaining word.

  Args:
    text: The raw input string.

  Returns:
    The normalized text: stemmed tokens joined by single spaces, with no
    leading or trailing whitespace (an empty string if nothing remains).
  """
  text = text.lower() # lowercase
  text = unidecode.unidecode(text) # transliterate accented / non-ASCII characters to ASCII
  text = re.sub(r'[^\w\s]', '', text) # punctuation removal: keep only word characters and whitespace

  # split() with no argument splits on any whitespace run and discards empty
  # strings. Removing punctuation such as " , " leaves doubled spaces, and
  # split(' ') would turn those into empty tokens that survive into the output.
  words = [word for word in text.split() if word not in english_stop_words] # stop-word removal
  return ' '.join(stemmer.stem(word) for word in words) # stemming (reduce inflected forms to a common stem)

In [31]:
#build vocab
# Collect every distinct whitespace-separated token in first-seen order.
# dict.fromkeys de-duplicates in O(1) per token while preserving insertion
# order, unlike a `token not in list` check, which rescans the whole list
# for every token.
# NOTE(review): the vocabulary is built from the raw `content` column of the
# full dataframe; no visible cell applies text_normalize first -- confirm
# this is intended.
vocab = list(dict.fromkeys(
    token
    for sequence in df['content'].tolist()
    for token in sequence.split()
))

# Special tokens: 'UNK' for out-of-vocabulary words, 'PAD' for padding.
vocab.append('UNK')
vocab.append('PAD')

# Token -> integer id lookup (id = position in `vocab`).
word_to_idx = {
    word: idx for idx, word in enumerate(vocab)
}

In [32]:
# convert text token to token id
def transform(text, word_to_idx, max_seq_len):
  """Encode a sentence as a fixed-length list of token ids.

  The text is split on whitespace. Each word is looked up in `word_to_idx`;
  words that are not present map to the id of 'UNK'. The result is truncated
  or right-padded with the id of 'PAD' so that it has exactly `max_seq_len`
  entries.

  Args:
    text: Input string.
    word_to_idx: Mapping from token to integer id. Must contain 'UNK' if any
      word is unknown, and 'PAD' if padding is needed.
    max_seq_len: Target length of the returned list.

  Returns:
    A list of integer token ids of length `max_seq_len`.

  Raises:
    KeyError: If 'UNK' or 'PAD' is required but missing from `word_to_idx`.
  """
  # An explicit membership test replaces the former bare `except:`, so only a
  # genuinely missing word falls back to 'UNK'; any other failure propagates.
  tokens = [
      word_to_idx[word] if word in word_to_idx else word_to_idx['UNK']
      for word in text.split()
  ]

  tokens = tokens[:max_seq_len]
  n_missing = max_seq_len - len(tokens)
  if n_missing > 0:
    tokens += [word_to_idx['PAD']] * n_missing
  return tokens

In [33]:
# Split fractions. `val_size` is taken from the full dataset; `test_size` is
# applied to the 80% that remains afterwards, i.e. 0.125 * 0.8 = 10% of all
# rows, leaving 70% for training.
val_size = 0.2
test_size = 0.125
is_shuffle = True

texts = df['content'].to_list()
labels = df['sentiment'].to_list()

# First split: hold out the validation set.
x_train, x_val, y_train, y_val = train_test_split(texts, labels,
                                                  test_size = val_size,
                                                  shuffle = is_shuffle,
                                                  random_state= seed)
# Second split: carve the test set out of the remaining training portion.
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train,
                                                    test_size = test_size,
                                                    shuffle = is_shuffle,
                                                    random_state= seed)

In [34]:
#build dataset
class SentimentDataset(Dataset):
  """Map-style dataset pairing texts with their labels.

  Args:
    x: Indexable collection of samples (texts).
    y: Indexable collection of labels, aligned with `x`.
    word_to_idx: Token -> id mapping handed to `transform`.
    max_len_seq: Sequence length handed to `transform`.
    transform: Optional callable `(text, word_to_idx, max_len_seq)` whose
      result is converted with `torch.tensor`. When falsy, the stored sample
      itself is passed to `torch.tensor`.
  """

  def __init__(self, x, y, word_to_idx, max_len_seq, transform = None):
    super().__init__()
    self.texts, self.labels = x, y
    self.word_to_idx = word_to_idx
    self.max_len_seq = max_len_seq
    self.transform = transform

  def __len__(self):
    """Number of samples, taken from the label collection."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return `(tensor, label)` for the sample at position `idx`."""
    sample, target = self.texts[idx], self.labels[idx]

    encoded = (
        self.transform(sample, self.word_to_idx, self.max_len_seq)
        if self.transform
        else sample
    )
    return torch.tensor(encoded), target

In [35]:
#dataloader
max_len_seq = 32          # every sample is padded/truncated to this many token ids
train_batch_size = 128
test_batch_size = 8       # used for both the validation and the test loader

# One dataset per split, all sharing the same vocabulary and encoder.
train_dataset, val_dataset, test_dataset = [
    SentimentDataset(split_x, split_y, word_to_idx, max_len_seq, transform)
    for split_x, split_y in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
]

# Only the training loader shuffles; evaluation loaders keep the default
# sequential order. No loader drops the final partial batch (the default).
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=test_batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size)

In [36]:
class Sentiment_RNN(nn.Module):
  """Embedding -> multi-layer RNN -> LayerNorm -> Dropout -> 2-layer MLP classifier.

  Args:
    vocab_size: Number of rows in the embedding table.
    embedding_dim: Size of each token embedding (RNN input size).
    hidden_size: RNN hidden size.
    n_layers: Number of stacked RNN layers.
    n_classes: Number of output logits.
    dropout_prob: Dropout probability applied before the classifier head.
  """

  def __init__(self, vocab_size, embedding_dim,
               hidden_size, n_layers,
               n_classes, dropout_prob):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    # batch_first=True: the RNN consumes and produces (batch, seq, feature) tensors.
    self.rnn = nn.RNN(embedding_dim, hidden_size,
                      num_layers = n_layers, batch_first = True)
    self.norm = nn.LayerNorm(hidden_size)
    self.dropout = nn.Dropout(dropout_prob)
    # Classifier head: hidden_size -> 16 -> n_classes.
    self.fc1 = nn.Linear(hidden_size, 16)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(16, n_classes)

  def forward(self, x):
    """Return class logits of shape (batch, n_classes) for token ids `x` of shape (batch, seq)."""
    embedded = self.embedding(x)
    # seq_out holds the top layer's output at every time step: (batch, seq, hidden).
    # The per-layer final hidden states (second return value) are not used.
    seq_out, _ = self.rnn(embedded)
    # Keep only the last time step of the sequence axis.
    # NOTE(review): if inputs are right-padded, this position may be a padding
    # token rather than the last real word -- confirm against the encoder.
    last_step = seq_out[:, -1, :]

    features = self.dropout(self.norm(last_step))
    return self.fc2(self.relu(self.fc1(features)))

In [37]:
# Model hyperparameters.
vocab_size = len(vocab)
embedding_dim = 64
hidden_size = 64
n_layers = 2
n_classes = len(classes)   # one logit per sentiment class
dropout_prob = 0.2

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'

#model
model = Sentiment_RNN(
  vocab_size, embedding_dim, hidden_size,
  n_layers, n_classes, dropout_prob,
)
model = model.to(device)

#setup loss func and optimizer
lr = 1e-4
epochs = 50

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = lr)

In [53]:
def train():
  """Train the global `model` for `epochs` epochs, validating after each one.

  Uses these notebook globals: `model`, `criterion`, `optimizer`, `device`,
  `epochs`, `train_dataloader` and `val_dataloader`. After every epoch it
  prints the mean train/validation batch loss and a scikit-learn
  classification report for the validation set.

  Returns:
    A tuple `(train_losses, val_losses)`: two lists holding the mean batch
    loss of each epoch, in epoch order.
  """
  train_losses = []
  val_losses = []
  for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    batch_train_losses = []
    for texts, labels in train_dataloader:
      texts = texts.to(device)
      labels = labels.to(device)

      outputs = model(texts)
      loss = criterion(outputs, labels)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      batch_train_losses.append(loss.item())

    train_loss = sum(batch_train_losses) / len(batch_train_losses)
    train_losses.append(train_loss)

    # ---- validation pass (dropout disabled, no gradients) ----
    model.eval()
    batch_val_losses = []
    all_predictions = []
    all_labels = []
    with torch.no_grad():
      for texts, labels in val_dataloader:
        texts = texts.to(device)
        labels = labels.to(device)

        outputs = model(texts)
        batch_val_losses.append(criterion(outputs, labels).item())

        # Convert once per batch; calling .item() on every element would
        # force one host/device synchronisation per sample.
        all_labels.extend(labels.tolist())
        all_predictions.extend(torch.argmax(outputs, dim = 1).tolist())

    val_loss = sum(batch_val_losses) / len(batch_val_losses)
    val_losses.append(val_loss)

    # zero_division=0 reports 0.0 (without warnings) for classes that are
    # never predicted, which would otherwise have undefined precision.
    metric = classification_report(all_labels, all_predictions, zero_division = 0)
    print("Epoch: {}/{},Train Loss: {:.4f}, Val Loss: {:.4f}".format(epoch + 1, epochs, train_loss, val_loss))
    print(metric)

  # Hand the loss histories back so they can be plotted or inspected.
  return train_losses, val_losses


In [54]:
train();  # run training; the trailing ';' keeps the cell from echoing a return value

Epoch: 1/50,Train Loss: 0.8931, Val Loss: 0.9265
              precision    recall  f1-score   support

           0       0.60      0.96      0.74       570
           1       0.00      0.00      0.00       122
           2       0.41      0.09      0.14       278

    accuracy                           0.59       970
   macro avg       0.34      0.35      0.29       970
weighted avg       0.47      0.59      0.47       970

Epoch: 2/50,Train Loss: 0.8917, Val Loss: 0.9256
              precision    recall  f1-score   support

           0       0.60      0.95      0.74       570
           1       0.00      0.00      0.00       122
           2       0.38      0.08      0.14       278

    accuracy                           0.58       970
   macro avg       0.33      0.35      0.29       970
weighted avg       0.46      0.58      0.47       970

Epoch: 3/50,Train Loss: 0.8884, Val Loss: 0.9259
              precision    recall  f1-score   support

           0       0.60      0.95   