In [1]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors 



In [2]:
import torch
# Display-only check of which device the kernel can use; the result is not bound to a name here.
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cuda')

In [3]:
# Load the training split of the pre-processed news corpus.
# NOTE(review): the name `train` is re-bound to the training-loop function further down this
# notebook, so this DataFrame is only reachable until that cell has run.
train = pd.read_csv('/kaggle/input/vietnews/processed_data.csv')

In [4]:
# Preview the first rows to check the columns ('original' article text, 'summary' reference).
train.head()

Unnamed: 0,original,summary
0,bộ_trưởng sitharaman phát_biểu tại buổi gặp_gỡ...,bộ_trưởng quốc_phòng ấn_độ bày_tỏ mong_muốn vi...
1,tổng_thống pháp emmanuel_macron phấn_khích ăn_...,sau màn ăn_mừng phấn_khích trên khán_đài emma...
2,lực_lượng tuần_duyên nhật_bản đang nỗ_lực tìm_...,một hòn đảo của nhật nằm sát khu_vực tranh_chấ...
3,cảnh_sát thành_phố barcelona tây_ban_nha đã đ...,tình_báo mỹ báo với cảnh_sát tây_ban_nha về ng...
4,trước_đây khán_giả thấy rằng nghệ_sĩ xuân_hươ...,nghệ_sĩ xuân_hương và mc thanh_bạch đã từng đư...


In [5]:
# Source articles and their reference summaries, as two separate Series.
x_train, y_train = train['original'], train['summary']

In [6]:
def _drop_digit_tokens(text):
    """Remove every whitespace-separated token that contains at least one digit."""
    kept = []
    for token in text.split():
        if any(ch.isdigit() for ch in token):
            continue
        kept.append(token)
    return ' '.join(kept)


x_train = x_train.apply(_drop_digit_tokens)
y_train = y_train.apply(_drop_digit_tokens)

In [7]:
def _space_out_periods(text):
    """Put spaces around each '.' and collapse whitespace runs into single spaces."""
    spaced = text.replace('.', ' . ')
    return ' '.join(spaced.split())


x_train = x_train.apply(_space_out_periods)
y_train = y_train.apply(_space_out_periods)

In [8]:
from tqdm import tqdm


def _count_tokens(sentences):
    """Return a dict mapping each whitespace-separated token to its number of occurrences."""
    counts = {}
    for text in tqdm(sentences):
        for token in text.split():
            counts[token] = counts.get(token, 0) + 1
    return counts


x_vocab_freq = _count_tokens(x_train.to_numpy())

y_vocab_freq = _count_tokens(y_train.to_numpy())

100%|██████████| 105418/105418 [00:13<00:00, 7706.72it/s]
100%|██████████| 105418/105418 [00:01<00:00, 77102.71it/s]


In [9]:
# Alphabetically ordered vocabularies. The "+ 3" leaves room for the three special
# entries (PAD / SOS / EOS) that the lookup tables add on top of the corpus tokens.
x_vocab = sorted(x_vocab_freq.keys())
x_vocab_size = 3 + len(x_vocab)

y_vocab = sorted(y_vocab_freq.keys())
y_vocab_size = 3 + len(y_vocab)

In [114]:
def _build_lookup_tables(vocab):
    """Build (word2id, id2word) for `vocab`.

    Corpus tokens get ids 3, 4, ... in list order; ids 1, 2 and 0 are assigned to
    'SOS', 'EOS' and 'PAD' respectively.
    """
    word2id = {}
    id2word = {}
    for idx, token in enumerate(vocab, start=3):
        word2id[token] = idx
        id2word[idx] = token
    for token, idx in (('SOS', 1), ('EOS', 2), ('PAD', 0)):
        word2id[token] = idx
        id2word[idx] = token
    return word2id, id2word


x_word2id, x_id2word = _build_lookup_tables(x_vocab)

y_word2id, y_id2word = _build_lookup_tables(y_vocab)

In [11]:
def encode(sentence, word2id, unk_id=0):
    """Convert a whitespace-tokenised sentence into an id array framed by SOS/EOS.

    Args:
        sentence: text whose tokens are separated by whitespace.
        word2id: mapping token -> integer id; must contain the keys 'SOS' and 'EOS'.
        unk_id: id written for tokens that are missing from ``word2id``. Defaults to 0,
            which is the value the previous implementation left in place for such tokens.

    Returns:
        1-D ``np.int64`` array of length ``number_of_tokens + 2``: SOS id, one id per
        token, EOS id. (Ids used to be stored in a float64 array.)

    Raises:
        KeyError: if 'SOS' or 'EOS' is not a key of ``word2id``.
    """
    tokens = sentence.split()
    ids = np.full(len(tokens) + 2, unk_id, dtype=np.int64)
    ids[0] = word2id['SOS']
    ids[-1] = word2id['EOS']
    for position, token in enumerate(tokens, start=1):
        # Single lookup per token; unknown tokens fall back to unk_id.
        ids[position] = word2id.get(token, unk_id)
    return ids

In [12]:
# Encode every article / summary with its own lookup table (passed through to `encode`).
x_train_vec = x_train.apply(encode, args=(x_word2id,))
y_train_vec = y_train.apply(encode, args=(y_word2id,))

In [13]:
# Pre-trained word vectors, read in word2vec format from the Kaggle input dataset.
word_embed = KeyedVectors.load_word2vec_format("/kaggle/input/vietnews/vi.vec")

In [14]:
# Width of each pre-trained vector. Read it from the loaded model instead of indexing an
# arbitrary token ('a'), which raises KeyError whenever that token is not in the vectors.
embedding_dim = word_embed.vector_size

In [27]:
def _build_embedding_matrix(vocab, word2id, vocab_size, vectors, dim):
    """Build the (vocab_size, dim) float32 embedding table for one vocabulary.

    Row ``word2id[word]`` receives ``vectors[word]`` when the pre-trained model knows the
    word. Every other row (including the special-token rows) stays all-zero.

    Returns:
        (matrix, missing): the table and the number of vocabulary words that had no
        pre-trained vector.
    """
    matrix = np.zeros((vocab_size, dim), dtype=np.float32)
    missing = 0
    for word in vocab:
        try:
            matrix[word2id[word]] = vectors[word]
        except KeyError:
            # Only "word not in the pre-trained vectors" is expected here; any other
            # error (e.g. a dimension mismatch) should surface instead of being hidden.
            missing += 1
    return matrix, missing


x_embedding_weight, x_missing = _build_embedding_matrix(
    x_vocab, x_word2id, x_vocab_size, word_embed, embedding_dim)

y_embedding_weight, y_missing = _build_embedding_matrix(
    y_vocab, y_word2id, y_vocab_size, word_embed, embedding_dim)

print(f'words without a pre-trained vector: source {x_missing}/{len(x_vocab)}, '
      f'target {y_missing}/{len(y_vocab)}')

In [16]:
# Encoded length (tokens + SOS + EOS) of every training example, and the maxima used for padding.
xlens = x_train_vec.map(len)
ylens = y_train_vec.map(len)
MAX_LENGTH_X = xlens.to_numpy().max()
MAX_LENGTH_Y = ylens.to_numpy().max()

In [17]:
# Drop the pandas index: keep only the underlying array of per-example id arrays.
x_train_vec = x_train_vec.to_numpy()
y_train_vec = y_train_vec.to_numpy()

In [18]:
from keras.utils import pad_sequences

# Right-pad (padding='post') every encoded sequence up to the longest one in the training set,
# so each side becomes a single rectangular array.
# NOTE(review): pad value and output dtype are the keras defaults — presumably 0 (the 'PAD' id)
# and int32; confirm against the installed keras version.
x_train_padded = pad_sequences(x_train_vec, maxlen=MAX_LENGTH_X, padding='post')
y_train_padded = pad_sequences(y_train_vec, maxlen=MAX_LENGTH_Y, padding='post')

In [19]:
import torch

# Padded id matrices as int64 tensors, the index dtype the embedding layers are fed with.
x_train_tensor, y_train_tensor = (
    torch.tensor(padded, dtype=torch.long)
    for padded in (x_train_padded, y_train_padded)
)

# model

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [32]:
class Encoder(nn.Module):
    """GRU encoder on top of frozen, pre-trained token embeddings.

    Args:
        input_size: source vocabulary size. Kept for interface compatibility; the
            embedding table takes its shape from ``embedding_weight`` (whose row count
            is expected to equal ``input_size``).
        hidden_size: width of the GRU hidden state.
        embedding_weight: ``(vocab, dim)`` array-like of pre-trained vectors.
    """

    def __init__(self, input_size, hidden_size, embedding_weight):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        # Going through np.asarray avoids the slow element-by-element conversion of a
        # Python list of rows; the embedding width is read from the data instead of a
        # notebook-level global.
        weight = torch.tensor(np.asarray(embedding_weight), dtype=torch.float32)
        n_rows, embed_dim = weight.shape
        self.embedding = nn.Embedding(n_rows, embed_dim)
        self.embedding.weight = nn.Parameter(weight, requires_grad=False)
        self.gru = nn.GRU(embed_dim, hidden_size)

    def forward(self, input, hidden):
        """Embed ``input`` and run it through the GRU.

        Args:
            input: LongTensor of token ids. The GRU is sequence-first, so a
                ``(seq_len, batch)`` id tensor together with a ``(1, batch, hidden_size)``
                state is processed as a batch of sequences. A 1-D id tensor becomes a
                2-D embedded tensor, which ``nn.GRU`` interprets as ONE unbatched
                sequence (it then requires a 2-D ``(1, hidden_size)`` state).
            hidden: initial hidden state, see ``initHidden``.

        Returns:
            (output, hidden) exactly as returned by ``nn.GRU``.
        """
        embedded = self.embedding(input)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self, batch_size=None):
        """Return an all-zero initial hidden state on the module's device.

        Args:
            batch_size: when given, the state has shape ``(1, batch_size, hidden_size)``
                for batched input. When omitted, the unbatched ``(1, hidden_size)``
                state of the original interface is returned.
        """
        where = self.gru.weight_hh_l0.device
        if batch_size is None:
            return torch.zeros(1, self.hidden_size, device=where)
        return torch.zeros(1, batch_size, self.hidden_size, device=where)

In [33]:
class Decoder(nn.Module):
    """Single-layer GRU decoder on top of frozen, pre-trained token embeddings.

    Args:
        hidden_size: width of the GRU hidden state.
        output_size: target vocabulary size (number of output classes).
        embedding_weight: ``(vocab, dim)`` array-like of pre-trained vectors.
    """

    def __init__(self, hidden_size, output_size, embedding_weight):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size

        # Embedding width comes from the weights rather than a notebook-level global.
        weight = torch.tensor(np.asarray(embedding_weight), dtype=torch.float32)
        n_rows, embed_dim = weight.shape
        self.embedding = nn.Embedding(n_rows, embed_dim)
        self.embedding.weight = nn.Parameter(weight, requires_grad=False)
        self.gru = nn.GRU(embed_dim, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        # Normalise over the vocabulary axis, which is always the last one.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, hidden):
        """Advance the decoder and return log-probabilities over the vocabulary.

        A 1-D ``input`` of shape ``(batch,)`` is treated as ONE time step for ``batch``
        independent sequences. Previously the embedded ``(batch, dim)`` tensor went to
        ``nn.GRU`` as-is, which reads a 2-D tensor as a single unbatched sequence, so
        the samples of a batch were chained together as if they were time steps.

        Args:
            input: LongTensor of token ids, ``(batch,)`` for a single step or
                ``(seq_len, batch)`` for several steps.
            hidden: ``(1, batch, hidden_size)`` state. A 2-D ``(1, hidden_size)`` state
                (the original interface) is accepted and used as the start state of
                every sample.

        Returns:
            (log_probs, hidden): ``log_probs`` is ``(batch, output_size)`` for 1-D input
            and ``(seq_len, batch, output_size)`` for 2-D input; ``hidden`` is
            ``(1, batch, hidden_size)``.
        """
        single_step = input.dim() <= 1
        if single_step:
            input = input.reshape(1, -1)  # add the time axis: (1, batch)
        batch_size = input.size(1)
        if hidden.dim() == 2:
            hidden = hidden.unsqueeze(1).expand(-1, batch_size, -1).contiguous()

        output = F.relu(self.embedding(input))
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output))
        if single_step:
            output = output.squeeze(0)
        return output, hidden

    def initHidden(self):
        """Return an all-zero ``(1, hidden_size)`` state on the module's device."""
        return torch.zeros(1, self.hidden_size, device=self.out.weight.device)

In [23]:
class AttnDecoder(nn.Module):
    """GRU decoder with learned attention over a fixed number of encoder positions.

    Args:
        hidden_size: width of the GRU hidden state and of each encoder output row.
        output_size: target vocabulary size.
        embedding_weight: ``(vocab, dim)`` array-like of pre-trained vectors (frozen).
        dropout_p: dropout probability applied to the embedded input.
        max_length: number of encoder positions the attention layer scores.
    """

    def __init__(self, hidden_size, output_size, embedding_weight, dropout_p, max_length):
        super(AttnDecoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        weight = torch.tensor(np.asarray(embedding_weight), dtype=torch.float32)
        n_rows, embed_dim = weight.shape
        self.embedding = nn.Embedding(n_rows, embed_dim)
        self.embedding.weight = nn.Parameter(weight, requires_grad=False)
        # Both layers consume [embedded ; hidden-sized vector]. Size them from the real
        # embedding width so pre-trained vectors need not be exactly hidden_size wide
        # (identical to the previous "hidden_size * 2" when the two widths coincide).
        self.attn = nn.Linear(embed_dim + self.hidden_size, self.max_length)
        self.attn_combine = nn.Linear(embed_dim + self.hidden_size, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one attention-decoding step.

        Args:
            input: LongTensor of token ids with a leading time axis of size 1, i.e.
                ``(1, batch)`` (only index 0 of the embedded tensor is used).
            hidden: ``(1, batch, hidden_size)`` GRU state.
            encoder_outputs: ``(max_length, hidden_size)`` tensor; the same rows are
                attended to for every element of the batch.

        Returns:
            (log_probs, hidden, attn_weights) with shapes ``(1, batch, output_size)``,
            ``(1, batch, hidden_size)`` and ``(batch, max_length)``.
        """
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        # The GRU output is 3-D (time, batch, features): normalise over the vocabulary
        # (last) axis. dim=1 normalised across the batch axis instead, which for a
        # batch of one turns every log-probability into 0.
        output = F.log_softmax(self.out(output), dim=-1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero ``(1, hidden_size)`` state on the module's device."""
        return torch.zeros(1, self.hidden_size, device=self.out.weight.device)

In [144]:
class Seq2Seq(nn.Module):
    """Encoder-decoder summariser: GRU encoder, GRU decoder, optional teacher forcing.

    Inputs are sequence-first id tensors: ``input`` is ``(input_len, batch)`` and
    ``target`` is ``(output_len, batch)``.

    The previous implementation fed 1-D id tensors and a 2-D hidden state to the GRUs.
    ``nn.GRU`` reads that combination as ONE unbatched sequence, so the samples of a
    batch were processed as consecutive time steps sharing a single hidden state. Here
    every sample keeps its own state of shape ``(1, batch, hidden_size)``.

    Args:
        input_size: source vocabulary size.
        hidden_size: GRU hidden width shared by encoder and decoder.
        output_size: target vocabulary size.
        x_embedding_weight: pre-trained source embedding table.
        y_embedding_weight: pre-trained target embedding table.
        teacher_forcing_ratio: per-step probability of feeding the reference token
            (instead of the model's own prediction) as the next decoder input.
    """

    SOS_ID = 1  # id given to 'SOS' by the lookup tables built in this notebook

    def __init__(self, input_size, hidden_size, output_size, x_embedding_weight, y_embedding_weight, teacher_forcing_ratio):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(input_size, hidden_size, x_embedding_weight).to(device)
        self.decoder = Decoder(hidden_size, output_size, y_embedding_weight).to(device)
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def _encode(self, input):
        """Run the whole ``(input_len, batch)`` source through the encoder; return the final state."""
        batch_size = input.size(1)
        hidden = torch.zeros(1, batch_size, self.encoder.hidden_size, device=input.device)
        # One batched call over the full sequence replaces the per-time-step Python loop.
        _, hidden = self.encoder(input, hidden)
        return hidden

    def _decode_step(self, tokens, hidden):
        """Advance every sample by one token.

        Applies the decoder's layers in the order embedding -> ReLU -> GRU -> linear ->
        log-softmax, with an explicit time axis of length 1 so that ``tokens`` of shape
        ``(batch,)`` is handled as a batch rather than as a sequence.

        Returns:
            (log_probs, hidden) with shapes ``(batch, output_size)`` and
            ``(1, batch, hidden_size)``.
        """
        embedded = F.relu(self.decoder.embedding(tokens)).unsqueeze(0)
        output, hidden = self.decoder.gru(embedded, hidden)
        log_probs = F.log_softmax(self.decoder.out(output.squeeze(0)), dim=1)
        return log_probs, hidden

    def _start_tokens(self, batch_size, where):
        """Return a ``(batch,)`` LongTensor filled with the SOS id."""
        return torch.full((batch_size,), self.SOS_ID, dtype=torch.long, device=where)

    def forward(self, input, target, criterion):
        """Compute the training loss for one batch.

        Args:
            input: ``(input_len, batch)`` LongTensor of source ids.
            target: ``(output_len, batch)`` LongTensor of reference ids.
            criterion: loss taking ``(batch, output_size)`` log-probabilities and
                ``(batch,)`` class ids, applied once per output step.

        Returns:
            (loss, decoder_output): the sum of the per-step criterion values and the
            log-probabilities of the last step (``None`` when ``output_len`` is 0).
        """
        output_len = target.size(0)
        batch_size = input.size(1)

        decoder_hidden = self._encode(input)
        decoder_input = self._start_tokens(batch_size, input.device)

        loss = 0
        decoder_output = None
        for di in range(output_len):
            decoder_output, decoder_hidden = self._decode_step(decoder_input, decoder_hidden)
            loss = loss + criterion(decoder_output, target[di])
            if random.random() < self.teacher_forcing_ratio:
                decoder_input = target[di]
            else:
                # argmax over the vocabulary axis keeps the (batch,) shape even for a
                # batch of one (a bare squeeze() would collapse it to a scalar).
                decoder_input = decoder_output.argmax(dim=1).detach()
        return loss, decoder_output

    @torch.no_grad()
    def infer(self, input, output_len):
        """Greedy decoding.

        Args:
            input: ``(input_len, batch)`` LongTensor of source ids.
            output_len: number of tokens to generate.

        Returns:
            List of ``output_len + 1`` numpy arrays of shape ``(batch,)``: the SOS ids
            followed by the most likely token id at every generated step.
        """
        batch_size = input.size(1)
        outputs = [np.full(batch_size, self.SOS_ID, dtype=np.int64)]

        decoder_hidden = self._encode(input)
        decoder_input = self._start_tokens(batch_size, input.device)
        for _ in tqdm(range(output_len)):
            decoder_output, decoder_hidden = self._decode_step(decoder_input, decoder_hidden)
            decoder_input = decoder_output.argmax(dim=1)
            outputs.append(decoder_input.cpu().numpy())

        return outputs


In [44]:
from tqdm import tqdm
def train(model, dataloader, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Each batch is a pair of batch-first id tensors. They are transposed to
    sequence-first layout, moved to ``device`` and handed to ``model`` together with
    ``criterion``; ``model`` returns the batch loss as the first element of a tuple.

    Args:
        model: callable as ``model(input, target, criterion) -> (loss, _)``.
        dataloader: sized iterable of ``(source, target)`` batches.
        criterion: loss function forwarded to ``model``.
        optimizer: optimizer that updates ``model``'s parameters.
        epochs: number of passes over ``dataloader``.

    Returns:
        List with the mean batch loss of every epoch (also printed as training runs).

    Raises:
        ValueError: if ``dataloader`` yields no batches (the mean would divide by zero).
    """
    n_batches = len(dataloader)
    if n_batches == 0:
        raise ValueError('dataloader yields no batches')

    model.train()
    history = []
    for epoch in range(epochs):
        running_loss = 0.0
        for batch in tqdm(dataloader):
            source = batch[0].transpose(1, 0).to(device)
            target = batch[1].transpose(1, 0).to(device)

            optimizer.zero_grad()
            loss, _ = model(source, target, criterion)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        avg_loss = running_loss / n_batches
        history.append(avg_loss)
        print(f'Epoch {epoch+1} loss: {avg_loss}')
    return history

In [145]:
import torch.utils.data as data

hidden_size = 256
batch_size = 256
epochs = 20
# 0 means the decoder is always fed its own previous prediction, never the reference token.
teacher_forcing_ratio = 0
SEED = 42

# Fix the RNGs that drive weight initialisation, batch shuffling and teacher forcing.
random.seed(SEED)
torch.manual_seed(SEED)

dataset = data.TensorDataset(x_train_tensor, y_train_tensor)
dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
model = Seq2Seq(x_vocab_size, hidden_size, y_vocab_size, x_embedding_weight, y_embedding_weight, teacher_forcing_ratio).to(device)
# Every target is padded to the longest summary, so most target positions are PAD.
# Exclude them from the loss so the model is not trained mainly to emit padding.
# reduction='sum' is deliberate: the loss is evaluated once per time step, and a step
# whose targets are all PAD would make the default 'mean' reduction divide 0 by 0.
# The reported loss is therefore a per-batch sum over real tokens, not a mean.
criterion = nn.NLLLoss(ignore_index=y_word2id['PAD'], reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# Run the full training schedule; the mean batch loss is printed after every epoch.
train(model, dataloader, criterion, optimizer, epochs)

In [77]:
# Persist the trained model; the file name carries the configured number of epochs.
# NOTE(review): this stores the whole module object rather than model.state_dict() —
# confirm that is intended, since loading it back needs the same class definitions available.
torch.save(model, f'model_epoch_{epochs}.pt')

In [104]:
# Load the validation split (same 'original' / 'summary' columns are read from it below).
val = pd.read_csv('/kaggle/input/vietnews/processed_data_val.csv')

In [150]:
# Untouched validation articles / reference summaries, kept for side-by-side display later.
x_val_0, y_val_0 = val['original'], val['summary']

In [None]:
def _strip_numeric_tokens(text):
    """Keep only the whitespace-separated tokens that contain no digit."""
    return ' '.join(tok for tok in text.split() if not any(ch.isdigit() for ch in tok))


x_val = x_val_0.apply(_strip_numeric_tokens)
y_val = y_val_0.apply(_strip_numeric_tokens)

In [None]:
def _isolate_periods(text):
    """Surround each '.' with spaces, then squeeze whitespace runs to one space."""
    return ' '.join(text.replace('.', ' . ').split())


x_val = x_val.apply(_isolate_periods)
y_val = y_val.apply(_isolate_periods)

In [106]:
# Encode the validation text with the lookup tables built from the training data.
x_val_vec = x_val.apply(encode, args=(x_word2id,))
y_val_vec = y_val.apply(encode, args=(y_word2id,))

In [108]:
# Right-pad the validation sequences to the TRAINING maxima so their width matches the training tensors.
# NOTE(review): a validation sequence longer than MAX_LENGTH_X / MAX_LENGTH_Y gets truncated by
# pad_sequences (keras default side — presumably 'pre', i.e. the start is dropped; confirm).
x_val_padded = pad_sequences(x_val_vec, maxlen=MAX_LENGTH_X, padding='post')
y_val_padded = pad_sequences(y_val_vec, maxlen=MAX_LENGTH_Y, padding='post')

In [110]:
# int64 id tensors for the validation split, placed on the compute device.
x_val_tensor, y_val_tensor = (
    torch.tensor(padded, dtype=torch.long).to(device)
    for padded in (x_val_padded, y_val_padded)
)

In [None]:
# Greedy-decode the first five validation articles (transposed to sequence-first layout),
# generating MAX_LENGTH_Y tokens for each.
outputs = model.infer(x_val_tensor[0:5].transpose(1,0), MAX_LENGTH_Y)

In [148]:
def getText(outputs):
    """Map decoded ids back to target-vocabulary tokens.

    Args:
        outputs: sequence of per-step id arrays (one entry per time step, one value per
            sample), as returned by the model's ``infer``.

    Returns:
        One list of tokens per sample, in generation order.
    """
    per_sample = np.transpose(outputs)  # rows become samples, columns time steps
    return [[y_id2word[int(token_id)] for token_id in row] for row in per_sample]

In [None]:
# The five raw validation articles that were summarised above.
x_val_0[0:5].values

In [None]:
# Model output for those five articles, as token lists.
getText(outputs)

In [None]:
# Reference summaries for the same five articles, for comparison with the predictions.
y_val_0[0:5].values