In [None]:
import csv
import random
import re
import string
import time

import numpy as np

import nltk
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
print(string.punctuation)

In [None]:
SOS_token = 0
EOS_token = 1

message_path = "./messages.tsv"

DATASET_PATH = "./message_dataset.pth"
DATALOADER_PATH = "./message_dataloader.pth"

In [120]:
class MessageDataset(Dataset):
    def __init__(self, message_file):
        self.filepath = message_file
        self.punctuation = {"?", "!", ":", "/", ";"}

        self.VOCAB_INDEX = {}
        self.INDEX_VOCAB = {0: "<sos>", 1: "<eos>"}
        self.word_num = 2

        self.prompt = []
        self.response = []

        self.max_message = 0

        with open(self.filepath) as message_file:
            reader = list(csv.reader(message_file, delimiter="\t"))
            message_len = len(reader)
            for i,row in enumerate(reader):
                you = row[0]
                me = row[1]

                you_tokens = self.tokenize(you)
                me_tokens = self.tokenize(me)

                for tokens in (you_tokens, me_tokens):
                    if len(tokens) > self.max_message:
                        self.max_message = len(tokens)
                    for token in tokens:
                        if token not in self.VOCAB_INDEX:
                            self.VOCAB_INDEX[token] = self.word_num
                            self.INDEX_VOCAB[self.word_num] = token
                            self.word_num += 1
                
                self.prompt.append(you)
                self.response.append(me)
                
                if i % 1000 == 0:
                    print("Processed {} out of {} rows in message file".format(i, message_len))
        
        self.VOCAB_LEN = len(self.VOCAB_INDEX)

    def __getitem__(self, index):
        you = self.tokenize(self.prompt[index])
        me = self.tokenize(self.response[index])

        you_indices = np.zeros((len(you), 1))
        me_indices = np.zeros((len(me), 1))

        for i,tok in enumerate(you):
            ind = self.VOCAB_INDEX[tok]
            you_indices[i] = ind
        
        for i,tok in enumerate(me):
            ind = self.VOCAB_INDEX[tok]
            me_indices[i] = ind

        you_np = np.asarray(you_indices)
        me_np = np.asarray(me_indices)

        return torch.LongTensor(you_np), torch.LongTensor(me_np)

    def __len__(self):
        return len(self.prompt)
    
    def tokenize(self, text):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        words = word_tokenize(text)
        return words

In [121]:
try:
    dataset = torch.load(DATASET_PATH)
    dataloader = torch.load(DATALOADER_PATH)
    print("Loaded dataset and dataloader from paths")
except:
    print("Did not load dataset and dataloader from storage")
    dataset = MessageDataset(message_path)
    dataloader = DataLoader(dataset=dataset, batch_size=1, shuffle=True, pin_memory=True, num_workers=2)
    # torch.save(dataset, DATASET_PATH)
    # torch.save(dataloader, DATALOADER_PATH)
    print("Saved dataset and dataloader at {} and {}".format(DATASET_PATH, DATALOADER_PATH))

Did not load dataset and dataloader from storage
Processed 0 out of 21658 rows in message file
Processed 1000 out of 21658 rows in message file
Processed 2000 out of 21658 rows in message file
Processed 3000 out of 21658 rows in message file
Processed 4000 out of 21658 rows in message file
Processed 5000 out of 21658 rows in message file
Processed 6000 out of 21658 rows in message file
Processed 7000 out of 21658 rows in message file
Processed 8000 out of 21658 rows in message file
Processed 9000 out of 21658 rows in message file
Processed 10000 out of 21658 rows in message file
Processed 11000 out of 21658 rows in message file
Processed 12000 out of 21658 rows in message file
Processed 13000 out of 21658 rows in message file
Processed 14000 out of 21658 rows in message file
Processed 15000 out of 21658 rows in message file
Processed 16000 out of 21658 rows in message file
Processed 17000 out of 21658 rows in message file
Processed 18000 out of 21658 rows in message file
Processed 1900

In [61]:
print(dataset.VOCAB_INDEX)

 19117, 'clubbing': 19118, 'rugby': 19119, 'vaping': 19120, 'partaking': 19121, 'psetig': 19122, 'determination': 19123, 'ithis': 19124, 'bolded': 19125, 'measured': 19126, 'imposter': 19127, 'whenre': 19128, 'originaly': 19129, 'cannnn': 19130, 'whichever': 19131, 'gigle': 19132, 'cutee': 19133, 'hellllo': 19134, 'hundred': 19135, 'plagu': 19136, 'aweeeee': 19137, 'whatre': 19138, 'hom': 19139, 'gv': 19140, 'trout': 19141, 'slapped': 19142, 'emalee': 19143, 'liquor': 19144, 'whyre': 19145, 'callls': 19146, 'lychee': 19147, 'alcoholism': 19148, 'tackled': 19149, 'tackle': 19150, 'throgh': 19151, 'jammmmm': 19152, 'thurssaturday': 19153, 'partylong': 19154, 'efuck': 19155, 'frisun': 19156, 'bak': 19157, 'dumbasses': 19158, 'dentists': 19159, 'dinghys': 19160, 'mahbe': 19161, 'listings': 19162, 'pictirws': 19163, 'cuddling': 19164, 'unmarried': 19165, 'miami': 19166, 'weeather': 19167, 'dissapoint': 19168, 'richness': 19169, 'slyly': 19170, 'yacht': 19171, 'estate': 19172, 'volatile': 19

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, max_length, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [152]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_tensor = torch.transpose(input_tensor, 0, 1).squeeze(1).squeeze(1)
    target_tensor = torch.transpose(target_tensor, 0, 1).squeeze(1).squeeze(1)

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
            if decoder_input.item() == EOS_token:
                break

    if loss != 0:
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        return loss.item() / target_length
    else:
        return loss

In [71]:
learning_rate = 0.01
# NEED TO SET
vocab_words = dataset.VOCAB_LEN
max_length = dataset.max_message
print(max_length)

801


In [131]:
def trainIters(encoder, decoder, dataloader, epochs, temp_path, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    prev_loss_avg = None

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for i in range(epochs):
        for prompt,response in dataloader:
            input_tensor = prompt
            target_tensor = response

            loss = train(input_tensor, target_tensor, encoder,
                        decoder, encoder_optimizer, decoder_optimizer, criterion, max_length)
            print_loss_total += loss
            plot_loss_total += loss
        else:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d) %.4f' % (timeSince(start, i / epochs), print_loss_avg))
            if prev_loss_avg is None or print_loss_avg < prev_loss_avg:
                torch.save({encoder: encoder.state_dict(), decoder: decoder.state_dict()}, temp_path)
                print("Saved model at temp path!")

In [151]:
hidden_size = 256
encoder = EncoderRNN(vocab_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, vocab_words, max_length, dropout_p=0.1).to(device)

epochs = 2
temp_path = "./bot_temp.pth"
final_path = "./bot.pth"

trainIters(encoder, decoder, dataloader, epochs, max_length)
torch.save({encoder: encoder.state_dict(), decoder: decoder.state_dict()}, final_path)

AttributeError: 'int' object has no attribute 'item'

In [145]:
def evaluate(encoder, decoder, dataset, text, max_length):
    with torch.no_grad():
        input_tensor = build_input(text, dataset)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(dataset.INDEX_VOCAB[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [146]:
def build_input(text, dataset):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    words = word_tokenize(text)
    indices = np.zeros((len(you), 1))
    for i,tok in enumerate(words):
        ind = dataset.VOCAB_INDEX[tok]
        indices[i] = ind
    ind_np = np.asarray(indices)

    return torch.LongTensor(ind_np)

In [None]:
def play(text, encoder=encoder, decoder=decoder, dataset=dataset):
    output_words, attentions = evaluate(encoder, decoder, datset, text, max_length)
    output_sentence = ' '.join(output_words)
    print(output_sentence)