<a href="https://colab.research.google.com/github/khyatidoshi/HumorGeneration/blob/main/Humor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import torch
import torchtext as tt
from torch.utils.data import DataLoader
from torch import nn, optim
from collections import Counter
import torch.nn.functional as F

In [4]:
joke_dataset = pd.read_csv('new_joke_data.csv')
joke_dataset.head()

Unnamed: 0,ID,Joke
0,0,"[me narrating a documentary about narrators] ""..."
1,1,Telling my daughter garlic is good for you. Go...
2,2,I've been going through a really rough period ...
3,3,"If I could have dinner with anyone, dead or al..."
4,4,Two guys walk into a bar. The third guy ducks.


In [5]:
class Dataset:
  def __init__(self, dataset, sequence_length=64):
    self.sequence_length = sequence_length
    self.words = self.load_words(dataset)
    self.uniq_words = self.get_uniq_words()
    self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
    self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
    self.words_indexes = [self.word_to_index[w] for w in self.words]

  def load_words(self, dataset):
    text = " ".join(dataset['Joke'])
    return text.split(' ')

  def get_uniq_words(self):
    word_counts = Counter(self.words)
    return sorted(word_counts, key=word_counts.get, reverse=True)

  def __len__(self):
    return len(self.words_indexes) - self.sequence_length

  def __getitem__(self, index):
    return (
      torch.tensor(self.words_indexes[index:index + self.sequence_length]),
      torch.tensor(self.words_indexes[index + 1:index + self.sequence_length + 1]),
    )

In [28]:
class Model(nn.Module):
    def __init__(self, df, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(Model, self).__init__()
        self.ntoken = ntoken
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.df = df
        self.rnn = getattr(nn, 'LSTM')(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input, hidden):

        # Note that Input comes in as size (50 x 64)
        # This is 50 samples, taking 64 tokens of each joke (indices for each token)
        emb = self.drop(self.encoder(input))
        # We embed the indices as vectors - each index is going be represented by a vector of length 64
        # So that emb now as size (50 x 64 x 64) - each of the 64 tokens is now represented by a vector of length 64
        # print(" here  maybe")
        output = torch.zeros( ( emb.size(0), self.ninp ) )
        # Rather than append outputs to a list, I'm constructing the full output tensor in advance
        # And then I'll just fill it in


        for i in range(emb.size(0)):
            # print(" here ")
            rnn_output, hidden = self.rnn(emb[i].view(1, self.ninp, -1), hidden)
            # So note that emb[i] is of size (64 (tokens) x 64 (vector for each token))
            # emb[i].view(1, 64, -1) resizes it to [1 (sample) x 64 (tokens) x 64 (vector for each token)]
            # PREVIOUSLY: You had emb[i].view(1, 1, -1), which turned it into a tensor [ 1 x (64*64) ]
            # But this new version keeps each token separated rather than turning it into one really long vector


            # At this point, rnn_output has shape [1, 64, 1024], a sequence of 64 large calculated vectors
            # self.drop(rnn_output) - we drop some components but the size doesn't change
            rnn_output = self.drop(rnn_output)
            char_scores = self.decoder(rnn_output)
            # print(char_scores.size())
            # char_scores has size [1, 64, 163945]
            # For each of the 64 tokens, we have 163945 different values, amounting to a 'score' for each
            # Of the available tokens - this score is a simple linear function of the rnn_output terms

            # We want to turn each of these 'scores' into a probability for each token

            char_prob = F.log_softmax(char_scores, dim=1)
            actual_probs = torch.exp(char_prob)

            # actual_probs is of size [1, 64, 163945], for each of the 64 terms, we have 163945 probabilities
            # one probability for each possible token


            # Sampling the next character based on the probability distribution
            next_char = torch.multinomial(actual_probs[0], 1)
            # This is going to pick an integer (one of the token indices) based on the probabilities
            # So next_char is going to have shape [64, 1] - one integer for each of the 64 terms
            output[i] = next_char.view(-1)
            # Saves the selected integer into the output

        # The final shape of output is [50, 64] - for each of the 50 samples, we've generated a prediction of
        # 64 integer indices for each token

        return output, hidden


    def init_state(self, bsz):
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))


In [29]:
df = Dataset(joke_dataset)
ntoken = len(df.uniq_words)

model = Model(df,ntoken, ninp = 64 , nhid=256, nlayers=3, dropout=0.5, tie_weights=False)



In [51]:
def train_model(model, dataset, bs, num_epochs, learning_rate):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_loader = DataLoader(dataset, batch_size=bs, shuffle=True)
    print(train_loader.batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        total_loss = 0
        print(" EPOC ",epoch)
        for inputs, targets in train_loader:
            # print(" HERE ",inputs.size(),targets.size())
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()

            # Initialize hidden state for each batch
            hidden = model.init_state(bs)

            outputs, _ = model(inputs, hidden)  # The LSTM hidden state is now handled internally
            # print(" HERE ",outputs.size(),targets.size())

            # Cast to float
            float_tensor = targets.float()
            targets = torch.tensor(float_tensor, dtype=torch.float, requires_grad=True)
            loss = criterion(outputs, targets)

            loss.backward()

            optimizer.step()

            total_loss += loss.item()
            print("epoc ", epoch,"Loss: ",total_loss)
        print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {total_loss / len(train_loader)}")

    print("Training finished.")


In [None]:
# Usage
batch_size = 64
num_epochs = 10
learning_rate = 0.001

train_model(model, df, batch_size, num_epochs, learning_rate)


64
 EPOC  0


  targets = torch.tensor(float_tensor, dtype=torch.float, requires_grad=True)


epoc  0 Loss:  68116537344.0
epoc  0 Loss:  147088015360.0
epoc  0 Loss:  222600175616.0
epoc  0 Loss:  286442446848.0
epoc  0 Loss:  353752530944.0
epoc  0 Loss:  426754129920.0
epoc  0 Loss:  506541383680.0
epoc  0 Loss:  576851550208.0
epoc  0 Loss:  649847676928.0
epoc  0 Loss:  721930108928.0
epoc  0 Loss:  789547495424.0
epoc  0 Loss:  865959014400.0
epoc  0 Loss:  946460078080.0
epoc  0 Loss:  1020308746240.0
epoc  0 Loss:  1098228649984.0
epoc  0 Loss:  1170977607680.0
epoc  0 Loss:  1243142787072.0
epoc  0 Loss:  1313614696448.0
epoc  0 Loss:  1388045012992.0
epoc  0 Loss:  1463498174464.0
epoc  0 Loss:  1536194850816.0
epoc  0 Loss:  1612779614208.0
epoc  0 Loss:  1684526452736.0
epoc  0 Loss:  1751269380096.0
epoc  0 Loss:  1826859913216.0
epoc  0 Loss:  1908610338816.0
epoc  0 Loss:  1987209531392.0
epoc  0 Loss:  2064027725824.0
epoc  0 Loss:  2143568793600.0
epoc  0 Loss:  2213032267776.0
epoc  0 Loss:  2288873959424.0
epoc  0 Loss:  2357857046528.0
epoc  0 Loss:  2443134