**Load dataset**: Load the PubMed abstract dataset for training. Labels are not required to train the generator.

**Preprocess the data**: Clean and preprocess the dataset to remove any unnecessary characters, formatting, or noise. Tokenize the text by splitting it into individual words or subwords. This step helps the model understand the structure of the text.

**Build the model architecture**: Design the architecture of the text generator model using PyTorch.

**Train the model**: Feed the preprocessed dataset into the model and train. The training process involves optimizing the model's parameters by minimizing a loss function that measures the difference between the generated text and the ground truth text in the dataset.

**Predict**: Use the `predict` function to generate text from a given input text.

**Save and Load model**: Optional

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Mount Google Drive at the relative path 'gdrive' so the data files can be read.
# NOTE(review): the data path used later starts with /content/gdrive, which
# assumes the working directory is /content — confirm when running elsewhere.
from google.colab import drive
drive.mount('gdrive')
import os

Load the data: read the abstracts CSV file from the data directory.

In [None]:

# Folder and file name of the abstracts CSV; `data_dir` is also where
# checkpoints are written later on.
data_dir = "/content/gdrive/My Drive/DagDataScienceMaterial/data_folder/TextFolder/"
dd_file = "pubmed_abs.csv"

# Build the full path first, then read the whole CSV into a DataFrame.
csv_path = os.path.join(data_dir, dd_file)
df = pd.read_csv(csv_path)

In [None]:
# Display the (rows, columns) shape of the loaded DataFrame as the cell output.
df.shape

**Design model**: We define our neural network by subclassing `nn.Module` and initialize the neural network layers in `__init__`. Every `nn.Module` subclass implements the operations on input data in the **forward** method.

In [None]:
import torch
import argparse
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
from collections import Counter

class Model(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> two linear layers.

    The LSTM is built with PyTorch's default ``batch_first=False``, so it reads
    dimension 0 of its input as time and dimension 1 as the set of independent
    sequences ("streams"). ``forward`` passes ``x`` through without reshaping,
    so callers decide which axis of their data plays which role.

    Args:
        dataset: Object exposing ``uniq_words``; its length is the vocabulary
            size used for the embedding table and the output layer.
        lstm_size: Hidden size of every LSTM layer.
        embedding_dim: Size of the word embeddings fed into the LSTM.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout applied between LSTM layers.
    """

    def __init__(self, dataset, lstm_size=128, embedding_dim=128,
                 num_layers=3, dropout=0.2):
        super().__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            # The LSTM consumes embeddings, so its input width is the
            # embedding size (not the hidden size, which merely happened to
            # have the same default value).
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            # Inter-layer dropout has nothing to act on with a single layer
            # (PyTorch warns about it), so it is switched off in that case.
            dropout=dropout if self.num_layers > 1 else 0.0,
        )
        # NOTE: there is no non-linearity between fc1 and fc2.
        self.fc1 = nn.Linear(self.lstm_size, 256)
        self.fc2 = nn.Linear(256, n_vocab)

    def forward(self, x, prev_state):
        """Return vocabulary logits for every position of ``x`` and the new state.

        Args:
            x: Integer tensor of word indices; the LSTM interprets it as
                ``(time, streams)``.
            prev_state: ``(h, c)`` tuple as produced by ``init_state`` or by a
                previous call to ``forward``.

        Returns:
            ``(logits, state)`` where ``logits`` has the shape of ``x`` plus a
            trailing vocabulary dimension and ``state`` is the LSTM's
            ``(h, c)`` after the last time step.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        output = self.fc1(output)
        logits = self.fc2(output)

        return logits, state

    def init_state(self, sequence_length):
        """Return zero-filled ``(h, c)`` tensors for the LSTM.

        ``sequence_length`` becomes dimension 1 of both tensors, i.e. the
        number of independent streams the LSTM will process (it must equal
        dimension 1 of the ``x`` later passed to ``forward``).

        The tensors are created on the same device and with the same dtype as
        the model's weights, so they can be used directly after ``model.to(...)``.
        """
        reference = self.embedding.weight
        shape = (self.num_layers, sequence_length, self.lstm_size)
        return (torch.zeros(shape, dtype=reference.dtype, device=reference.device),
                torch.zeros(shape, dtype=reference.dtype, device=reference.device))

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:


class Dataset(torch.utils.data.Dataset):
    """Sliding-window next-word dataset built from a text column.

    All rows of the text column are joined into one word stream. Item ``i`` is
    the pair ``(words[i : i + L], words[i + 1 : i + L + 1])`` of index tensors,
    where ``L`` is ``args.sequence_length`` — i.e. the target is the input
    window shifted by one word.

    Args:
        args: Object with a ``sequence_length`` attribute.
        dataframe: DataFrame holding the text. When omitted, the module-level
            ``df`` is used, which keeps ``Dataset(args)`` working as before.
        text_column: Name of the column containing the text.

    Attributes set on the instance: ``words`` (token list), ``uniq_words``
    (vocabulary, most frequent first), ``index_to_word`` / ``word_to_index``
    (vocabulary lookups) and ``words_indexes`` (``words`` mapped to indices).
    """

    def __init__(
        self,
        args,
        dataframe=None,
        text_column='Abstract',
    ):
        self.args = args
        self._dataframe = dataframe
        self._text_column = text_column
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = dict(enumerate(self.uniq_words))
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        """Join the text column into one string and split it into words."""
        frame = df if self._dataframe is None else self._dataframe
        text = frame[self._text_column].str.cat(sep=' ')
        # split() without an argument splits on any whitespace run, so double
        # spaces do not yield empty-string tokens and newlines/tabs do not stay
        # attached to words.
        return text.split()

    def get_uniq_words(self):
        """Return the distinct words ordered from most to least frequent."""
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        # A corpus shorter than one window has no samples; clamp so len()
        # never sees a negative value.
        return max(0, len(self.words_indexes) - self.args.sequence_length)

    def __getitem__(self, index):
        """Return the ``(input, target)`` index tensors for window ``index``.

        Raises:
            IndexError: If ``index`` is outside ``[0, len(self))``. Without
                this, plain iteration over the dataset would never stop and
                out-of-range windows would come back truncated.
        """
        if not 0 <= index < len(self):
            raise IndexError("window index {} out of range".format(index))
        stop = index + self.args.sequence_length
        return (
            torch.tensor(self.words_indexes[index:stop]),
            torch.tensor(self.words_indexes[index + 1:stop + 1]),
        )

Data preprocessing: tokenizing the text and indexing the tokens

In [None]:
def train(dataset, model, args):
    """Train ``model`` on ``dataset`` with Adam and cross-entropy loss.

    Each batch ``x`` from the DataLoader has shape
    ``(batch_size, sequence_length)``. With the ``Model`` defined in this file
    (LSTM with the default ``batch_first=False``) dimension 0 is read as time
    and dimension 1 as ``sequence_length`` parallel streams. Because the
    dataset windows advance one word at a time and the loader is not shuffled,
    consecutive rows are consecutive time steps, and the hidden state carried
    from batch to batch continues each stream where it stopped.

    Every fifth epoch a checkpoint (epoch, model and optimizer state) is saved
    to ``os.path.join(data_dir, args.model_file)`` and the epoch's mean loss is
    printed.

    Args:
        dataset: Map-style dataset yielding ``(input, target)`` index tensors.
        model: Module providing ``init_state(n)`` and ``forward(x, state)``.
        args: Object with ``batch_size``, ``max_epochs``, ``sequence_length``
            and ``model_file`` attributes.

    Raises:
        ValueError: If the dataset yields no batches.
    """
    model.train()

    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        # Explicit on purpose: carrying the hidden state across batches is
        # only meaningful while the windows arrive in corpus order.
        shuffle=False,
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Resolved once, before any training, so a bad path fails immediately.
    model_path = os.path.join(data_dir, args.model_file)
    # Use the device the model actually lives on rather than a global.
    run_device = next(model.parameters()).device

    for epoch in range(args.max_epochs):
        state_h, state_c = model.init_state(args.sequence_length)
        state_h, state_c = state_h.to(run_device), state_c.to(run_device)
        running_loss = 0.0
        batch_count = 0
        for x, y in dataloader:
            x = x.to(run_device)
            y = y.to(run_device)
            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            # CrossEntropyLoss expects the class dimension second.
            loss = criterion(y_pred.transpose(1, 2), y)

            # Cut the graph so back-propagation stops at the batch boundary.
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            batch_count += 1

        if batch_count == 0:
            raise ValueError(
                "dataset produced no batches; it needs more than "
                "args.sequence_length words"
            )
        mean_loss = running_loss / float(batch_count)

        if (epoch + 1) % 5 == 0:
            checkpoint = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(checkpoint, model_path)

            print({'epoch': epoch, 'loss': mean_loss})


Once the model is trained, we can generate new text from the given inputs.

In [None]:
def predict(dataset, model, text, next_words=20):
    """Continue ``text`` by sampling ``next_words`` words from ``model``.

    The prompt is fed to the model as one time-major sequence (shape
    ``(n_prompt_words, 1)``), so the first sampled word is conditioned on the
    whole prompt. Each sampled word is then fed back on its own together with
    the carried LSTM state. Previously the prompt words were laid out along
    the LSTM's batch axis, which meant the stream used for sampling had only
    ever seen the last prompt word.

    Args:
        dataset: Object exposing ``word_to_index`` and ``index_to_word``.
        model: Module providing ``init_state(n)`` and ``forward(x, state)``
            that reads ``x`` as ``(time, streams)``.
        text: Whitespace-separated prompt; every word must be in the
            vocabulary.
        next_words: Number of words to generate.

    Returns:
        The prompt words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If ``text`` contains no words.
        KeyError: If a prompt word is not in ``dataset.word_to_index``.
    """
    words = text.split()
    if not words:
        raise ValueError("text must contain at least one word")
    unknown_words = [w for w in words if w not in dataset.word_to_index]
    if unknown_words:
        raise KeyError("words not in vocabulary: {}".format(unknown_words))

    model.eval()

    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    # One stream: the prompt is a single sequence.
    state_h, state_c = model.init_state(1)
    state_h, state_c = state_h.to(run_device), state_c.to(run_device)

    prompt_ids = [dataset.word_to_index[w] for w in words]
    x = torch.tensor(prompt_ids, device=run_device).unsqueeze(1)

    with torch.no_grad():  # inference only; no autograd graph needed
        for _ in range(next_words):
            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            last_word_logits = y_pred[-1, 0]
            p = torch.nn.functional.softmax(last_word_logits, dim=0).cpu().numpy()
            # float32 rounding can leave the sum slightly off 1, which
            # np.random.choice rejects; renormalise in float64.
            p = p.astype(np.float64)
            p /= p.sum()
            word_index = int(np.random.choice(len(p), p=p))
            words.append(dataset.index_to_word[word_index])
            # Only the new word is fed next; the state carries the history.
            x = torch.tensor([[word_index]], device=run_device)

    return " ".join(words)


In [None]:
# Hyper-parameters and output file name, overridable from the command line.
parser = argparse.ArgumentParser()
parser.add_argument('--max-epochs', type=int, default=80)
parser.add_argument('--batch-size', type=int, default=256)
parser.add_argument('--sequence-length', type=int, default=20)
# Hyphenated like the other options; the original underscore spelling is kept
# as an alias so existing invocations still work. The attribute name is
# unchanged (args.model_file).
parser.add_argument('--model-file', '--model_file', dest='model_file',
                    type=str, default="pubmed_generator.pt")
# parse_known_args returns unrecognised arguments in `unknown` instead of
# exiting — presumably needed because the notebook kernel passes its own
# command-line arguments; confirm before switching to parse_args.
args, unknown = parser.parse_known_args()
print(args)

# Build the vocabulary and sliding windows from the loaded abstracts.
dataset = Dataset(args)

# The vocabulary size is taken from the dataset; move the model to the
# selected device before training.
model = Model(dataset)
model.to(device)

train(dataset, model, args)

In [None]:
# Generate a 30-word continuation of the prompt with the trained model and show it.
generated_text = predict(
    dataset,
    model,
    text='Most occasions cases for chronic pain',
    next_words=30,
)
print(generated_text)