In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gitanjali/democracy_and_education.txt
/kaggle/input/gitanjali/gitanjali.txt


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

In [14]:
with open("/kaggle/input/gitanjali/gitanjali.txt", "r", encoding="utf8") as file:
    data = file.readlines()
    data = "".join(data)
    data = re.sub(pattern= "[\\n]{2,}", repl= "<multiple_new_line>" , string= data)
    data = re.sub(pattern= "\\n", repl = " ", string = data)
    data = re.sub(pattern= "<multiple_new_line>", repl = "\n", string = data).lower()

In [54]:
n_vocab = len(set(data))
n_char = len(data)

token_to_id = dict((token, id_) for id_, token in enumerate(set(data)))
id_to_token = dict((id_, token) for id_, token in enumerate(set(data)))

In [55]:
seq_len = 100
stride = 1
X, y = [], []
for i in range(0,n_char - seq_len,stride):
    X.append([token_to_id[token] for token in data[i:i+seq_len]])
    y.append(token_to_id[data[i+seq_len]])

In [56]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.LongTensor(X).unsqueeze(2)/n_vocab,
        self.y = torch.LongTensor(y)
    def __len__(self):
        return self.y.__len__()
    def __getitem__(self, idx):
        return self.X[0][idx], self.y[idx]
    

In [57]:
training_frac = 0.9
slicing_idx = int(len(X)*training_frac)
train_dataset = CustomDataset(X[:slicing_idx],y[:slicing_idx])
test_dataset = CustomDataset(X[slicing_idx:],y[slicing_idx:])

In [92]:
batch_size = 1024
train_dataloader = DataLoader(dataset= train_dataset, shuffle = False, batch_size = batch_size)
test_dataloader = DataLoader(dataset= test_dataset, shuffle = False, batch_size = batch_size)

In [93]:
class TextGenerator(nn.Module):
    def __init__(self):
        super(TextGenerator, self).__init__()
        self.lstm = nn.LSTM(
            input_size = 1,
            hidden_size = 768,
            num_layers = 3,
            batch_first = True,
            bidirectional = False
        )
        self.dropout = nn.Dropout(p=0.2)
        self.linear = nn.Linear(in_features= 768, out_features= n_vocab)
        
    def forward(self, X):
        # X shape (batch_size, seq_len, input_size = 1)
        X, _ = self.lstm(X)
        # X shape  (batch_size, seq_len, D*hidden_size = 1*128)
        X = X[:,-1,:] # Selecting the last hidden state
        X = self.linear(self.dropout(X))
        return X

In [94]:
model = TextGenerator()

In [95]:
epochs = 20
learning_rate = 1e-3
optimizer = optim.Adam(model.parameters(), lr = learning_rate)
loss_function = nn.CrossEntropyLoss()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in tqdm(range(epochs)):
    model.train()
    epoch_loss = 0
    for batch_input, batch_target in train_dataloader:
        batch_input = batch_input.to(device)
        batch_target = batch_target.to(device)

        batch_model_output = model(batch_input)
        loss = loss_function(batch_model_output, batch_target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"epoch:- {epoch} train_loss:-{epoch_loss/len(train_dataloader)}")
    
model.eval()
with torch.no_grad():
    val_loss = 0
    for batch_input, batch_target in test_dataloader:
        batch_input = batch_input.to(device)
        batch_target = batch_target.to(device)

        batch_model_output = model(batch_input)
        val_loss += loss_function(batch_model_output, batch_target).item()
    print(f"val_loss:-{val_loss/len(test_dataloader)}")

  0%|          | 0/20 [00:00<?, ?it/s]

epoch:- 0 train_loss:-3.0323546756397595
epoch:- 1 train_loss:-2.9569779309359463
epoch:- 2 train_loss:-2.9328502958471123
epoch:- 3 train_loss:-2.8443816228346392
epoch:- 4 train_loss:-2.716508622602983
epoch:- 5 train_loss:-2.5712875626303933
epoch:- 6 train_loss:-2.4144168160178445
epoch:- 7 train_loss:-2.2581350716677577
epoch:- 8 train_loss:-2.145042480121959
epoch:- 9 train_loss:-2.0473135644739324
epoch:- 10 train_loss:-1.951199832829562
epoch:- 11 train_loss:-1.870663915980946
epoch:- 12 train_loss:-1.8148728717457165
epoch:- 13 train_loss:-1.7584070834246548
epoch:- 14 train_loss:-1.6863704551349987
epoch:- 15 train_loss:-1.6241088845513083
epoch:- 16 train_loss:-1.5920341903513129
epoch:- 17 train_loss:-1.5718367489901457
epoch:- 18 train_loss:-1.5079039898785678
epoch:- 19 train_loss:-1.4692152402617715
val_loss:-1.8977052313940865


In [96]:
def sample(preds, temperature = 1.0):
    # helper function to sample an index from a probability array
    exp_preds = torch.exp(torch.log(preds)/temperature)
    preds = exp_preds/ torch.sum(exp_preds)
    return torch.multinomial(input = preds, num_samples = 1).item()
def generate(prompt, model, n_chars_to_generate, temp):
    prompt = prompt[:seq_len]
    prompt_id_list = [token_to_id[token]/n_vocab for token in prompt]
    model.eval()
    with torch.no_grad():
        for i in range(n_chars_to_generate):
            predictions = model(torch.Tensor(prompt_id_list[i:i+seq_len]).reshape((1,seq_len,1)).to(device))
            predictions = nn.Softmax(dim = 1)(predictions)
            index = sample(predictions, temperature = temp)
            generated_char = id_to_token[index]
            prompt_id_list.append(index/n_vocab)
            prompt+= generated_char
    return prompt

In [103]:
prompt = "now that the day will come when my sight of this earth shall be lost, and life will take its leave in silence, drawing the last curtain over my eyes.".lower()
print(generate(prompt, model, 200, 1.0))

now that the day will come when my sight of this earth shall be lost, and life will take its leave io theeglsc
when my said and forest ihyough the dar when d have thi carth in thy nowpi tou seades, from.s, eare io care to batt ihe dimi whrre oer- mhlcs; iheye is bvaornt hou d amkgi this dayr
ehat it
