In [1]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
from torch.optim import Adam, SGD

import nltk
from nltk.corpus import stopwords
import string
 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hyarrava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
### Data Loading

In [3]:
data = pd.read_csv("../data/news_summary_more.csv")
data.head(5)

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [4]:
data.describe()

Unnamed: 0,headlines,text
count,98401,98401
unique,98280,98360
top,Warne produced 'ball of century' with his 1st ...,Virender Sehwag was captaining India when he h...
freq,3,2


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98401 entries, 0 to 98400
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headlines  98401 non-null  object
 1   text       98401 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [6]:
data = data.iloc[:1000]

In [7]:
data.shape

(1000, 2)

### Punctuation removal, stop word removal, loweing

In [8]:
stop_words = set(stopwords.words('english'))


In [9]:
def remove_punct_stop_words(sentence):
    translator = str.maketrans('', '', string.punctuation)
    punct_sentence = sentence.translate(translator)
    
    clean_words = []
    for word in punct_sentence.split(' '):
        if word not in stop_words:
            clean_words.append(word.lower())

    return ' '.join([word for word in clean_words])

In [10]:
sample_sentence = """'Ousted Nissan Chairman Carlos Ghosn has said his arrest over alleged financial misconduct was led by "plot and treason" by the Japanese carmaker\'s executives who opposed its deeper integration with Renault and Mitsubishi. Ghosn added he had discussed the integration plans with Nissan\'s CEO in September, a month before his arrest. He further said he wouldn\'t flee if granted bail.'"""

clean_sentence = remove_punct_stop_words(sample_sentence)

### Punctuation removal, stop word removal, loweing 
### may not be helpful in this case as they cutdown the fluency

## We wil be movng without them
## We will do tokenization and then Word embeddings


In [11]:
### Creating the Summarize text class
from torch.utils.data import Dataset, DataLoader
class Summarize_text(Dataset):
    def __init__(self, txt, summary, tokenizer, max_length = 512):
        super().__init__()

        self.txt = txt
        self.summary = summary
        self.max_length = max_length

        self.tokens_list, self.attn_masks_list = self.tokenize_words(self.txt)
        self.smr_tokens_list, self.smr_attn_masks_list = self.tokenize_words(self.summary)


    def tokenize_words(self, text):
        tokens_list, attn_masks_list = [], []
        for line in text:
            tokens = tokenizer(line, padding = 'max_length', truncation = True,
                                return_tensors = "pt", max_length = self.max_length)
            input_ids , attention_masks = tokens["input_ids"], tokens["attention_mask"]
            tokens_list.append(input_ids.squeeze(0))
            attn_masks_list.append(attention_masks.squeeze(0))
        return tokens_list,  attn_masks_list
        

    def __len__(self):
        return len(self.tokens_list)

    def __getitem__(self, idx):
        return {
            "input_ids" : self.tokens_list[idx],
            "attention_mask" : self.attn_masks_list[idx],
            "labels" : self.smr_tokens_list[idx],
            "labels_attention_mask" : self.smr_attn_masks_list[idx]
        }

In [12]:
def create_dataloader_V1(text, headlines, max_length = 512, batch_size = 4, stride =4, num_workers = 4):
    dataset = Summarize_text(text,headlines,tokenizer, max_length = max_length)
    dataloader= DataLoader(dataset,
    batch_size = batch_size,
    shuffle = True,
    drop_last = True,
    num_workers = num_workers
                )
    return dataloader

headlines = data["headlines"].to_list()
text = data["text"].to_list()

## Tokenization
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("slauw87/bart_summarisation")
dataloader = create_dataloader_V1(text, headlines, max_length = 512, batch_size = 4, stride =4, num_workers = 4)



In [13]:
### Each batch of size (4,4,512)
### 4 samples in each batch
### Each sample consists of 512 dimensional vector

In [14]:
## Create the model Architecture
### Encoder
### Decoder

In [15]:
class Encoder(nn.Module):
    def __init__(self, vocab_size,emb_dim, hidden_dim, num_layers):
        super(Encoder, self).__init__()

        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.emb_dim = emb_dim 

        self.embedding_layer = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx = tokenizer.pad_token_id)
        self.lstm_layers = nn.LSTM(self.emb_dim, self.hidden_dim, self.num_layers, 
                                   dropout= 0.2, bidirectional = False, batch_first = True)
        self.dropout_layer = nn.Dropout(0.2)

    def forward(self, x):

        out = self.embedding_layer(x)
        lstm_output, (hidden_state, cell_state) = self.lstm_layers(out)
        out = self.dropout_layer(lstm_output)
        return out, (hidden_state, cell_state)

In [16]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers):
        super(Decoder, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        
        self.embedding_layer = nn.Embedding(self.vocab_size, self.emb_dim, padding_idx=tokenizer.pad_token_id)
        self.lstm_layer = nn.LSTM(self.emb_dim, self.hidden_dim, num_layers=self.num_layers,
                                  bidirectional=False, batch_first=True, dropout=0.1)  # Set bidirectional=False
        self.dropout_layer = nn.Dropout(0.2)
        self.fc = nn.Linear(self.hidden_dim, self.vocab_size)  # Changed to hidden_dim

    def forward(self, x, hidden_state, cell_state):
        out = self.embedding_layer(x)
        lstm_out, (hidden_state, cell_state) = self.lstm_layer(out, (hidden_state, cell_state))
        out = self.dropout_layer(lstm_out)
        out = self.fc(out)

        return out, hidden_state, cell_state  # Return hidden and cell states


In [17]:
import torch
import torch.optim as optim
import torch.nn as nn

# Assuming necessary imports and data preparation are done
num_epochs = 10
learning_rate = 0.001
vocab_size = tokenizer.vocab_size
batch_size = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize models
encoder = Encoder(vocab_size=vocab_size, emb_dim=512, hidden_dim=64, num_layers=4).to(device)
decoder = Decoder(vocab_size=vocab_size, emb_dim=512, hidden_dim=64, num_layers=4).to(device)

# Optimizer and loss function
optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Training loop
for epoch in range(num_epochs):
    encoder.train()
    decoder.train()
    for i, batch in enumerate(dataloader):
        
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        # Encode the input
        encoder_out, (encoder_hidden, encoder_cell) = encoder(input_ids)

        # Initialize decoder input (start with <BOS> token)
        decoder_input = labels[:, 0].unsqueeze(1)  # Start with the first token of the labels
        decoder_hidden = encoder_hidden  # Initialize with encoder hidden state
        decoder_cell = encoder_cell  # Initialize with encoder cell state

        # Initialize tensor for decoder outputs
        decoder_outputs = torch.zeros(labels.size(0), labels.size(1) - 1, vocab_size).to(device)
        loss = 0

        for t in range(1, labels.size(1)):  # Start from the second token
            output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)

            # Store the output
            decoder_outputs[:, t - 1, :] = output.squeeze(1)

            # Calculate loss
            loss += criterion(output.view(-1, vocab_size), labels[:, t].view(-1))

            # Prepare next input for the decoder (teacher forcing)
            decoder_input = labels[:, t].unsqueeze(1)  # Use true label for next input

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Print progress
        if (i + 1) % 10 == 0:  # Print every 10 batches
            print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(dataloader)}], Loss: {loss.item():.4f}')

# Save the trained model
torch.save({
    'encoder_state_dict': encoder.state_dict(),
    'decoder_state_dict': decoder.state_dict(),
}, 'encoder_decoder_model.pth')

print("Model training complete and saved.")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [1/10], Step [10/250], Loss: 5288.4595
Epoch [1/10], Step [20/250], Loss: 4077.8938
Epoch [1/10], Step [30/250], Loss: 2872.7795
Epoch [1/10], Step [40/250], Loss: 1843.0858
Epoch [1/10], Step [50/250], Loss: 1036.4354
Epoch [1/10], Step [60/250], Loss: 534.9172
Epoch [1/10], Step [70/250], Loss: 320.6650
Epoch [1/10], Step [80/250], Loss: 247.9296
Epoch [1/10], Step [90/250], Loss: 211.5475
Epoch [1/10], Step [100/250], Loss: 242.7027
Epoch [1/10], Step [110/250], Loss: 201.6288
Epoch [1/10], Step [120/250], Loss: 190.0233
Epoch [1/10], Step [130/250], Loss: 167.1587
Epoch [1/10], Step [140/250], Loss: 154.6165
Epoch [1/10], Step [150/250], Loss: 174.9448
Epoch [1/10], Step [160/250], Loss: 162.2901
Epoch [1/10], Step [170/250], Loss: 167.7577
Epoch [1/10], Step [180/250], Loss: 206.8517
Epoch [1/10], Step [190/250], Loss: 175.6917
Epoch [1/10], Step [200/250], Loss: 147.1778
Epoch [1/10], Step [210/250], Loss: 161.6917
Epoch [1/10], Step [220/250], Loss: 185.5316
Epoch [1/10], 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [2/10], Step [10/250], Loss: 136.4793
Epoch [2/10], Step [20/250], Loss: 154.1091
Epoch [2/10], Step [30/250], Loss: 148.3020
Epoch [2/10], Step [40/250], Loss: 142.3313
Epoch [2/10], Step [50/250], Loss: 124.2923
Epoch [2/10], Step [60/250], Loss: 153.4417
Epoch [2/10], Step [70/250], Loss: 134.6255
Epoch [2/10], Step [80/250], Loss: 143.9094
Epoch [2/10], Step [90/250], Loss: 119.9430
Epoch [2/10], Step [100/250], Loss: 142.2036
Epoch [2/10], Step [110/250], Loss: 148.8927
Epoch [2/10], Step [120/250], Loss: 123.2168
Epoch [2/10], Step [130/250], Loss: 128.0288
Epoch [2/10], Step [140/250], Loss: 124.3980
Epoch [2/10], Step [150/250], Loss: 116.6490
Epoch [2/10], Step [160/250], Loss: 133.0189
Epoch [2/10], Step [170/250], Loss: 147.9809
Epoch [2/10], Step [180/250], Loss: 124.1066
Epoch [2/10], Step [190/250], Loss: 123.1701
Epoch [2/10], Step [200/250], Loss: 133.4922
Epoch [2/10], Step [210/250], Loss: 138.8601
Epoch [2/10], Step [220/250], Loss: 118.2033
Epoch [2/10], Step 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [3/10], Step [10/250], Loss: 137.9628
Epoch [3/10], Step [20/250], Loss: 112.1264
Epoch [3/10], Step [30/250], Loss: 129.7979
Epoch [3/10], Step [40/250], Loss: 127.1412
Epoch [3/10], Step [50/250], Loss: 127.3604
Epoch [3/10], Step [60/250], Loss: 123.8983
Epoch [3/10], Step [70/250], Loss: 116.8362
Epoch [3/10], Step [80/250], Loss: 100.2570
Epoch [3/10], Step [90/250], Loss: 125.1436
Epoch [3/10], Step [100/250], Loss: 120.1727
Epoch [3/10], Step [110/250], Loss: 126.6572
Epoch [3/10], Step [120/250], Loss: 135.2001
Epoch [3/10], Step [130/250], Loss: 130.3113
Epoch [3/10], Step [140/250], Loss: 116.8130
Epoch [3/10], Step [150/250], Loss: 118.1501
Epoch [3/10], Step [160/250], Loss: 137.3941
Epoch [3/10], Step [170/250], Loss: 111.2843
Epoch [3/10], Step [180/250], Loss: 113.9249
Epoch [3/10], Step [190/250], Loss: 108.0497
Epoch [3/10], Step [200/250], Loss: 131.1588
Epoch [3/10], Step [210/250], Loss: 114.5397
Epoch [3/10], Step [220/250], Loss: 112.5743
Epoch [3/10], Step 

TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch [3/10], Step [250/250], Loss: 122.6408


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [4/10], Step [10/250], Loss: 117.2511
Epoch [4/10], Step [20/250], Loss: 100.6183
Epoch [4/10], Step [30/250], Loss: 127.9222
Epoch [4/10], Step [40/250], Loss: 126.8414
Epoch [4/10], Step [50/250], Loss: 111.9195
Epoch [4/10], Step [60/250], Loss: 127.7744
Epoch [4/10], Step [70/250], Loss: 122.5267
Epoch [4/10], Step [80/250], Loss: 133.7843
Epoch [4/10], Step [90/250], Loss: 124.8258
Epoch [4/10], Step [100/250], Loss: 120.7554
Epoch [4/10], Step [110/250], Loss: 120.1002
Epoch [4/10], Step [120/250], Loss: 113.7559
Epoch [4/10], Step [130/250], Loss: 131.4079
Epoch [4/10], Step [140/250], Loss: 130.0333
Epoch [4/10], Step [150/250], Loss: 106.6927
Epoch [4/10], Step [160/250], Loss: 97.5887
Epoch [4/10], Step [170/250], Loss: 106.8365
Epoch [4/10], Step [180/250], Loss: 93.7810
Epoch [4/10], Step [190/250], Loss: 108.3759
Epoch [4/10], Step [200/250], Loss: 118.7685
Epoch [4/10], Step [210/250], Loss: 123.4406
Epoch [4/10], Step [220/250], Loss: 126.4320
Epoch [4/10], Step [2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [5/10], Step [10/250], Loss: 126.0642
Epoch [5/10], Step [20/250], Loss: 116.2042
Epoch [5/10], Step [30/250], Loss: 118.7453
Epoch [5/10], Step [40/250], Loss: 108.4187
Epoch [5/10], Step [50/250], Loss: 103.8763
Epoch [5/10], Step [60/250], Loss: 114.6579
Epoch [5/10], Step [70/250], Loss: 130.2008
Epoch [5/10], Step [80/250], Loss: 100.7407
Epoch [5/10], Step [90/250], Loss: 121.6217
Epoch [5/10], Step [100/250], Loss: 129.5717
Epoch [5/10], Step [110/250], Loss: 103.3908
Epoch [5/10], Step [120/250], Loss: 125.3384
Epoch [5/10], Step [130/250], Loss: 145.4140
Epoch [5/10], Step [140/250], Loss: 123.2391
Epoch [5/10], Step [150/250], Loss: 111.9265
Epoch [5/10], Step [160/250], Loss: 107.1448
Epoch [5/10], Step [170/250], Loss: 116.6856
Epoch [5/10], Step [180/250], Loss: 103.7813
Epoch [5/10], Step [190/250], Loss: 104.6075
Epoch [5/10], Step [200/250], Loss: 117.0537
Epoch [5/10], Step [210/250], Loss: 109.5249
Epoch [5/10], Step [220/250], Loss: 131.6974
Epoch [5/10], Step 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [6/10], Step [10/250], Loss: 102.9738
Epoch [6/10], Step [20/250], Loss: 120.6307
Epoch [6/10], Step [30/250], Loss: 103.4153
Epoch [6/10], Step [40/250], Loss: 101.6525
Epoch [6/10], Step [50/250], Loss: 115.7540
Epoch [6/10], Step [60/250], Loss: 116.5428
Epoch [6/10], Step [70/250], Loss: 133.5904
Epoch [6/10], Step [80/250], Loss: 133.1753
Epoch [6/10], Step [90/250], Loss: 112.0092
Epoch [6/10], Step [100/250], Loss: 116.1924
Epoch [6/10], Step [110/250], Loss: 120.6336
Epoch [6/10], Step [120/250], Loss: 108.5417
Epoch [6/10], Step [130/250], Loss: 114.5824
Epoch [6/10], Step [140/250], Loss: 106.0465
Epoch [6/10], Step [150/250], Loss: 107.0128
Epoch [6/10], Step [160/250], Loss: 104.5076
Epoch [6/10], Step [170/250], Loss: 117.6576
Epoch [6/10], Step [180/250], Loss: 123.2336
Epoch [6/10], Step [190/250], Loss: 117.2726
Epoch [6/10], Step [200/250], Loss: 112.1479
Epoch [6/10], Step [210/250], Loss: 126.4739
Epoch [6/10], Step [220/250], Loss: 113.6585
Epoch [6/10], Step 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [7/10], Step [10/250], Loss: 108.1268
Epoch [7/10], Step [20/250], Loss: 128.9407
Epoch [7/10], Step [30/250], Loss: 106.4290
Epoch [7/10], Step [40/250], Loss: 104.4338
Epoch [7/10], Step [50/250], Loss: 107.7215
Epoch [7/10], Step [60/250], Loss: 111.8962
Epoch [7/10], Step [70/250], Loss: 96.1509
Epoch [7/10], Step [80/250], Loss: 100.4778
Epoch [7/10], Step [90/250], Loss: 117.1059
Epoch [7/10], Step [100/250], Loss: 114.1706
Epoch [7/10], Step [110/250], Loss: 100.6361
Epoch [7/10], Step [120/250], Loss: 109.3726
Epoch [7/10], Step [130/250], Loss: 109.1886
Epoch [7/10], Step [140/250], Loss: 128.1091
Epoch [7/10], Step [150/250], Loss: 107.3138
Epoch [7/10], Step [160/250], Loss: 99.7722
Epoch [7/10], Step [170/250], Loss: 121.5961
Epoch [7/10], Step [180/250], Loss: 111.7435
Epoch [7/10], Step [190/250], Loss: 102.3688
Epoch [7/10], Step [200/250], Loss: 128.9720
Epoch [7/10], Step [210/250], Loss: 114.6114
Epoch [7/10], Step [220/250], Loss: 123.8648
Epoch [7/10], Step [2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [8/10], Step [10/250], Loss: 122.3809
Epoch [8/10], Step [20/250], Loss: 99.9339
Epoch [8/10], Step [30/250], Loss: 89.9272
Epoch [8/10], Step [40/250], Loss: 106.7215
Epoch [8/10], Step [50/250], Loss: 117.0284
Epoch [8/10], Step [60/250], Loss: 110.2725
Epoch [8/10], Step [70/250], Loss: 117.2591
Epoch [8/10], Step [80/250], Loss: 111.2425
Epoch [8/10], Step [90/250], Loss: 100.3124
Epoch [8/10], Step [100/250], Loss: 110.5373
Epoch [8/10], Step [110/250], Loss: 103.1211
Epoch [8/10], Step [120/250], Loss: 99.6969
Epoch [8/10], Step [130/250], Loss: 134.3621
Epoch [8/10], Step [140/250], Loss: 111.7913
Epoch [8/10], Step [150/250], Loss: 115.6362
Epoch [8/10], Step [160/250], Loss: 105.3374
Epoch [8/10], Step [170/250], Loss: 95.1773
Epoch [8/10], Step [180/250], Loss: 100.6977
Epoch [8/10], Step [190/250], Loss: 115.4196
Epoch [8/10], Step [200/250], Loss: 132.9806
Epoch [8/10], Step [210/250], Loss: 119.4558
Epoch [8/10], Step [220/250], Loss: 104.8916
Epoch [8/10], Step [230

TOKENIZERS_PARALLELISM=(true | false)


Epoch [8/10], Step [250/250], Loss: 108.3817


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [9/10], Step [10/250], Loss: 101.1842
Epoch [9/10], Step [20/250], Loss: 104.3108
Epoch [9/10], Step [30/250], Loss: 119.3657
Epoch [9/10], Step [40/250], Loss: 112.9912
Epoch [9/10], Step [50/250], Loss: 112.8099
Epoch [9/10], Step [60/250], Loss: 81.9102
Epoch [9/10], Step [70/250], Loss: 122.0830
Epoch [9/10], Step [80/250], Loss: 97.2430
Epoch [9/10], Step [90/250], Loss: 105.3666
Epoch [9/10], Step [100/250], Loss: 104.0759
Epoch [9/10], Step [110/250], Loss: 105.2614
Epoch [9/10], Step [120/250], Loss: 123.0393
Epoch [9/10], Step [130/250], Loss: 112.6892
Epoch [9/10], Step [140/250], Loss: 114.1610
Epoch [9/10], Step [150/250], Loss: 86.4679
Epoch [9/10], Step [160/250], Loss: 100.5414
Epoch [9/10], Step [170/250], Loss: 116.5331
Epoch [9/10], Step [180/250], Loss: 104.5601
Epoch [9/10], Step [190/250], Loss: 93.5961
Epoch [9/10], Step [200/250], Loss: 104.7480
Epoch [9/10], Step [210/250], Loss: 107.0865
Epoch [9/10], Step [220/250], Loss: 94.1099
Epoch [9/10], Step [230/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch [10/10], Step [10/250], Loss: 110.2848
Epoch [10/10], Step [20/250], Loss: 92.2484
Epoch [10/10], Step [30/250], Loss: 103.7044
Epoch [10/10], Step [40/250], Loss: 124.2909
Epoch [10/10], Step [50/250], Loss: 113.7615
Epoch [10/10], Step [60/250], Loss: 84.6242
Epoch [10/10], Step [70/250], Loss: 121.9430
Epoch [10/10], Step [80/250], Loss: 106.9101
Epoch [10/10], Step [90/250], Loss: 102.3693
Epoch [10/10], Step [100/250], Loss: 91.8315
Epoch [10/10], Step [110/250], Loss: 97.2724
Epoch [10/10], Step [120/250], Loss: 113.0755
Epoch [10/10], Step [130/250], Loss: 112.4084
Epoch [10/10], Step [140/250], Loss: 106.8098
Epoch [10/10], Step [150/250], Loss: 89.9621
Epoch [10/10], Step [160/250], Loss: 104.1515
Epoch [10/10], Step [170/250], Loss: 100.5593
Epoch [10/10], Step [180/250], Loss: 106.0526
Epoch [10/10], Step [190/250], Loss: 115.6786
Epoch [10/10], Step [200/250], Loss: 93.4203
Epoch [10/10], Step [210/250], Loss: 116.5991
Epoch [10/10], Step [220/250], Loss: 116.8332
Epo

In [18]:
def tokenize_input(input_text, tokenizer):
    # Tokenize and convert to tensor
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
    return inputs["input_ids"]


In [19]:
def encode_input(input_ids, encoder, device):
    input_ids = input_ids.to(device)
    encoder_out, encoder_hidden = encoder(input_ids)
    return encoder_out, encoder_hidden


In [20]:
def generate_summary(input_ids, encoder, decoder, device, max_length=50):
    encoder_out, (encoder_hidden, encoder_cell) = encode_input(input_ids, encoder, device)

    # Start decoding
    decoder_input = torch.tensor([[tokenizer.bos_token_id]]).to(device)  # BOS token
    decoder_hidden = encoder_hidden
    decoder_cell = encoder_cell
    summary = []

    for _ in range(max_length):
        output, decoder_hidden, decoder_cell = decoder(decoder_input, decoder_hidden, decoder_cell)
        next_token = output.argmax(2)  # Get the index of the highest probability token
        summary.append(next_token.item())

        # Stop if the EOS token is generated
        if next_token.item() == tokenizer.eos_token_id:
            break
        
        decoder_input = next_token  # Use the predicted token as next input

    return summary


In [21]:
def decode_summary(summary, tokenizer):
    return tokenizer.decode(summary, skip_special_tokens=True)


In [22]:
def predict_summary(input_text, encoder, decoder, tokenizer, device):
    input_ids = tokenize_input(input_text, tokenizer)
    summary_ids = generate_summary(input_ids, encoder, decoder, device)
    summary = decode_summary(summary_ids, tokenizer)
    return summary


In [27]:
# Load your trained models
checkpoint = torch.load('encoder_decoder_model.pth')
encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])

# Set to evaluation mode
encoder.eval()
decoder.eval()

# Input text to summarize
input_text = """Then, is there any method to get the total number of iteration for the "for loop"?

In my NLP problem, the total number of iteration is different from int(n_train_samples/batch_size)...

For example, if I truncate train data only 10,000 samples and set the batch size as 1024, then 363 iteration occurs in my NLP problem.

I wonder how to get the number of total iteration in "the for-loop"."""

# Get the summary
summary = predict_summary(input_text, encoder, decoder, tokenizer, device)
print("Summary:", summary)


Summary: I-- to to to to:::


  checkpoint = torch.load('encoder_decoder_model.pth')


In [25]:
summary

''