In [1]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
from torch.optim import Adam, SGD

import nltk
from nltk.corpus import stopwords
import string
 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hyarrava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
### Data Loading

In [3]:
data = pd.read_csv("../data/news_summary_more.csv")
data.head(5)

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [4]:
data.describe()

Unnamed: 0,headlines,text
count,98401,98401
unique,98280,98360
top,Warne produced 'ball of century' with his 1st ...,Virender Sehwag was captaining India when he h...
freq,3,2


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98401 entries, 0 to 98400
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headlines  98401 non-null  object
 1   text       98401 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [6]:
data["headlines"].max(), data["headlines"].min()

("â\x82¹95 lakh prize offered for 'radical ideas' to solve weak UK growth",
 "  'Loveratri' is not demeaning towards any culture: Salman")

In [7]:
len("""("â\x82¹95 lakh prize offered for 'radical ideas' to solve weak UK growth",""")

72

In [8]:
def char_count(x):
    return len(x)

In [9]:
data["headlines_char_count"] = data["headlines"].apply(lambda x: char_count(x))
data["text_char_count"] = data["text"].apply(lambda x: char_count(x))

In [10]:
data["headlines_char_count"].max(), data["headlines_char_count"].min(), data["text_char_count"].max(), data["text_char_count"].min()

(79, 9, 450, 4)

In [11]:
data[data["text_char_count"]== data["text_char_count"].min()]

Unnamed: 0,headlines,text,headlines_char_count,text_char_count
52,headlines,text,9,4


In [12]:
data.shape

(98401, 4)

In [13]:
data["text"].iloc[75]

'Ousted Nissan Chairman Carlos Ghosn has said his arrest over alleged financial misconduct was led by "plot and treason" by the Japanese carmaker\'s executives who opposed its deeper integration with Renault and Mitsubishi. Ghosn added he had discussed the integration plans with Nissan\'s CEO in September, a month before his arrest. He further said he wouldn\'t flee if granted bail.'

### Punctuation removal, stop word removal, loweing

In [14]:
stop_words = set(stopwords.words('english'))


In [15]:
def remove_punct_stop_words(sentence):
    translator = str.maketrans('', '', string.punctuation)
    punct_sentence = sentence.translate(translator)
    
    clean_words = []
    for word in punct_sentence.split(' '):
        if word not in stop_words:
            clean_words.append(word.lower())

    return ' '.join([word for word in clean_words])

In [16]:
sample_sentence = """'Ousted Nissan Chairman Carlos Ghosn has said his arrest over alleged financial misconduct was led by "plot and treason" by the Japanese carmaker\'s executives who opposed its deeper integration with Renault and Mitsubishi. Ghosn added he had discussed the integration plans with Nissan\'s CEO in September, a month before his arrest. He further said he wouldn\'t flee if granted bail.'"""

clean_sentence = remove_punct_stop_words(sample_sentence)

In [17]:
clean_sentence

'ousted nissan chairman carlos ghosn said arrest alleged financial misconduct led plot treason japanese carmakers executives opposed deeper integration renault mitsubishi ghosn added discussed integration plans nissans ceo september month arrest he said wouldnt flee granted bail'

### Punctuation removal, stop word removal, loweing 
### may not be helpful in this case as they cutdown the fluency

## We wil be movng without them
## We will do tokenization and then Word embeddings


In [None]:
## Tokenization
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("slauw87/bart_summarisation")

## Tokenization padding usin AutoTokenizer from Hugging face transformer model

headlines = data["headlines"].to_list()
text = data["text"].to_list()

def tokenize_words(text, tokenizer):
    
    tokens = tokenizer(text, padding = True, truncation = True,
                            return_tensors = "pt", max_length = 1024)
    return tokens["input_ids"], tokens["attention_mask"]

def create_tokens_attn_masks(text, tokenizer):
    tokens_list, attn_masks_list = [], []
    for line in text:
        input_ids , attention_masks = tokenize_words(line, tokenizer)
        tokens_list.append(input_ids)
        attn_masks_list.append(attention_masks)

    return tokens_list, attn_masks_list

headlines_tokens, hl_attn_masks = create_tokens_attn_masks(headlines, tokenizer)
text_tokens, text_attn_masks = create_tokens_attn_masks(text, tokenizer)

headlines_tokens, hl_attn_masks = [], []
for line in headlines:
    input_ids , attention_masks = tokenize_words(line, tokenizer)
    headlines_tokens.append(input_ids)
    hl_attn_masks.append(attention_masks)

text_tokens, text_attn_masks = [] , []
for line in text:
    input_ids , attention_masks = tokenize_words(line, tokenizer)
    text_tokens.append(input_ids)
    text_attn_masks.append(attention_masks)

len(text_tokens)

text_tokens[0].shape, text_tokens[20].shape, text_tokens[103].shape





In [31]:
### Creating the Summarize text class
from torch.utils.data import Dataset, DataLoader
class Summarize_text(Dataset):
    def __init__(self, txt, summary, tokenizer, max_length = 4):
        super().__init__()

        self.txt = txt
        self.summary = summary
        self.max_length = max_length

        self.tokens_list, self.attn_masks_list = self.tokenize_words(self.txt)
        self.smr_tokens_list, self.smr_attn_masks_list = self.tokenize_words(self.summary)


    def tokenize_words(self, text):
        tokens_list, attn_masks_list = [], []
        for line in text:
            tokens = tokenizer(line, padding = 'max_length', truncation = True,
                                return_tensors = "pt", max_length = 4)
            input_ids , attention_masks = tokens["input_ids"], tokens["attention_mask"]
            tokens_list.append(input_ids.squeeze(0))
            attn_masks_list.append(attention_masks.squeeze(0))
        return tokens_list,  attn_masks_list
        

    def __len__(self):
        return len(self.token_list)

    def __getitem__(self, idx):
        return {
            "input_ids" : self.tokens_list[idx],
            "attention_mask" : self.attn_masks_list[idx],
            "labels" : self.smr_tokens_list[idx],
            "labels_attention_mask" : self.smr_attn_masks_list[idx]
        }

In [26]:
def create_dataloader_V1(text, headlines, max_length = 4, batch_size = 4, stride =4, num_workers = 4):
    dataset = Summarize_text(text,headlines,tokenizer, max_length = 4)
    dataloader= DataLoader(dataset,
    batch_size = batch_size,
    shuffle = True,
    drop_last = True,
    num_workers = num_workers
                )
    return dataloader

In [27]:
headlines = data["headlines"].to_list()
text = data["text"].to_list()

## Tokenization
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("slauw87/bart_summarisation")
dataloader = create_dataloader_V1(text, headlines, max_length = 4, batch_size = 4, stride =4, num_workers = 4)

In [28]:
data_iter = iter(dataloader)
firstbatch= next(data_iter)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [37]:
firstbatch

{'input_ids': tensor([[    0, 42332,    41,     2],
         [    0,   133,  5729,     2],
         [    0, 36361,  5471,     2],
         [    0,   250,   569,     2]]),
 'attention_mask': tensor([[1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1]]),
 'labels': tensor([[    0,   176,     6,     2],
         [    0,   448, 26772,     2],
         [    0,   863,  2678,     2],
         [    0, 12302,   661,     2]]),
 'labels_attention_mask': tensor([[1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1]])}

In [36]:
embedding_layer(torch.tensor([3, 2,4,5,6,12,4]))

tensor([[ 1.4104e+00, -7.8955e-01,  3.9243e-01,  1.6079e+00,  2.1909e+00,
         -3.0261e-01,  4.3356e-01,  2.7988e-01,  3.4332e-01, -3.5139e+00],
        [ 3.6605e-01,  1.7165e-01,  5.4576e-01, -1.4111e-01, -1.1584e+00,
          5.4027e-02,  2.9665e+00,  1.3688e-01,  4.9501e-01,  1.5520e+00],
        [ 2.0591e-01, -5.3120e-02,  1.2891e-01,  4.1605e-01, -4.1656e-02,
          2.8797e-01,  1.6189e+00,  1.0940e+00,  5.5585e-01, -4.2801e-01],
        [-4.2894e-01,  1.9008e+00, -9.8795e-01, -1.0188e+00,  9.3724e-01,
         -2.7142e+00, -8.4535e-01,  7.1753e-01,  2.3240e-02,  4.9383e-01],
        [ 2.0435e-03,  9.0907e-01, -1.9496e+00, -1.5696e+00, -4.7788e-01,
         -8.8043e-02,  8.6036e-01, -6.0668e-01, -6.7618e-01,  1.3413e+00],
        [ 8.6062e-01,  5.7633e-01,  6.3997e-01, -8.3114e-01, -2.2186e+00,
          6.0251e-01,  2.3032e-01,  1.0184e+00, -1.1668e+00,  3.7657e-02],
        [ 2.0591e-01, -5.3120e-02,  1.2891e-01,  4.1605e-01, -4.1656e-02,
          2.8797e-01,  1.6189e+0

In [None]:
sample_text = text[0] + text[1]

In [None]:
sample_text

In [None]:
!pip install tiktoken 

In [None]:
from importlib.metadata import version
import tiktoken
print("tiktoken version", version("tiktoken"))

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")
integers = tokenizer.encode(sample_text, allowed_special = {"<|endoftext>"})
print(integers)
                

In [None]:
strings = tokenizer.decode(integers)

In [None]:
print(strings)

In [None]:
## Data set preparation
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        super().__init__()

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids)-max_length,stride): ## Moving the words with stride lenght
            input_chunk = token_ids[i : i+max_length]
            self.input_ids.append(torch.tensor(input_chunk))

            target_chunk = token_ids[i+1 : i+max_length+1] ## predicting the next word
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
            
            
                                  

In [None]:
### Creating DataLoaders

def create_dataloader_v1(txt, batch_size =4, max_length = 256, 
                           stride = 128, shuffle = True, drop_last = True, 
                           num_workers = 0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = drop_last,
        num_workers = num_workers
    )
    return dataloader

In [None]:
dataloader = create_dataloader_v1(sample_text, batch_size = 8, max_length =4, 
                                  stride =4, shuffle = False)

dataiter = iter(dataloader)
first_batch = next(dataiter)

In [None]:
input_tokens = tokenizer.encode(sample_text)
print(input_tokens)

In [None]:
len([50, 2899, 615, 29576, 11, 281, 435, 4182, 385, 286, 510, 42731, 290, 2873, 2043, 12, 33, 338, 23842, 6118, 287, 10850, 4673, 290, 35941, 9345, 11, 373, 257, 21714, 11998, 23164, 379, 
     4806, 418, 893, 351, 2048, 642, 812, 286, 670, 1998, 13, 383, 1430, 290, 510, 42731, 338, 11470, 12, 16863, 3451, 1104, 4193, 683, 6801, 284, 257, 6060, 33374, 379, 9634])

In [None]:
first_batch

In [None]:
second_batch = next(dataiter)
print(second_batch)

In [None]:
first_batch
## One batch contains batch-size samples each of lenght = max_length 
## that gives total batch_size*max_length items in a batch
## Stride 

In [None]:
second_batch = next(dataiter)
print(second_batch)