In [54]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
from torch.optim import Adam, SGD

import nltk
from nltk.corpus import stopwords
import string
 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hyarrava/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
### Data Loading

In [2]:
data = pd.read_csv("../data/news_summary_more.csv")
data.head(5)

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [3]:
data.describe()

Unnamed: 0,headlines,text
count,98401,98401
unique,98280,98360
top,Warne produced 'ball of century' with his 1st ...,Virender Sehwag was captaining India when he h...
freq,3,2


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98401 entries, 0 to 98400
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headlines  98401 non-null  object
 1   text       98401 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [7]:
data["headlines"].max(), data["headlines"].min()

("â\x82¹95 lakh prize offered for 'radical ideas' to solve weak UK growth",
 "  'Loveratri' is not demeaning towards any culture: Salman")

In [25]:
len("""("â\x82¹95 lakh prize offered for 'radical ideas' to solve weak UK growth",""")

72

In [8]:
def char_count(x):
    return len(x)

In [21]:
data["headlines_char_count"] = data["headlines"].apply(lambda x: char_count(x))
data["text_char_count"] = data["text"].apply(lambda x: char_count(x))

In [None]:
("â\x82¹95 lakh prize offered for 'radical ideas' to solve weak UK growth",

In [23]:
data["headlines_char_count"].max(), data["headlines_char_count"].min(), data["text_char_count"].max(), data["text_char_count"].min()

(79, 9, 450, 4)

In [26]:
data[data["text_char_count"]== data["text_char_count"].min()]

Unnamed: 0,headlines,text,headlines_char_count,text_char_count
52,headlines,text,9,4


In [29]:
data.shape

(98401, 4)

In [46]:
data["text"].iloc[75]

'Ousted Nissan Chairman Carlos Ghosn has said his arrest over alleged financial misconduct was led by "plot and treason" by the Japanese carmaker\'s executives who opposed its deeper integration with Renault and Mitsubishi. Ghosn added he had discussed the integration plans with Nissan\'s CEO in September, a month before his arrest. He further said he wouldn\'t flee if granted bail.'

### Punctuation removal, stop word removal, loweing

In [40]:
stop_words = set(stopwords.words('english'))


In [55]:
def remove_punct_stop_words(sentence):
    translator = str.maketrans('', '', string.punctuation)
    punct_sentence = sentence.translate(translator)
    
    clean_words = []
    for word in punct_sentence.split(' '):
        if word not in stop_words:
            clean_words.append(word.lower())

    return ' '.join([word for word in clean_words])

In [56]:
sample_sentence = """'Ousted Nissan Chairman Carlos Ghosn has said his arrest over alleged financial misconduct was led by "plot and treason" by the Japanese carmaker\'s executives who opposed its deeper integration with Renault and Mitsubishi. Ghosn added he had discussed the integration plans with Nissan\'s CEO in September, a month before his arrest. He further said he wouldn\'t flee if granted bail.'"""

clean_sentence = remove_punct_stop_words(sample_sentence)

In [57]:
clean_sentence

'ousted nissan chairman carlos ghosn said arrest alleged financial misconduct led plot treason japanese carmakers executives opposed deeper integration renault mitsubishi ghosn added discussed integration plans nissans ceo september month arrest he said wouldnt flee granted bail'

### Punctuation removal, stop word removal, loweing 
### may not be helpful in this case as they cutdown the fluency

## We wil be movng without them
## We will do tokenization and then Word embeddings


In [63]:
## Tokenization
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("slauw87/bart_summarisation")

tokenizer_config.json:   0%|          | 0.00/300 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



## Tokenization padding usin AutoTokenizer from Hugging face transformer model

In [135]:
headlines = data["headlines"].to_list()
text = data["text"].to_list()

In [136]:
def tokenize_words(text, tokenizer):
    
    tokens = tokenizer(text, padding = True, truncation = True,
                            return_tensors = "pt", max_length = 1024)
    return tokens["input_ids"], tokens["attention_mask"]

In [137]:
def create_tokens_attn_masks(text, tokenizer):
    tokens_list, attn_masks_list = [], []
    for line in text:
        input_ids , attention_masks = tokenize_words(line, tokenizer)
        tokens_list.append(input_ids)
        attn_masks_list.append(attention_masks)

    return tokens_list, attn_masks_list

In [138]:
headlines_tokens, hl_attn_masks = create_tokens_attn_masks(headlines, tokenizer)
text_tokens, text_attn_masks = create_tokens_attn_masks(text, tokenizer)

In [130]:
headlines_tokens, hl_attn_masks = [], []
for line in headlines:
    input_ids , attention_masks = tokenize_words(line, tokenizer)
    headlines_tokens.append(input_ids)
    hl_attn_masks.append(attention_masks)

In [131]:
text_tokens, text_attn_masks = []
for line in text:
    input_ids , attention_masks = tokenize_words(line, tokenizer)
    text_tokens.append(input_ids)
    text_attn_masks.append(attention_masks)