### Goal: Summaraize text and predict what category it is out of business, entertainemnt, politics, sport and tech

In [73]:
import os
import re
import pandas as pd
from pathlib import Path

import nltk

import torch
from torch.utils.data import Dataset, DataLoader

In [74]:
data_path = Path(r"C:\Users\bamilosin\Documents\dataset\nlp\summarization\BBC News Summary")
articles_path = data_path / "News Articles"
summaries_path = data_path / "Summaries"

expected df pattern

 {

     "text": ["a ridiculously long text", "another ridiculously long text"],

     "summary" : ["long text", "another long text"],

     "category" : ["normal", "normal"]

 }

In [76]:
# load data from .txt's across all categories for articles and thier summaries
categories = ["business", "entertainment", "politics", "sport", "tech"]

articles  = []
label = []
summaries = []

for category in categories:
    for file in os.listdir(articles_path / category):
        with open(articles_path / category / file) as f:
            file_ = f.read()
            articles.append(file_)
            label.append(category)

        with open(summaries_path / category / file) as f:
            file_ = f.read()
            summaries.append(file_)


data_dict = {
    "articles" : articles,
    "summaries" : summaries,
    "labels" : label
}

data = pd.DataFrame(data_dict)
data

Unnamed: 0,articles,summaries,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business
...,...,...,...
2220,BT program to beat dialler scams\n\nBT is intr...,BT is introducing two initiatives to help beat...,tech
2221,Spam e-mails tempt net shoppers\n\nComputer us...,A third of them read unsolicited junk e-mail a...,tech
2222,Be careful how you code\n\nA new European dire...,This goes to the heart of the European project...,tech
2223,US cyber security chief resigns\n\nThe man mak...,Amit Yoran was director of the National Cyber ...,tech


In [77]:
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, stopwords):
        super(NewsDataset, self).__init__()

        self.data = data

        self.data['cleaned_articles'] = self.data['articles'].apply(preprocess)
        self.data['cleaned_summaries'] = self.data['summaries'].apply(preprocess)

        self.data['cleaned_articles_tokens'] = self.data['cleaned_articles'].apply(lambda x: tokenize_text(x, tokenizer, stopwords))
        self.data['cleaned_summaries_tokens'] = self.data['cleaned_summaries'].apply(lambda x: tokenize_text(x, tokenizer, stopwords))

        # get longest article
        self.longest_article = self.get_longest_seq(list(self.data['cleaned_articles_tokens'].values))
        # get longest summary
        self.longest_summary = self.get_longest_seq(list(self.data['cleaned_summaries_tokens'].values))

        # article tokens list
        self.all_tokens_articles =  list(self.data['cleaned_articles_tokens'].values)
        # summaries tokens list
        self.all_tokens_summaries =  list(self.data['cleaned_summaries_tokens'].values)
        # all tokens in articles and summaries
        self.all_tokens = all_tokens_articles + all_tokens_summaries
        # list of unique tokens
        self.all_tokens = sorted(list(set([token for tokens_list in self.all_tokens for token in tokens_list])))

        # get vocab
        self.vocab = create_vocab(self.all_tokens)

        # pad article tokens
        self.data['padded_articles_tokens'] = self.data['cleaned_articles_tokens'].apply(lambda x: self.pad_tokens(x, self.longest_article))
        # pad summary tokens
        self.data['padded_summaries_tokens'] = self.data['cleaned_summaries_tokens'].apply(lambda x: self.pad_tokens(x, self.longest_summary))

        # get article indices
        self.data["articles_indices"] = self.data["padded_articles_tokens"].apply(lambda x : self.get_indices(x, self.vocab))
        # # get summary tokens
        self.data["summaries_indices"] = self.data["padded_summaries_tokens"].apply(lambda x : self.get_indices(x, self.vocab))

        self.label_map = {
            "business": 0, "entertainment" : 1, "politics" : 2, "sport" : 3, "tech" : 4
        }

        self.data['label_idxs'] = self.data['labels'].apply(lambda x : self.get_label_indices(x, self.label_map))


    def preprocess(self, text):
        lower = text.lower()
        text = re.sub(r'!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~', '', lower)
        # add sos and eos tokens
        text = '<SOS>' + text + '<EOS>'

        return text


    def tokenize_text(self, text, tokenizer, stopwords):
        tokens = tokenizer.tokenize(text)
        tokens = [token for token in tokens if token not in stopwords]  
        return tokens


    def get_longest_seq(self, tokens_list):
        longest_seq = 0

        for tokens in tokens_list:
            # get longest sequence
            if len(tokens) > longest_seq:
                longest_seq = len(tokens)

        return longest_seq


    def pad_tokens(self, tokens, longest_seq):
        if len(tokens) < longest_seq:
            tokens = tokens + ['<PAD>' for _ in range(longest_seq - len(tokens))]
        return tokens
        

    def create_vocab(self, all_tokens):
        vocab = {
            '<UNK>' : 0,
            '<PAD>' : 1,
            '<SOS>' : 2,
            '<EOS>' : 3
        }

        for token in all_tokens:
            if token not in vocab:
                vocab[token] = (len(vocab) - 1) + 1
        
        return vocab    

    
    def get_indices(self, tokens, vocab):
        indices = []
        for token in tokens:
            indices.append(vocab[token])

        return indices
    
    def get_label_indices(self, label, label_map):
        return label_map[label]
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        article = torch.tensor(self.data.loc[idx, 'articles_indices'])
        summary = torch.tensor(self.data.loc[idx, 'summaries_indices'])
        label = torch.tensor(self.data.loc[idx, 'label_idxs'])

        return article, summary, label
        

In [None]:
tokenizer = nltk.TweetTokenizer()
stopwords = nltk.corpus.stopwords.words('english')

dataset = NewsDataset(data, tokenizer, stopwords)

(tensor([    2,  3522, 34389,  ...,     1,     1,     1]),
 tensor([    2, 39718, 34122,  ...,     1,     1,     1]),
 tensor(0))