In [1]:
# BBC News archive ~2225 examples of news articles and their respective categories (labels)

import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Peek at the raw file: print the header line followed by the first record
with open("bbc-text.csv",'r') as csvfile:
    for _ in range(2):
        print(csvfile.readline())

category,text

tech,tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are also bei

In [3]:
# Removing STOP words
# Stopwords to drop. Built once (not on every call) and stored as a frozenset
# so that each membership test is a constant-time hash lookup.
_STOPWORDS = frozenset([
    "a", "about", "above", "after", "again", "against", "all", "am", "an",
    "and", "any", "are", "as", "at", "be", "because", "been", "before",
    "being", "below", "between", "both", "but", "by", "could", "did", "do",
    "does", "doing", "down", "during", "each", "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
    "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i",
    "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its",
    "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on",
    "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out",
    "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so",
    "some", "such", "than", "that", "that's", "the", "their", "theirs", "them",
    "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're",
    "we've", "were", "what", "what's", "when", "when's", "where", "where's",
    "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
    "yourselves",
])


def remove_stopwords(sentence):
    """
    Lowercases a sentence and removes the stopwords from it
    Args: sentence (string): sentence to remove the stopwords from
    Returns: sentence(string): lowercase sentence without the stopwords; the
             remaining whitespace-separated words are joined by single spaces
    """
    # Words are matched exactly after splitting on whitespace, so a token with
    # attached punctuation (e.g. "the.") is not treated as a stopword.
    return ' '.join(word for word in sentence.lower().split() if word not in _STOPWORDS)

In [4]:
# Test remove_stopwords function
stopwords_demo = remove_stopwords("I am about to go to the kitchen and have lunch")
# Check the expected value in code so a regression fails the cell instead of
# relying on someone comparing the output against a comment by eye
assert stopwords_demo == 'go kitchen lunch', stopwords_demo
stopwords_demo

'go kitchen lunch'

In [5]:
# Reading the raw data
# The first line of the CSV file (the header) should be omitted
# Use regular lists
# Use csv.reader
# csv.reader yields one row (a list) per record: label -> row[0], text -> row[1]
# Use the remove_stopwords function in each sentence

def parse_data_from_file(filename):
    """
    Extracts sentences and labels from a CSV file
    Args: filename (string): path to the CSV file. The first row is treated as a
          header and skipped; every other row must hold the label in column 0
          and the text in column 1. Completely blank rows are ignored.
    Returns: sentences, labels (list of strings, list of strings): tuple containing lists
             of equal length; each sentence has had remove_stopwords applied
    Raises: IndexError if a non-blank row has fewer than two columns
    """

    sentences = []
    labels = []
    # newline='' is what the csv module expects: it lets the reader handle line
    # endings itself, including newlines embedded inside quoted fields
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header row (no error if the file is empty)
        next(reader, None)
        for row in reader:
            # csv.reader yields an empty list for a blank line (e.g. a trailing
            # newline at the end of the file); there is nothing to parse in it
            if not row:
                continue
            labels.append(row[0])
            sentences.append(remove_stopwords(row[1]))
    return sentences, labels

In [6]:
# Test parse_data_from_file function
# Loads the whole dataset; `sentences` and `labels` are reused by the cells below
sentences, labels = parse_data_from_file("bbc-text.csv")

# Summary of the parsed data: the two lists should have the same length
print("ORIGINAL DATASET:")
print(f"There are {len(sentences)} sentences in the dataset.")
# Word count here is the number of whitespace-separated tokens left after stopword removal
print(f"First sentence has {len(sentences[0].split())} words (after removing stopwords).\n")
print(sentences[0].split())
print()
print(f"There are {len(labels)} labels in the dataset.")
print(f"The first 5 labels are {labels[:5]}\n\n")

ORIGINAL DATASET:
There are 2225 sentences in the dataset.
First sentence has 436 words (after removing stopwords).

['tv', 'future', 'hands', 'viewers', 'home', 'theatre', 'systems', 'plasma', 'high-definition', 'tvs', 'digital', 'video', 'recorders', 'moving', 'living', 'room', 'way', 'people', 'watch', 'tv', 'will', 'radically', 'different', 'five', 'years', 'time.', 'according', 'expert', 'panel', 'gathered', 'annual', 'consumer', 'electronics', 'show', 'las', 'vegas', 'discuss', 'new', 'technologies', 'will', 'impact', 'one', 'favourite', 'pastimes.', 'us', 'leading', 'trend', 'programmes', 'content', 'will', 'delivered', 'viewers', 'via', 'home', 'networks', 'cable', 'satellite', 'telecoms', 'companies', 'broadband', 'service', 'providers', 'front', 'rooms', 'portable', 'devices.', 'one', 'talked-about', 'technologies', 'ces', 'digital', 'personal', 'video', 'recorders', '(dvr', 'pvr).', 'set-top', 'boxes', 'like', 'us', 's', 'tivo', 'uk', 's', 'sky+', 'system', 'allow', 'people'

In [7]:
# Using TOKENIZER
# Input is the list of sentences
# Return a Tokenizer that has been fitted to those sentences. Also use OOV
def fit_tokenizer(sentences):
    """
    Instantiates the Tokenizer class and fits it on the given sentences
    Args: sentences (list): lower_cased sentences without stopwords
    Returns: tokenizer (object): an instance of the Tokenizer class, already fitted,
             containing the word-index dict (including the "<OOV>" token)
    """
    
    # Instantiate the Tokenizer class by passing in the oov_token argument
    tokenizer = Tokenizer(oov_token = "<OOV>")
    
    # Fit on the sentences so the returned tokenizer really is fitted, as the
    # function name promises; without this step `sentences` would be unused
    # and the word index would be empty
    tokenizer.fit_on_texts(sentences)
    
    return tokenizer

In [8]:
tokenizer = fit_tokenizer(sentences)

# Tokenize input sentences
# fit_on_texts accumulates word counts on every call, so only fit when the
# tokenizer has no vocabulary yet; fitting an already-fitted tokenizer again
# would count every word twice
if not tokenizer.word_index:
    tokenizer.fit_on_texts(sentences)

# Generate word index dict
word_index = tokenizer.word_index

print(f"Vocabulary contains {len(word_index)} words")
print("<OOV> token included in vocabulary" if "<OOV>" in word_index else "<OOV> token NOT included in vocabulary")

Vocabulary contains 29714 words
<OOV> token included in vocabulary


In [9]:
def get_padded_sequences(tokenizer, sentences):
    """
    Converts sentences to integer token sequences and pads them to a common length
    Args:
        tokenizer (object): fitted Tokenizer instance used to map words to integers
        sentences (list of string): sentences to convert
    Returns:
        padded_sequences (array of int): one row per sentence, padded at the end
        ('post') so that all rows have the same length
    """
    # Tokenize, then hand the ragged sequences straight to pad_sequences
    return pad_sequences(tokenizer.texts_to_sequences(sentences), padding='post')

In [10]:
# Tokenize and pad every sentence with the tokenizer fitted above
padded_sequences = get_padded_sequences(tokenizer, sentences)
print(f"First padded sequence: \n {padded_sequences[0]}")
# shape is (number of sequences, padded length of each sequence)
print(f"Numpy shape of all sequences: {padded_sequences.shape}")
print(f"There are {padded_sequences.shape[0]} sequences in total and each one has a size of {padded_sequences.shape[1]}")

First padded sequence: 
 [  96  176 1157 ...    0    0    0]
Numpy shape of all sequences: (2225, 2438)
There are 2225 sequences in total and each one has a size of 2438


In [11]:
def tokenize_labels(labels):
    """
    Maps the labels to integer tokens using a Tokenizer fitted on the labels only
    Args: labels (list of string): labels to tokenize
    Returns: label_sequences, label_word_index (list of list of int, dictionary): one token
           list per label and the word-index built from the labels
    """

    # Dedicated Tokenizer for the labels, created without an OOV token
    tokenizer_for_labels = Tokenizer()
    tokenizer_for_labels.fit_on_texts(labels)

    return tokenizer_for_labels.texts_to_sequences(labels), tokenizer_for_labels.word_index

In [12]:
# Tokenize the labels: label_word_index maps each label to an integer,
# label_sequences holds one token list per entry of `labels`
label_sequences, label_word_index = tokenize_labels(labels)
print(f"Vocabulary of labels: {label_word_index}")
print(f"First ten sequences: {label_sequences[:10]}")

Vocabulary of labels: {'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}
First ten sequences: [[4], [2], [1], [1], [5], [3], [3], [1], [1], [5]]
