In [1]:
pip install adjustText

Collecting adjustText
  Downloading adjustText-0.8-py3-none-any.whl (9.1 kB)
Installing collected packages: adjustText
Successfully installed adjustText-0.8


In [2]:
%matplotlib inline
import zipfile
import re
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE
from adjustText import adjust_text

In [3]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Work from the Drive folder: the relative 'data' directory used later resolves from here.
%cd /content/drive/My Drive/Colab Notebooks

/content/drive/My Drive/Colab Notebooks


In [5]:
# Location of the zipped BBC full-text news dataset.
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'

In this project we will use the BBC news articles dataset. It contains 2,225 news articles belonging to five topics (business, entertainment, politics, sport, and tech), which were published on the BBC website between 2004 and 2005.

In [6]:
def download_data(url, data_dir):
    """Download and extract the BBC archive into ``data_dir`` if not already there.

    The download is skipped when ``<data_dir>/bbc-fulltext.zip`` exists, and the
    extraction is skipped when ``<data_dir>/bbc`` exists. The archive is not
    validated (no size or checksum check).

    Args:
        url: Location of ``bbc-fulltext.zip``.
        data_dir: Directory that receives the archive and its extracted
            contents; it is created when missing.
    """
    os.makedirs(data_dir, exist_ok=True)

    file_path = os.path.join(data_dir, 'bbc-fulltext.zip')

    if not os.path.exists(file_path):
        print('Downloading file...')
        # Download under a temporary name and rename afterwards, so that an
        # interrupted transfer is never mistaken for a complete archive by the
        # existence check above on the next run.
        partial_path = file_path + '.part'
        urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
    else:
        print("File already exists")

    extract_path = os.path.join(data_dir, 'bbc')
    if not os.path.exists(extract_path):
        with zipfile.ZipFile(file_path, 'r') as zipf:
            zipf.extractall(data_dir)
    else:
        print("bbc-fulltext.zip has already been extracted")

# Fetch and unpack the dataset into ./data; both steps are skipped when already done.
download_data(url, 'data')

File already exists
bbc-fulltext.zip has already been extracted


The function first creates data_dir if it does not exist. Next, if the bbc-fulltext.zip file does not exist, it will be downloaded from the URL. If bbc-fulltext.zip has not been extracted yet, it will be extracted to data_dir.

With that done, we are going to focus on reading the data contained in the news articles (in .txt format) into memory. To do that, we will define the read_data() function, which takes a data directory path (data_dir) and reads the .txt files (except for the README file).

In [7]:
def read_data(data_dir):
    """Read every news story found under ``data_dir`` into memory.

    Walks ``data_dir`` recursively and loads each file whose name does not
    contain ``README``. Files are decoded as latin-1; every line is stripped
    and the lines of one file are joined with single spaces.

    Args:
        data_dir: Root directory of the extracted dataset.

    Returns:
        list[str]: One string per file, in ``os.walk`` traversal order.
    """
    # This will contain the full list of stories
    news_stories = []

    print("Reading files")

    for root, _, files in os.walk(data_dir):
        for file_name in files:
            # We don't read the readme file
            if 'README' in file_name:
                continue

            # The handle gets its own name so it does not shadow the file name.
            with open(os.path.join(root, file_name), encoding='latin-1') as story_file:
                # Create a single string with all the rows in the doc
                story = ' '.join(row.strip() for row in story_file)
            news_stories.append(story)

            # Fixed-size progress line, overwritten in place via '\r', so the
            # cell output stays one short line however many files are read.
            print(f"Read {len(news_stories)} files", end='\r')

    print(f"\nDetected {len(news_stories)} stories")
    return news_stories

With the read_data() function defined, we can use it to read the data and print some samples as well as some statistics.

In [8]:
corpus_dir = os.path.join('data', 'bbc')
news_stories = read_data(corpus_dir)

# Corpus-level word count (splitting on single spaces) and a peek at both ends of the corpus
total_words = sum(len(story.split(' ')) for story in news_stories)
print(f"{total_words} words found in the total news set")
print('Example words (start): ',news_stories[0][:50])
print('Example words (end): ',news_stories[-1][-50:])

Reading files
..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

As can be seen from the above result, there are 2,225 stories with close to a million words. In the next step, we will tokenize each story (in the form of a long string) into a list of tokens (or words). Along with that, we will perform some preprocessing on the text: lowercase all the characters and remove punctuation.

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer

# First pass: no vocabulary limit, so every word in the corpus gets an ID.
tokenizer = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
)

The code above shows some of the most commonly used keyword arguments, along with their default values, for defining a tokenizer:



*   num_words: Defines the size of the vocabulary. Defaults to None, meaning it will consider all the words appearing in the text corpus. If set to the integer n, it will only consider the n most common words appearing in the corpus

*   filters: Defines any characters that need to be omitted during preprocessing. By default it defines a string containing most of the common punctuation marks and symbols.

*   lower: defines whether the text needs to be converted to lowercase

*   split: defines the character that the words will be tokenized on.


Once the tokenizer is defined, the fit_on_texts() method will be called with a list of strings (where each string is a news article) so that the tokenizer learns the vocabulary and maps the words to unique IDs.



In [10]:
# Learn the vocabulary from the corpus (one string per news story).
tokenizer.fit_on_texts(news_stories)

Let us take a moment to analyze what the tokenizer has produced after it has been fitted on the text. Once it has been fitted, the Tokenizer will have two important attributes populated: word_index and index_word. Here, word_index is a dictionary that maps each word to a unique ID. The index_word attribute is the opposite of word_index; that is, a dictionary that maps each unique word ID to the corresponding word.

In [11]:
# Materialise the (word, ID) pairs once and reuse them for both slices below.
word_id_pairs = list(tokenizer.word_index.items())

# +1 because ID 0 is reserved and never assigned to a word
n_vocab = len(word_id_pairs) + 1
print(f"Vocabulary size: {n_vocab}")

print("\nWords at the top")
print('\t', dict(word_id_pairs[:10]))
print("\nWords at the bottom")
print('\t', dict(word_id_pairs[-10:]))

Vocabulary size: 32360

Words at the top
	 {'the': 1, 'to': 2, 'of': 3, 'and': 4, 'a': 5, 'in': 6, 'for': 7, 'is': 8, 'that': 9, 'on': 10}

Words at the bottom
	 {"taipei's": 32350, 'taller': 32351, 'petronas': 32352, 'skyscraper': 32353, 'packaged': 32354, 'inserting': 32355, 'solves': 32356, 'idefence': 32357, 'pls': 32358, 'm3u': 32359}


Note how we are using the length of the word_index dictionary to derive the vocabulary size. We need an additional 1 as the ID 0 is a reserved ID and will not be used for any word.

The more frequent a word is in the corpus, the lower its ID will be. Words such as "the", "to", and "of", which tend to be common (and are called stop words), are in fact the most common words. As the next step, we are going to refine our tokenizer object to have a limited-size vocabulary. Because we are working with a relatively small corpus, we have to ensure the vocabulary is not too large, as it can lead to poorly learned word vectors due to the lack of data.

In [12]:
# Second pass: cap the vocabulary and re-fit a fresh tokenizer on the same corpus.
n_vocab = 15000
tokenizer = Tokenizer(
    num_words=n_vocab-1,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    # NOTE(review): a non-None oov_token appears to receive an ID of its own, so
    # out-of-vocabulary words would be mapped to that ID rather than dropped.
    # Confirm against the Tokenizer documentation.
    oov_token='',
)

tokenizer.fit_on_texts(news_stories)
print("Data fitted on the tokenizer")

Data fitted on the tokenizer


Since we have a total vocabulary of more than 30,000 words, we will restrict the size of the vocabulary to 15,000. This means the tokenizer will only keep the most common 15,000 words as the vocabulary. When we restrict a vocabulary this way, a new problem arises. As the tokenizer's vocabulary does not encompass all possible words in the true vocabulary, out-of-vocabulary words (or OOV words) can rear their heads. Some solutions are to replace OOV words with a special token (such as `<UNK>`) or remove them from the corpus. This is possible by passing the string we want to replace OOV tokens with to the oov_token argument in the Tokenizer. In this case, we pass an empty string as the OOV token. Note that this does not drop OOV words from the sequences: they are all mapped to the single ID reserved for that token (ID 1, which is why "to" moves from ID 2 to ID 3 in the outputs below). To remove OOV words outright, leave oov_token as None instead. If we are careful when setting the size of the vocabulary, omitting some of the rare words would not harm learning the context of words accurately.

Let us now look at the first 100 characters of the first story in our corpus (stored in the news_stories variable).

In [13]:
# Raw text: the first 100 characters of the first story.
print(f"Original: {news_stories[0][:100]}")

Original: Musicians to tackle US red tape  Musicians' groups are to tackle US visa regulations which are blame


Then we can call the tokenizer's texts_to_sequences() method to convert a list of documents (where each document is a string) to a list of lists of word IDs (that is, each document is converted to a list of word IDs).

In [14]:
# texts_to_sequences() takes a list of documents, hence the one-element list and the trailing [0].
print(f"Sequence IDs: {tokenizer.texts_to_sequences([news_stories[0][:100]])[0]}")

Sequence IDs: [2609, 3, 1342, 49, 1294, 4686, 11185, 862, 25, 3, 1342, 49, 2529, 3218, 35, 25, 3696]


We now have our tokenizer sorted. There is nothing left to do but convert all of our news articles to sequences of word IDs with a single line of code.


In [15]:
# One list of word IDs per news story.
news_sequences = tokenizer.texts_to_sequences(news_stories)

In [16]:
# Take the first five word IDs of the first story and map them back to words.
sample_word_ids = news_sequences[0][:5]
sample_words = [tokenizer.index_word[wid] for wid in sample_word_ids]
sample_phrase = ' '.join(sample_words)

print(f"Sample phrase: {sample_phrase}")
print(f"Sample word IDs: {sample_word_ids}\n")

Sample phrase: musicians to tackle us red
Sample word IDs: [2609, 3, 1342, 49, 1294]



Let us now consider a window size of 1. This means that, for a given target word, we define the context as one word from each side of the target word.

In [17]:
# Context window radius: number of words considered on each side of the target word.
window_size = 1 # How many words to consider left and right

We now have all the ingredients to extract skip-grams from the sample phrase.

In [18]:
# Generate skip-gram (target, context) pairs for the sample phrase. With
# negative_samples=1.0 an equal number of randomly drawn negative pairs
# (label 0) is appended after the positive pairs (label 1). shuffle=False
# keeps the pairs in generation order so they are easy to read when printed.
inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sample_word_ids,
    vocabulary_size=n_vocab,
    window_size=window_size, negative_samples=1.0, shuffle=False,
    categorical=False, sampling_table=None, seed=None
)

Sample skip-grams


Let us now look at some of the important arguments that were used.


*   sequence (list[str] or list[int]): A list of words or word IDs

*   vocabulary_size (int): Size of the vocabulary

*   window_size (int): Size of the window to be considered for the context; it defines the length on each side of the target word

*   negative_samples (float): Fraction of negative candidates to generate. For example, a value of 1 means that there will be an equal number of positive and negative skip-gram candidates. A value of 0 means there will not be any negative candidates

*   shuffle (bool): Whether to shuffle the generated skip-grams

*   categorical (bool): Whether to produce labels as categorical (that is, one-hot encoded) or integers

*   sampling_table (np.ndarray): An array of the same size as the vocabulary. The element at a given position in the array represents the probability of sampling the word indexed by that position in the tokenizer's word ID to word mapping

*   seed (int): If shuffling is enabled, this is the random seed to be used for shuffling




In [19]:
# Show every generated pair as IDs and as words, together with its label
# (1 = true context pair, 0 = randomly drawn negative pair).
print("Sample skip-grams")
for inp, lbl in zip(inputs, labels):
    pair_words = [tokenizer.index_word[wi] for wi in inp]
    print(f"\tInput: {inp} ({pair_words}) / Label: {lbl}")

Sample skip-grams
	Input: [2609, 3] (['musicians', 'to']) / Label: 1
	Input: [3, 2609] (['to', 'musicians']) / Label: 1
	Input: [3, 1342] (['to', 'tackle']) / Label: 1
	Input: [1342, 3] (['tackle', 'to']) / Label: 1
	Input: [1342, 49] (['tackle', 'us']) / Label: 1
	Input: [49, 1342] (['us', 'tackle']) / Label: 1
	Input: [49, 1294] (['us', 'red']) / Label: 1
	Input: [1294, 49] (['red', 'us']) / Label: 1
	Input: [3, 2732] (['to', 'forum']) / Label: 0
	Input: [3, 9965] (['to', 'larry']) / Label: 0
	Input: [49, 1569] (['us', 'spokeswoman']) / Label: 0
	Input: [1294, 13245] (['red', 'tyler']) / Label: 0
	Input: [1342, 11710] (['tackle', 'gangsters']) / Label: 0
	Input: [49, 7694] (['us', 'belonging']) / Label: 0
	Input: [1342, 11420] (['tackle', 'penned']) / Label: 0
	Input: [2609, 8421] (['musicians', 'michel']) / Label: 0


For example, since the word "to" appears in the context of the word "musicians", it is considered a positive candidate. On the other hand, since the word "michel" (randomly sampled from the vocabulary) does not appear in the context of the word "musicians", it is added as a negative candidate.

When selecting negative candidates, the skipgrams() function selects them randomly, giving uniform weights to all the words in the vocabulary. However, the original paper explains that this can lead to poor performance. A better strategy is to use the unigram distribution as a prior for selecting negative context words.

A unigram distribution refers to the frequency distribution of individual words in a text or a language corpus. In simpler terms, it represents how often each distinct word appears in a given body of text.

A unigram is the smallest linguistic unit, representing a single word without considering its context or neighboring words. Analyzing the unigram distribution can provide insights into the vocabulary richness, word usage patterns, and potentially even the topic or theme of the text. This distribution is often used as a foundational step in various language processing tasks, such as language modeling, text classification, and information retrieval.

To do that, we can use the tf.random.log_uniform_candidate_sampler() function. This function takes a batch of positive context candidates of shape [b, num_true], where b is the batch size and num_true is the number of true candidates per example (1 for the skip-gram model), and it outputs a [num_sampled] sized array, where num_sampled is the number of negative samples we need.

In [20]:
# Positive skip-gram pairs only (negative_samples=0); the negative candidates
# are drawn separately with the log-uniform candidate sampler.
# vocabulary_size uses n_vocab, the same restricted vocabulary size passed to
# the other skipgrams() call and to the sampler's range_max, rather than the
# size of the tokenizer's full word_index.
inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sample_word_ids,
    vocabulary_size=n_vocab,
    window_size=window_size, negative_samples=0, shuffle=False,
)

# Arrays allow the 2-D slicing (inputs[:1, 1:]) used when calling the sampler.
inputs, labels = np.array(inputs), np.array(labels)

Note that we are specifying negative_samples=0, as we will be generating negative samples with the candidate sampler. Let us now use the tf.random.log_uniform_candidate_sampler() function to generate negative candidates.

In [21]:
# Draw 10 distinct negative candidates for the first positive pair.
(
    negative_sampling_candidates,
    true_expected_count,
    sampled_expected_count,
) = tf.random.log_uniform_candidate_sampler(
    # Context word of the first skip-gram pair, kept 2-D: a [b, 1] sized tensor
    true_classes=inputs[:1,1:],
    # Number of true words per example
    num_true=1,
    num_sampled=10,
    unique=True,
    range_max=n_vocab,
    name="negative_sampling",
)

This function takes some arguments:


1.   true_classes (np.ndarray or tf.Tensor): A tensor containing the true context words. This needs to be a [b, num_true] sized array, where num_true denotes the number of true context candidates per example. Since we have one context word per example, this is 1

2.   num_true (int): The number of true context terms per example

3.   num_sampled (int): The number of negative samples to generate

4.   unique (bool): Whether the generated samples must be unique (that is, sampled without replacement)

5.   range_max (int): The size of the vocabulary


It will return

1.   sampled_candidates (tf.Tensor): A tensor of size [num_sampled] containing negative candidates

2.   true_expected_count (tf.Tensor): A tensor of size [b, num_true] containing the probability of each true candidate being sampled

3.   sampled_expected_count (tf.Tensor): A tensor of size [num_sampled] containing the probabilities of each negative sample occurring along with the true candidates, if sampled from the corpus



In [22]:
# Report the positive context word next to the sampler's three outputs, one per line.
print(
    f"Positive sample: {inputs[:1,1:]}",
    f"Negative samples: {negative_sampling_candidates}",
    f"true_expected_count: {true_expected_count}",
    f"sampled_expected_count: {sampled_expected_count}",
    sep='\n',
)

Positive sample: [[3]]
Negative samples: [    9    30     6    56     1    78   592  3898 13259     4]
true_expected_count: [[0.22761466]]
sampled_expected_count: [1.0378349e-01 3.5725005e-02 1.4257596e-01 1.9716201e-02 3.7742513e-01
 1.4295651e-02 1.9257633e-03 2.9331696e-04 8.6263499e-05 1.8987593e-01]
