In [33]:
import nltk
from nltk.util import ngrams
import collections

In [70]:
def readFile(fileName):
    '''Function to read the file and return the sentences in a list'''
    
    with open(fileName, 'r') as f:
        sentences = [line.strip() for line in f.readlines()]
    return sentences


def build_vocabulary(tokens, min_frequency=3):
    '''Function to Build the vocabulary, grouping rare tokens as '<unk>'''
    vocab = []
    vocab.append('<unk>')  # Add <unk> for rare words
    vocab.append('<STOP>')  # Add <STOP> for sentence endings
    
    token_counts = collections.Counter(tokens)
    # Include tokens with frequency >= min_frequency, plus '<unk>' and '<STOP>'
    vocab.extend([token for token, count in token_counts.items() if count >= min_frequency])

    return vocab, token_counts


def preprocess_sentence(sentence, vocab):
    '''Function to process the sentence to have unk and start and stop tags'''

    words = sentence.split()
    tokens = ['<START']
    tokens.extend([word if word in vocab else '<unk>' for word in words])
    tokens.append('<STOP>')  # Add <STOP> at the end of each sentence
    return tokens
        

def create_ngrams(tokens, n):
    '''Function to create N-grams'''
    
    ngrams = list(nltk.ngrams(tokens, n))
    return ngrams

In [22]:
# read the file to get the list of sentences
sentences = readFile('1b_benchmark.train.tokens.txt')
sentences[:3]

['Having a little flexibility on that issue would go a long way to putting together a final package .',
 'Long before the advent of e-commerce , Wal-Mart \'s founder Sam Walton set out his vision for a successful retail operation : " We let folks know we \'re interested in them and that they \'re vital to us-- \' cause they are , " he said .',
 'A spokesman said the company has been affected by the credit crunch in the United States .']

In [29]:
# generate the token from the sentences
tokens = [word for sentence in sentences for word in sentence.split()]
tokens[:10]

['Having',
 'a',
 'little',
 'flexibility',
 'on',
 'that',
 'issue',
 'would',
 'go',
 'a']

In [60]:
# get the vocabulary
vocab, token_counts = build_vocabulary(tokens)

In [61]:
print(f"Total length of vocabulary is ::: {len(vocab)}")

Total length of vocabulary is ::: 26602


In [62]:
vocab[:10]

['<unk>',
 '<STOP>',
 'Having',
 'a',
 'little',
 'flexibility',
 'on',
 'that',
 'issue',
 'would']

In [71]:
# preprocess each sentences
process_sentences = [preprocess_sentence(sentence, vocab) for sentence in sentences]

In [76]:
# Flatten the list of processed sentences into a single list of tokens
flattened_tokens = [token for sentence in process_sentences for token in sentence]

In [78]:
# check two sentences of the processed lines
flattened_tokens[:10]

['<START',
 'Having',
 'a',
 'little',
 'flexibility',
 'on',
 'that',
 'issue',
 'would',
 'go']

In [79]:
# create N-Grams
unigram = create_ngrams(flattened_tokens, 1)
bigram = create_ngrams(flattened_tokens, 2)
trigram = create_ngrams(flattened_tokens, 3)

In [87]:
unigram[:5]

[('<START',), ('Having',), ('a',), ('little',), ('flexibility',)]

In [88]:
bigram[:5]

[('<START', 'Having'),
 ('Having', 'a'),
 ('a', 'little'),
 ('little', 'flexibility'),
 ('flexibility', 'on')]

In [89]:
trigram[:5]

[('<START', 'Having', 'a'),
 ('Having', 'a', 'little'),
 ('a', 'little', 'flexibility'),
 ('little', 'flexibility', 'on'),
 ('flexibility', 'on', 'that')]