# N-Gram Frequency Calculator from Scratch

Source: https://stackoverflow.com/questions/17531684/n-grams-in-python-four-five-six-grams

In [2]:
# import packages 
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import string
import pandas as pd 
import numpy as np 

In [3]:
from bs4 import BeautifulSoup
import urllib.request 

# Download the full text of Moby Dick to use as sample input
url = 'http://www.textfiles.com/etext/FICTION/mobydick'
# Use a context manager so the HTTP connection is closed as soon as the
# body has been read, instead of leaking it until garbage collection.
with urllib.request.urlopen(url) as response:
    html = response.read()
# Parse the payload and keep only its text content (whitespace-stripped)
soup = BeautifulSoup(html, "html5lib")
data = soup.get_text(strip=True)
# Preview the first 1000 characters
print(data[:1000])

1851
                                   MOBY DICK;
                                  OR THE WHALE
                               by Herman Melville
ETYMOLOGY
  ETYMOLOGY
  (Supplied by a Late Consumptive Usher to a Grammar School)

  The pale Usher- threadbare in coat, heart, body, and brain; I see
him now. He was ever dusting his old lexicons and grammars, with a
queer handkerchief, mockingly embellished with all the gay flags of
all the known nations of the world. He loved to dust his old grammars;
it somehow mildly reminded him of his mortality.

  "While you take in hand to school others, and to teach them by
what name a whale-fish is to be called in our tongue leaving out,
through ignorance, the letter H, which almost alone maketh the
signification of the word, you deliver that which is not true."
                                                        HACKLUYT

  "WHALE. * * * Sw. and Dan. hval. This animal is named from roundness
or rolling; for in Dan. hvalt is arched or vaulte

In [5]:
# The full text is large, so keep only its first third
data = data[:len(data) // 3]

In [7]:
# Strip punctuation: map every character in string.punctuation to None
translator = str.maketrans(dict.fromkeys(string.punctuation))
data = data.translate(translator)

# Split the cleaned text on whitespace to get the tokens
tokens = data.split()

tokens[:10]

['1851',
 'MOBY',
 'DICK',
 'OR',
 'THE',
 'WHALE',
 'by',
 'Herman',
 'Melville',
 'ETYMOLOGY']

In [6]:
# Remove stop words (case-insensitive match against NLTK's English list).
# The stop-word list is loaded once into a set: looking it up inside the loop
# re-evaluated stopwords.words('english') for every token, and list.remove()
# rescanned clean_tokens on every hit, which made this cell quadratic.
stop_words = set(stopwords.words('english'))
# Keep the original token order and casing; drop only the stop words.
clean_tokens = [token for token in tokens if token.lower() not in stop_words]

clean_tokens[:10]

['1851',
 'MOBY',
 'DICK',
 'WHALE',
 'Herman',
 'Melville',
 'ETYMOLOGY',
 'ETYMOLOGY',
 'Supplied',
 'Late']

## Function 

In [8]:
def ngram(tokens, ngrams):
    """
    Slide a window of length `ngrams` over `tokens`, one position at a time,
    and return the list of windows, each as a tuple.

    tokens: sequence of tokens (must support len() and slicing)
    ngrams: size of each n-gram (1 = unigram, 2 = bigram, ...)

    When `tokens` is shorter than `ngrams` the result is an empty list.
    """
    # Number of start positions at which a full window still fits
    window_count = len(tokens) - ngrams + 1
    windows = []
    for start in range(window_count):
        windows.append(tuple(tokens[start:start + ngrams]))
    return windows

def FrequencyCalculator(tokens):
    """
    Count how many times each token occurs.

    tokens: iterable of hashable items (e.g. strings or n-gram tuples)

    Returns a plain dict mapping each distinct token to its number of
    occurrences. Keys appear in order of first occurrence in `tokens`.
    """
    # Local import keeps this cell self-contained.
    from collections import Counter

    # Counter tallies in a single pass; calling tokens.count() for every
    # token was quadratic in the number of tokens.
    return dict(Counter(tokens))

## Unigram

In [9]:
# Build the unigrams and count how often each one occurs
ngram_words = ngram(clean_tokens, ngrams=1)
word_count_dict = FrequencyCalculator(ngram_words)

# Show the 20 most frequent words
result = sorted(
    word_count_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:20]
result

[(('one',), 247),
 (('ye',), 213),
 (('whale',), 185),
 (('upon',), 183),
 (('like',), 175),
 (('old',), 156),
 (('man',), 156),
 (('said',), 153),
 (('Queequeg',), 152),
 (('would',), 143),
 (('Captain',), 122),
 (('little',), 121),
 (('Ahab',), 116),
 (('though',), 114),
 (('great',), 113),
 (('sea',), 112),
 (('ship',), 111),
 (('yet',), 102),
 (('seemed',), 101),
 (('time',), 99)]

## Bigram

In [10]:
# Build the bigrams and count how often each one occurs
ngram_words2 = ngram(clean_tokens, ngrams=2)
word_count_dict2 = FrequencyCalculator(ngram_words2)

# Show the 20 most frequent bigrams
result2 = sorted(
    word_count_dict2.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:20]
result2

[(('Captain', 'Ahab'), 41),
 (('Captain', 'Peleg'), 30),
 (('Moby', 'Dick'), 25),
 (('Sperm', 'Whale'), 22),
 (('white', 'whale'), 19),
 (('New', 'Bedford'), 16),
 (('old', 'man'), 13),
 (('sperm', 'whale'), 13),
 (('young', 'man'), 13),
 (('Captain', 'Bildad'), 13),
 (('Mrs', 'Hussey'), 12),
 (('Aye', 'aye'), 12),
 (('Cape', 'Horn'), 11),
 (('one', 'side'), 11),
 (('one', 'hand'), 10),
 (('ye', 'ye'), 10),
 (('never', 'mind'), 9),
 (('something', 'like'), 9),
 (('dont', 'know'), 9),
 (('Father', 'Mapple'), 9)]