In [1]:
import wikipedia
import heapq
import collections

In [2]:
def ngrams(text,n):
    """ Returns all n-grams from text with their counts
    n is an integer
    text is a list of words(strings)
    """
    output = []
    
    # Append all 3-grams in text
    for i in range(len(text)-n+1):
        output.append(tuple(text[i:i+n]))
    
    # Update counts of unique n-grams in a dictionary
    counts = collections.defaultdict(int)
    for item in output:
            counts[item]+=1
    
    # items is list of 2-tuples where first element is the count 
    #nand the second is n-tuple representing the n-gram
    items = [(y,x) for (x,y) in counts.items()]
    return items
    

In [3]:
def topNgrams(items, k):
    """ Returns top k n-grams from list of tuples of count and n-grams
    k is an integer
    items is a list of 2-tuples returned by the function ngrams
    """
    # A min heap is used since top k elements can be identified in O(nlogk) time. A traditional
    # sort and select top-k would have taken O(nlogn) time
    heap = []
    for item in items:
        if len(heap) < k or item[0] > heap[0][0]:
            if len(heap)==10:
                heapq.heappop(heap)
            heapq.heappush(heap, item)
    return heap

In [4]:
# Download content from wikipedia
text = wikipedia.page("N-gram").content
text.lower()
text=text.split()



In [5]:
trigrams = ngrams(text,3)
top10 = topNgrams(trigrams, 10)

# Print top-10 n-grams
for item, num in zip(sorted(top10, reverse=True),xrange(1,11)):
    print '{0:>2}. {1:<20} : {2:>8}'.format(num, " ".join(item[1]), item[0])

 1. serve as the         :        6
 2. part of the          :        4
 3. n-gram models are    :        4
 4. the probability of   :        3
 5. the number of        :        3
 6. the language model   :        3
 7. of a possible        :        3
 8. have been used       :        3
 9. can also be          :        3
10. be used for          :        3
