# Task


- Write a Python Jupyter notebook
- That loads the TXT version of the top 10 books from project Gutenberg
( https://www.gutenberg.org/browse/scores/top )
and outputs:
    - the 10 most common words in these books
    - the 10 most common bigrams from the text
    - the 3 most frequent bigrams for each book that are unique to that book  

if additional python requirements are necessary to run the notebook, provide a `requirements.txt` file that lists them

In [32]:
import os
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /Users/hassan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hassan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
stop_words = set(stopwords.words("english"))
data = {}

filenames = os.listdir('books')
filenames.sort()

all_bigrams = {}

for filename in filenames:

    file = open('books/{}'.format(filename), encoding = 'utf8')
    text = file.read()
    file.close()

    words = word_tokenize(text)

    cleaned_up = []
    list_for_bigrams = []

    for word in words:
        word = word.lower()
        if word not in string.punctuation and word not in stop_words and word.isalpha():
            cleaned_up.append(word)

    fdist_words = FreqDist(cleaned_up)
    bigrams = nltk.bigrams(cleaned_up)
    
    fdist_bigrams = FreqDist(bigrams)
    
    most_common_words = fdist_words.most_common(10)
    most_common_words = list(map(lambda x: x[0], most_common_words))
    
    most_common_bigrams = fdist_bigrams.most_common(10)
    most_common_bigrams = list(map(lambda x: ' '.join(x[0]), most_common_bigrams))
    
    for bigram in most_common_bigrams:
        if bigram in all_bigrams:
            all_bigrams[bigram] += 1
        else:
            all_bigrams[bigram] = 1
    
    key = filename[2: -4]
    data[key] = {}
    data[key] = {
        'most_common_words': most_common_words,
        'most_common_bigrams': most_common_bigrams
    }

In [41]:
# printing the results

for book in data:
    arr = []
    name = ' '.join(book.split('_'))
    print('{}: \n'.format(name))
    for bigram in data[book]['most_common_bigrams']:
        if all_bigrams[bigram] == 1:
            arr.append((bigram))
            
            if len(arr) == 3:
                data[book]['unique_bigrams'] = arr
                break
                
    common_words = ', '.join(data[book]['most_common_words'])
    most_common_bigrams = ', '.join(data[book]['most_common_bigrams'])
    book_bigrams = ', '.join(data[book]['unique_bigrams'])
    
    print('\t 10 most common words: {}'.format(common_words))
    print('\t 10 most common bigrams: {}'.format(most_common_bigrams))
    print('\t 3 most frequent bigrams: {}\n'.format(book_bigrams))

pride and prejudice: 

	 10 most common words: elizabeth, could, would, darcy, said, much, bennet, must, bingley, jane
	 10 most common bigrams: lady catherine, miss bingley, miss bennet, said elizabeth, sir william, young man, de bourgh, miss darcy, project gutenberg, colonel fitzwilliam
	 3 most frequent bigrams: lady catherine, miss bingley, miss bennet

pen pictures of british battles: 

	 10 most common words: project, battle, enemy, men, german, work, british, one, fire, guns
	 10 most common bigrams: project gutenberg, project electronic, electronic works, united states, battle fleet, gutenberg literary, literary archive, archive foundation, machine guns, electronic work
	 3 most frequent bigrams: electronic works, united states, battle fleet

the adventures of sherlock holmes: 

	 10 most common words: said, upon, holmes, one, would, man, could, little, see, may
	 10 most common bigrams: said holmes, sherlock holmes, project gutenberg, could see, baker street, lord simon, young