In [1]:
import sys;
print('system (python) info')
print(sys.version)
print('')

import nltk; print( 'nltk ' + nltk.__version__)
from nltk import word_tokenize;

import bs4; print( 'bs4 ' + bs4.__version__)
from bs4 import BeautifulSoup, SoupStrainer;

import re; print('re ' + re.__version__)
import requests; print('requests ' + requests.__version__)

system (python) info
3.6.0 |Anaconda custom (x86_64)| (default, Dec 23 2016, 13:19:00) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]

nltk 3.2.2
bs4 4.5.3
re 2.2.1
requests 2.18.4


# Write Up

### Question 1

I decided to simply divide the vocabulary size (the number of unique words) of each text by the highest vocabulary size found among all the texts. There are various approaches, but this is the most straightforward way to interpret the results, as each score is computed on a linear scale. The computation is as follows and can be reviewed in the function named `get_vocabulary_score` below:

```
float(vocab_size) / highest_vocab_size
```

### Question 2

To get the long word score, I simply count the number of long words (unique words of at least 15 characters) found in each corpus. Then I divide each count by the largest of these counts. The counting step can be reviewed in the function named `get_long_word_count` below and is summarized here:

```
    minimum_size = 15
    long_words = [w for w in vocab if len(w) >= minimum_size]
    return len(long_words)
```

### Question 3
We first get the lexical diversity as explained in homework 1 by simply taking the number of unique words divided by the total number of words. Then we find the _text difficulty_ by summing all three scores we have calculated.

Using this formula, it appears that the text with the most complexity is `A Modern History, From the Time of Luther to the Fall of Napoleon`

# Code

In [2]:
# Project Gutenberg bookshelf page whose HTML is scraped for links to individual books.
# NOTE(review): despite "images" in the name, this notebook only uses the URL to discover book texts.
url_to_fetch_images = 'http://www.gutenberg.org/wiki/Children%27s_Instructional_Books_(Bookshelf)'

## Fetch all text from the server

The following methods are used to fetch all text from the server

In [3]:
def get_text_from_url(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the decoded response body.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            whole notebook on a single stalled download.

    Returns:
        The response body as text. The HTTP status is not checked, so the
        body of an error response is returned as-is.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    return response.text

In [4]:
def get_all_links_from_html(html):
    """Return the ``href`` value of every ``<a href=...>`` element in ``html``.

    Only anchor tags that carry an ``href`` attribute are parsed; the values
    are returned as plain strings in document order.
    """
    # Restrict parsing to anchors that actually have an href attribute.
    anchors_with_href = SoupStrainer('a', href=True)
    soup = BeautifulSoup(html, 'html.parser', parse_only=anchors_with_href)

    hrefs = []
    for anchor in soup:
        hrefs.append(str(anchor.attrs['href']))
    return hrefs

In [5]:
def relevent_link(link):
    """Tell whether ``link`` points at a book page, i.e. contains ``/ebooks/``."""
    return '/ebooks/' in link

In [6]:
def get_book_ids_from_links(links):
    """Extract the book id from each link.

    The id is whatever follows the last ``/ebooks/`` in the link; a link
    without that marker is returned unchanged.
    """
    book_ids = []
    for link in links:
        pieces = link.split('/ebooks/')
        book_ids.append(pieces[-1])
    return book_ids

In [7]:
def get_book_text_urls_from_ids(ids):
    """Build the plain-text download URL for every book id in ``ids``.

    Returns a list of URLs in the same order as ``ids``.
    """
    url_template = 'http://www.gutenberg.org/files/{}/{}.txt'
    # HTML edition, kept for reference:
    #   'http://www.gutenberg.org/files/{}/{}-h/{}-h.htm'
    book_urls = []
    for book_id in ids:
        # Three arguments are passed so the three-slot HTML template can be
        # swapped in; the two-slot template ignores the surplus one.
        book_urls.append(url_template.format(book_id, book_id, book_id))
    return book_urls

### Get all texts

Note that there will be a few texts that cannot be loaded

In [8]:
#all_texts = [{ 'corpus': 'a bad bunny.'}, { 'corpus': 'one 1 sad day did no wrong on this day'}, { 'corpus': 'the big bad bear died'}]
all_texts = []

def fetch_all_texts(max_books=200):
    """Download every book linked from the bookshelf page into ``all_texts``.

    Each downloaded book is appended to the module-level ``all_texts`` list as
    a dict with the keys ``'url'`` and ``'corpus'`` (the raw response text).
    If ``all_texts`` is already populated, nothing is fetched and only its
    length is printed, so re-running the cell does not hit the server again.

    Args:
        max_books: Stop after this many books have been downloaded. Expected
            to be a positive integer, or ``None`` for no limit.

    Returns:
        ``None`` when the texts were already loaded or the ``max_books`` cap
        stopped the loop; ``1`` when every discovered book was fetched.
    """
    if all_texts:
        print(len(all_texts))
        return

    # Scrape the bookshelf page and keep only the links that point at books.
    home_page_html = get_text_from_url(url_to_fetch_images)
    all_links_from_home_page = get_all_links_from_html(home_page_html)
    relevent_links_from_home_page = list(filter(relevent_link, all_links_from_home_page))

    book_ids = get_book_ids_from_links(relevent_links_from_home_page)
    book_urls = get_book_text_urls_from_ids(book_ids)
    print('found {} books'.format(len(book_urls)))

    for book_number, url in enumerate(book_urls, start=1):
        print('fetching book via {}'.format(url), end=' ')
        book_text = get_text_from_url(url)
        print('found text with {} characters'.format(len(book_text)))
        all_texts.append({
            'url': url,
            'corpus': book_text
        })

        # Cap the number of downloads so a huge bookshelf cannot run away.
        if max_books is not None and book_number >= max_books:
            return

    return 1
fetch_all_texts()

found 104 books
fetching book via http://www.gutenberg.org/files/7841/7841.txt found text with 68479 characters
fetching book via http://www.gutenberg.org/files/5742/5742.txt found text with 69077 characters
fetching book via http://www.gutenberg.org/files/13539/13539.txt found text with 231140 characters
fetching book via http://www.gutenberg.org/files/7425/7425.txt found text with 238071 characters
fetching book via http://www.gutenberg.org/files/16046/16046.txt found text with 83404 characters
fetching book via http://www.gutenberg.org/files/22420/22420.txt found text with 203822 characters
fetching book via http://www.gutenberg.org/files/23424/23424.txt found text with 78 characters
fetching book via http://www.gutenberg.org/files/18217/18217.txt found text with 140578 characters
fetching book via http://www.gutenberg.org/files/24053/24053.txt found text with 99211 characters
fetching book via http://www.gutenberg.org/files/24644/24644.txt found text with 89469 characters
fetching 

1

# Score Vocabulary Size

Various methods to help get vocabulary size

In [9]:
def get_words_from_corpus(corpus):
    """Split ``corpus`` into lowercase, purely alphabetic word tokens.

    Every character that is not an ASCII letter or whitespace is deleted,
    the remainder is lowercased, and the result is tokenized with NLTK's
    ``word_tokenize``.

    NOTE(review): characters are deleted rather than replaced by a space, so
    words joined only by punctuation (e.g. ``well-known``) are merged into
    one token. Confirm this is intended, since it can inflate long-word counts.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence and
    # triggers a DeprecationWarning. The compiled pattern is unchanged.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', corpus)
    return word_tokenize(letters_only.lower())

In [10]:
def get_unique_words(words):
    """Return the distinct entries of ``words`` as a set."""
    vocabulary = set()
    vocabulary.update(words)
    return vocabulary

In [12]:
def get_vocabulary_score(vocab_size, highest_vocab_size):
    """Scale ``vocab_size`` linearly against the largest observed value.

    Args:
        vocab_size: The count being scored.
        highest_vocab_size: The largest count among all texts.

    Returns:
        ``vocab_size / highest_vocab_size`` as a float, or ``0.0`` when
        ``highest_vocab_size`` is 0.
    """
    # A maximum of 0 means every text scored 0 (e.g. no text has any long
    # word); report 0.0 instead of raising ZeroDivisionError.
    if highest_vocab_size == 0:
        return 0.0
    return float(vocab_size) / highest_vocab_size

get_vocabulary_score(10, 100)

0.1

In [13]:
def get_long_word_count(vocab, minimum_size=15):
    """Count the entries of ``vocab`` that are at least ``minimum_size`` long.

    Args:
        vocab: Iterable of words.
        minimum_size: Minimum number of characters for a word to count as
            long. Defaults to 15.

    Returns:
        The number of long words as an int.
    """
    # Count lazily instead of materialising a list just to measure it.
    return sum(1 for word in vocab if len(word) >= minimum_size)

get_long_word_count(['kj', 'superlongwordthatmakesnosense'])

1

In [14]:
def get_lexical_diversity(words, unique_words):
    """Return the ratio of distinct words to total words.

    Args:
        words: All word tokens of a text, in order.
        unique_words: The distinct tokens of the same text.

    Returns:
        ``len(unique_words) / len(words)`` as a float, or ``0.0`` when
        ``words`` is empty.
    """
    # A text with no tokens would otherwise raise ZeroDivisionError.
    if len(words) == 0:
        return 0.0
    return float(len(unique_words)) / len(words)

### Get vocabulary score for each item

In [15]:
# Loop through all_texts
# Get the largest text
# Score each item based on largest text
#  - Add text scores to each record of all_texts
def score_vocabulary_size():
    """Measure and score every record in the module-level ``all_texts``.

    First pass: tokenizes each record's ``'corpus'`` and stores
    ``'vocabulary_size'``, ``'long_word_count'`` and ``'lexical_diversity'``
    on the record.

    Second pass: scales vocabulary size and long-word count against the
    largest value seen in any record, stores ``'vocabulary_score'``,
    ``'long_word_score'`` and ``'text_difficulty'`` (the sum of the two
    scores and the lexical diversity), and prints one summary line per text.

    Returns:
        None. Results are written into the records of ``all_texts``.
    """
    # Pass 1: raw measurements for every text.
    for text in all_texts:
        words = get_words_from_corpus(text['corpus'])
        unique_words = get_unique_words(words)

        text['vocabulary_size'] = len(unique_words)
        text['long_word_count'] = get_long_word_count(unique_words)
        text['lexical_diversity'] = get_lexical_diversity(words, unique_words)

    # The largest measurements are the denominators for the linear scores.
    # default=0 covers an empty all_texts.
    highest_vocab = max((text['vocabulary_size'] for text in all_texts), default=0)
    highest_long_word_count = max((text['long_word_count'] for text in all_texts), default=0)

    # Pass 2: score each text relative to the maxima.
    for text in all_texts:
        lexical_diversity = text['lexical_diversity']

        vocabulary_score = get_vocabulary_score(text['vocabulary_size'], highest_vocab)
        long_word_score = get_vocabulary_score(text['long_word_count'], highest_long_word_count)
        text_difficulty = vocabulary_score + long_word_score + lexical_diversity

        text['vocabulary_score'] = vocabulary_score
        text['long_word_score'] = long_word_score
        # Keep the combined score on the record so later cells can rank texts
        # without recomputing it.
        text['text_difficulty'] = text_difficulty
        # .get: records built by hand may not carry a 'url' key.
        print('text difficulty: {0:.2f}, v score: {1:.2f}, lw score: {2:.2f}, lexical diversity: {3:.2f}, url: {4}'.format(text_difficulty, vocabulary_score, long_word_score, lexical_diversity, text.get('url')))

score_vocabulary_size()

text difficulty: 0.29, v score: 0.10, lw score: 0.04, lexical diversity: 0.14, url: http://www.gutenberg.org/files/7841/7841.txt
text difficulty: 0.32, v score: 0.12, lw score: 0.05, lexical diversity: 0.15, url: http://www.gutenberg.org/files/5742/5742.txt
text difficulty: 0.57, v score: 0.32, lw score: 0.14, lexical diversity: 0.11, url: http://www.gutenberg.org/files/13539/13539.txt
text difficulty: 0.52, v score: 0.31, lw score: 0.09, lexical diversity: 0.11, url: http://www.gutenberg.org/files/7425/7425.txt
text difficulty: 0.33, v score: 0.12, lw score: 0.09, lexical diversity: 0.12, url: http://www.gutenberg.org/files/16046/16046.txt
text difficulty: 0.30, v score: 0.16, lw score: 0.08, lexical diversity: 0.06, url: http://www.gutenberg.org/files/22420/22420.txt
text difficulty: 0.91, v score: 0.00, lw score: 0.00, lexical diversity: 0.91, url: http://www.gutenberg.org/files/23424/23424.txt
text difficulty: 0.38, v score: 0.17, lw score: 0.09, lexical diversity: 0.11, url: http: