In [15]:
import requests
import pickle
import re
from string import ascii_lowercase, digits
from collections import Counter
from bs4 import BeautifulSoup, NavigableString

In [2]:
def load_pickle(path):
  with open(path, 'rb') as handle:
    return pickle.load(handle)

def save_pickle(variable, path):
    with open(path, 'wb') as handle:
        pickle.dump(variable, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [3]:
def exclude_black_list(content, black_list):
    return False if content.lower() in black_list else True

def between(start, end, exclude=[]):
    while start != end:
        if isinstance(start, NavigableString):
            yield start
        elif start.name in exclude:
            start = start.next_element
        start = start.next_element

def format_file_name(title, titles):
    allowed_letters = ascii_lowercase + digits + '_'

    title = title.split('\n')
    title = title[0] if len(title[0]) > 0 else title[1]
    title = title.strip().replace(' ', '_').lower()
    title = ''.join([letter for letter in title if letter in allowed_letters])
    titles.append(title)
    
    if title in titles:
        title = title + str(titles.count(title))
        
    return title + '.txt', titles

def format_text(start, end, exclude, num_words):
    text = ' '.join(t for t in between(start, end, exclude))
    text = '\n'.join(text.split("\n")[1:]).strip()
    num_words.append(len(text.split(' ')))
    return text, num_words

def save_txt(text, path):
    with open(path, 'w', encoding='utf-8') as f:
        f.write(text)
        
def words_summary(num_words):
    print('Number of unique files with fairy tales:', len(num_words))
    print('Total number of words in all fairy tales:', sum(num_words))
    print('Average number of words in a fairy tale: %d' % (sum(num_words)/len(num_words)))
    print('Number of words in the shortest story: %d, in the longest story: %d' % (min(num_words), max(num_words)))

## Website scraping

In [35]:
main_urls = ['https://www.pitt.edu/~dash/folktexts.html',
             'https://www.pitt.edu/~dash/folktexts2.html']
base_url = 'https://www.pitt.edu/~dash/'

In [5]:
links = []

for url in main_urls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    for a in soup.find_all('a'):
        a_href = a.get('href')
        links.append(a_href)

In [27]:
internal_links = [link.split('#')[0] for link in links if link != None and 'folktexts' not in link and 'http' not in link]
internal_links.remove('')
internal_links = list(set(internal_links))

In [32]:
# Checkpoint

#save_pickle(links, 'data/links.pickle')
#save_pickle(internal_links, 'data/internal_links.pickle')

In [76]:
pages = []
total_progress = len(internal_links)
p = 0

for link in internal_links:
    progress = p*100/total_progress
    
    sub_url = base_url + link
    sub_page = requests.get(sub_url)
    
    pages.append(sub_page)
    
    if p in range(0, total_progress, 20):
        print(progress, '%')
    p += 1

0.0 %
5.747126436781609 %
11.494252873563218 %
17.24137931034483 %
22.988505747126435 %
28.735632183908045 %
34.48275862068966 %
40.229885057471265 %
45.97701149425287 %
51.724137931034484 %
57.47126436781609 %
63.2183908045977 %
68.96551724137932 %
74.71264367816092 %
80.45977011494253 %
86.20689655172414 %
91.95402298850574 %
97.70114942528735 %


In [80]:
# Checkpoint

#save_pickle(pages, 'data/pages.pickle')

## Text processing

In [4]:
pages = load_pickle('data/pages.pickle')
internal_links = load_pickle('data/internal_links.pickle')

In [5]:
soup_list = [BeautifulSoup(page.content, 'html.parser') for page in pages]
print(len(internal_links)==len(soup_list))

True


In [6]:
h2_count = {}
h2_content = []

for i in range(len(internal_links)):
    h2_list = soup_list[i].find_all('h2')
    h2_content.extend([h2.text.strip().lower() for h2 in h2_list])
    h2_count[internal_links[i]] = len(h2_list)

h2_content_count = Counter(h2_content).most_common()

In [7]:
h2_content_count

[('contents', 260),
 ('links to related sites', 75),
 ('related links', 32),
 ('germany', 13),
 ('the changeling', 6),
 ('links to related tales', 6),
 ('the tongue-cut sparrow', 6),
 ("the devil's bridge", 6),
 ('notes and bibliography', 5),
 ('links', 5),
 ('norway', 5),
 ('the hare and the tortoise', 5),
 ('jacob and wilhelm grimm', 5),
 ('the girl without hands', 5),
 ('the dog and the wolf', 4),
 ('the town mouse and the country mouse', 4),
 ('cain and abel', 4),
 ('the two brothers', 4),
 ('the ant and the grasshopper', 4),
 ('the two frogs', 3),
 ('the talkative tortoise', 3),
 ("the brahman's wife and the mongoose", 3),
 ('the lion and the hare', 3),
 ('the eternal jew on the matterhorn', 3),
 ('links to related sites.', 3),
 ('the language of animals', 3),
 ('links to additional texts', 3),
 ('the hand of glory', 3),
 ('jack and the beanstalk', 3),
 ('the alp', 3),
 ('the frog prince', 3),
 ('the cat and the mice', 3),
 ('the werewolf', 3),
 ("death's messengers", 3),
 ('the b

In [8]:
h2_black_list = ['contents', 'links to related sites', 'related links', 'links to related tales', 'notes and bibliography', 'links']

In [9]:
type1 = [k for k,v in h2_count.items() if v != 0]
type2 = [k for k,v in h2_count.items() if v == 0]
print('Pages with h2: %d, Pages without h2: %d' % (len(type1), len(type2)))

content_dict = {internal_links[i]: soup_list[i] for i in range(len(internal_links))}

Pages with h2: 313, Pages without h2: 35


In [10]:
type1_content = {}

for page in type1:
    h2_list = content_dict[page].find_all('h2')
    h2_text = [h2.text.strip().lower() for h2 in h2_list]
    type1_content[page] = h2_text.count('contents') + h2_text.count('table of contents')

In [11]:
type1_with_content = [k for k,v in type1_content.items() if v != 0]
type1_without_content = [k for k,v in type1_content.items() if v == 0]

print('Pages with contents section: %d, Pages without contents section: %d' % (len(type1_with_content), len(type1_without_content)))

Pages with contents section: 262, Pages without contents section: 51


In [12]:
page_skip = 0
titles = []
num_words = []
exclude = ['h3']
end = soup_list[0].new_tag('hr')

for page in type1_with_content:
    type1_page_links = content_dict[page].find_all('a', attrs={'name': True})

    for a in type1_page_links[1:]:
        start = a.find_parent('h2')
        if start != None and exclude_black_list(start.get_text(), h2_black_list):
            title, titles = format_file_name(start.get_text(), titles)
            text, num_words = format_text(start, end, exclude, num_words)
            save_txt(text, 'data/tales/' + title) if num_words[-1] >= 15 else num_words.pop()
        else:
            page_skip += 1

In [13]:
words_summary(num_words)

Number of unique files with fairy tales: 2308
Total number of words in all fairy tales: 1608449
Average number of words in a fairy tale: 696
Number of words in the shortest story: 19, in the longest story: 11084


## KAGGLE

In [16]:
kaggle_file = open('data/kaggle_tales.txt', 'r').read()

patterns = [re.compile('_The Moral_.*?_Another_.*?\n\n\n', re.DOTALL),
            re.compile('\[Illustration.*?\]', re.DOTALL),
            re.compile(r'_.*?_\n\n')]

for pattern in patterns:
    kaggle_file = re.sub(pattern, '', kaggle_file)
tales = kaggle_file.split('\n\n\n\n\n')

In [17]:
kaggle_num_words = []

for tale in tales:
    tale = tale.replace('\n\n', '\n').strip()
    if len(tale)>0:
        tale_split = tale.split('\n')
        title, titles = format_file_name(tale_split[0], titles)
        text = '\n'.join(tale_split[1:])
        kaggle_num_words.append(len(text.split(' ')))
        save_txt(text, 'data/tales/' + title)

In [18]:
words_summary(kaggle_num_words)

Number of unique files with fairy tales: 836
Total number of words in all fairy tales: 2168464
Average number of words in a fairy tale: 2593
Number of words in the shortest story: 1, in the longest story: 105959
