In [132]:
# setup
import urllib
from bs4 import BeautifulSoup
from bs4.element import Comment
import re

from collections import defaultdict, deque

import concurrent

### Zadanie 3.
Napisz własny system do indeksowania stron internetowych, który

• przegląda strony i zapamiętuje liczbę wystąpień poszczególnych słów na poszczególnych stronach;

• zachowuje się podobnie jak pythonowy słownik, gdzie kluczem jest słowo, a wartością lista stron na których to słowo występuje (bądź lista pusta). Strony powinny być uszeregowane malejąco względem podanej liczby wystąpień. Możesz też zaproponować własną strategię rankowania stron.

Zakładamy, że indeksujemy tylko stronę wskazaną jako parametr odpowiedniej funkcji czy metody, oraz strony do których da się dojść po linkach a href w nie więcej niż z góry zadana liczba kroków.

In [166]:
def load(url):
    
    try:
        urllib.request.urlopen(url)
    except urllib.request.HTTPError as e:
        print('Ignored: ', e)
        return -1
    except urllib.request.URLError as e:
        print('URLError: ', e)
        return -1
    except Exception as e:
        print('Error: ', e)
        return -1
    
    page = urllib.request.urlopen(url).read()
    data = BeautifulSoup(page, "html.parser")
    return data


def find_hrefs(url):
    content = load(url)
    if content == -1:
        return []
    ahrefs = content.findAll('a', href=True)
    
    res = []
    for a in ahrefs:
        link = a['href']
        
        if link is None or len(link) == 0:
            continue
        if re.search('^http', link) is not None:
            res.append(link)
        else:
            if link[0] != '/':
                res.append(url + link)
            else:
                link_start = link.split('/')[1]
                find = re.search('/'+link_start+'/', url)
                if find is not None:
#                     print(link_start, link, url, find.start(), url[:find.start()])
                    res.append(url[:find.start()] + link)
                else:
                    res.append(url + link)
    return res


def count_words(url):
    '''
    returns dictionary of - word: (url, #occurances)
    '''
    def find_words(url):
        def text_from_html(url):
            def tag_visible(element):
                return not element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]'] and not isinstance(element, Comment)

            content = load(url)
            if content == -1:
                return ""
            texts = content.findAll(text=True)
            visible_texts = filter(tag_visible, texts)
            return u" ".join(t.strip() for t in visible_texts)
        
        text = text_from_html(url)
        text = [re.sub('\W+', '', t) for t in text.lower().split()]
        return list(filter(len, text))
    
    words = find_words(url)
    word_counts = defaultdict(int)
    for w in words:
        word_counts[w] += 1
        
    res = defaultdict(list)
    for word, occurances in word_counts.items():
        res[word] = (url, occurances)
    return res


def sort_dict(d):
    return dict(sorted(d.items(), key=lambda kv: kv[1], reverse=True))


def BFS(url, steps, max_steps):
    '''
    returns dict - word: list of (website, #occurances)
    '''
    q = deque()
    q.append((url, steps))
    visited = {url}
    website_word_counts = defaultdict(list)
    
    while(len(q)):
        url, steps = q.popleft()
        if steps > max_steps:
            break
        
        print(f'depth: {steps} / {max_steps}\t:\t{url}')
        for word, wb_occ in count_words(url).items():
            website_word_counts[word].append(wb_occ)
            
        for h in find_hrefs(url):
            if h not in visited:
                visited.add(h)
                q.append((h, steps+1))
    
    # sort websites by no of appereances for every word
    for k, v in website_word_counts.items():
        website_word_counts[k] = list(sorted(website_word_counts[k], key=lambda kv: kv[1], reverse=True))
    return website_word_counts

In [167]:
%%time
# url = 'https://pl.wikipedia.org/wiki/Specjalna:Losowa_strona'
url = 'https://en.wikipedia.org/wiki/Alpman'
bfs = BFS(url, 0, 1)

depth: 0 / 1	:	https://en.wikipedia.org/wiki/Alpman
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Alpman#mw-head
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Alpman#p-search
depth: 1 / 1	:	https://en.wiktionary.org/wiki/Alpman
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Ayten_Alpman
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Fatma_Serpil_Alpman
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Surname
depth: 1 / 1	:	https://en.wikipedia.org/w/index.php?title=Special:WhatLinksHere/Alpman&namespace=0
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style/Linking
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Given_name
depth: 1 / 1	:	https://en.wikipedia.org/w/index.php?title=Alpman&oldid=925236056
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Help:Category
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Category:Surnames
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Category:Articles_with_short_description
depth: 1 / 1	:	https://en.wikipedia.org/wiki/Category:All_set_index_article

In [125]:
def most_occurances(bfs):
    word_occ_counts = {}
    for k, v in bfs.items():
        word_occ_counts[k] = sum(list(map(lambda kv : kv[1], v)))
    word_occ_counts = sort_dict(word_occ_counts)
    woc = next(iter(word_occ_counts.items()))
    return woc

def most_websites_for_word(bfs):
    word_web_counts = {}
    for k, v in bfs.items():
        word_web_counts[k] = len(v)
    word_web_counts = sort_dict(word_web_counts)
    wwc = next(iter(word_web_counts.items()))
    return wwc

In [172]:
woc = most_occurances(bfs)
wwc = most_websites_for_word(bfs)
print(f'The most occurances had the word \'{woc[0]}\': {woc[1]}')
print(f'The word \'{wwc[0]}\' appeared on most websites: {wwc[1]}')

The most occurances had the word 'the': 4028
The word 'from' appeared on most websites: 50


#### Multiprocessing

In [145]:
def BFS_concurrent(url, max_depth):
    visited = {url}

    all_websites = [url]
    cur_websites = [url]
    
    # get all hrefs for every website
    for _ in range(max_depth):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            cur_websites = list(executor.map(find_hrefs, cur_websites))[0]
        all_websites += cur_websites
    all_websites = list(set(all_websites))
        
    # find words for every website
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list_d = list(executor.map(count_words, all_websites))  # list of dicts - word: (website, #occurances)
    
    website_word_counts = defaultdict(list)
    for d in list_d:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(lambda kv: website_word_counts[kv[0]].append(kv[1]), d.items())
        
    # sort websites by no of appereances for every word
    for k, v in website_word_counts.items():
        website_word_counts[k] = list(sorted(website_word_counts[k], key=lambda kv: kv[1], reverse=True))
    
    return website_word_counts

In [146]:
%%time
bfs_concurrent = BFS_concurrent(url, 1)

Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 403: Forbidden
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
59
CPU times: user 5.18 s, sys: 249 ms, total: 5.43 s
Wall time: 6.34 s


In [135]:
woc = most_occurances(bfs_concurrent)
wwc = most_websites_for_word(bfs_concurrent)

print(f'The most occurances had the word \'{woc[0]}\': {woc[1]}')
print(f'The word \'{wwc[0]}\' appeared on most websites: {wwc[1]}')

The most occurances had the word 'the': 4013
The word 'from' appeared on most websites: 50
