In [1]:
# setup
import urllib
from bs4 import BeautifulSoup
from bs4.element import Comment
import re

from collections import defaultdict, deque

import concurrent

### Zadanie 3.
Napisz własny system do indeksowania stron internetowych, który

• przegląda strony i zapamiętuje liczbę wystąpień poszczególnych słów na poszczególnych stronach;

• zachowuje się podobnie jak pythonowy słownik, gdzie kluczem jest słowo, a wartością lista stron na których to słowo występuje (bądź lista pusta). Strony powinny być uszeregowane malejąco względem podanej liczby wystąpień. Możesz też zaproponować własną strategię rankowania stron.

Zakładamy, że indeksujemy tylko stronę wskazaną jako parametr odpowiedniej funkcji czy metody, oraz strony do których da się dojść po linkach a href w nie więcej niż z góry zadana liczba kroków.

In [2]:
def load(url):
    
    try:
        urllib.request.urlopen(url)
    except urllib.request.HTTPError as e:
        print('Ignored: ', e)
        return -1
    except urllib.request.URLError as e:
        print('URLError: ', e)
        return -1
    except Exception as e:
        print('Error: ', e)
        return -1
    
    page = urllib.request.urlopen(url).read()
    data = BeautifulSoup(page, "html.parser")
    return data


def find_hrefs(url):
    content = load(url)
    if content == -1:
        return []
    ahrefs = content.findAll('a', href=True)
    
    res = []
    for a in ahrefs:
        link = a['href']
        
        if link is None or len(link) == 0:
            continue
        if re.search('^http', link) is not None:
            res.append(link)
        else:
            if link[0] != '/':
                res.append(url + link)
            else:
                link_start = link.split('/')[1]
                find = re.search('/'+link_start+'/', url)
                if find is not None:
                    res.append(url[:find.start()] + link)
                else:
                    res.append(url + link)
    return res


def count_words(url):
    '''
    returns dictionary of - word: (url, #occurances)
    '''
    def find_words(url):
        def text_from_html(url):
            def tag_visible(element):
                return element.parent.name not in ['style', 'script', 'head', 'title', 'meta', '[document]'] and not isinstance(element, Comment)

            content = load(url)
            if content == -1:
                return ""
            texts = content.findAll(text=True)
            visible_texts = filter(tag_visible, texts)
            return u" ".join(t.strip() for t in visible_texts)
        
        text = text_from_html(url)
        text = [re.sub('\W+', '', t) for t in text.lower().split()]
        return list(filter(len, text))
    
    words = find_words(url)
    word_counts = defaultdict(int)
    for w in words:
        word_counts[w] += 1
        
    res = defaultdict(list)
    for word, occurances in word_counts.items():
        res[word] = (url, occurances)
    return res


def sort_dict(d):
    return dict(sorted(d.items(), key=lambda kv: kv[1], reverse=True))


def BFS(url, steps, max_steps):
    '''
    returns dict - word: list of (website, #occurances)
    '''
    q = deque()
    q.append((url, steps))
    visited = {url}
    website_word_counts = defaultdict(list)
    
    while(len(q)):
        url, steps = q.popleft()
        if steps > max_steps:
            break
        
        print(f'depth: {steps} / {max_steps}\t:\t{url}')
        for word, wb_occ in count_words(url).items():
            website_word_counts[word].append(wb_occ)
    
        for h in find_hrefs(url):
            if h not in visited:
                visited.add(h)
                q.append((h, steps+1))
    
    # sort websites by no of appereances for every word
    for k, v in website_word_counts.items():
        website_word_counts[k] = list(sorted(website_word_counts[k], key=lambda kv: kv[1], reverse=True))
    return website_word_counts

In [6]:
def most_occurances(bfs):
    word_occ_counts = {}
    for k, v in bfs.items():
        word_occ_counts[k] = sum(list(map(lambda kv : kv[1], v)))
    word_occ_counts = sort_dict(word_occ_counts)
    woc = next(iter(word_occ_counts.items()))
    return woc

def most_websites_for_word(bfs):
    word_web_counts = {}
    for k, v in bfs.items():
        word_web_counts[k] = len(v)
    word_web_counts = sort_dict(word_web_counts)
    wwc = next(iter(word_web_counts.items()))
    return wwc

In [5]:
%%time
url = ['https://pl.wikipedia.org/wiki/Beat_Squad', 'https://pl.wikipedia.org/wiki/Bad_Wildbad']

bfs = BFS(url[0], 0, 1)

depth: 0 / 1	:	https://pl.wikipedia.org/wiki/Beat_Squad
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Beat_Squad#mw-head
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Beat_Squad#p-search
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Plik:Czarny_(Beat_Squad).jpg
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/1998_w_muzyce
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Polska
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Hip-hop
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/G-Funk
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Beat_Squad#cite_note-Beat-1
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Wydawnictwo_muzyczne
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Blend_Records
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Rafi_(raper)
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Koni
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Pozna%C5%84
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Beat_Squad#cite_note-pl-2
depth: 1 / 1	:	https://pl.wikipedia.org/wiki/Killaz_Group
depth: 1 / 1	:	https://pl.wikiped

In [7]:
woc = most_occurances(bfs)
wwc = most_websites_for_word(bfs)
print(f'The most occurances had the word \'{woc[0]}\': {woc[1]}')
print(f'The word \'{wwc[0]}\' appeared on most websites: {wwc[1]}')

The most occurances had the word 'w': 4773
The word 'do' appeared on most websites: 70


#### Multiprocessing

In [8]:
def BFS_concurrent(url, max_depth):
    visited = {url}

    all_websites = [url]
    cur_websites = [url]
    
    # get all hrefs for every website
    for _ in range(max_depth):
        with concurrent.futures.ThreadPoolExecutor() as executor:
            cur_websites = list(executor.map(find_hrefs, cur_websites))[0]
        all_websites += cur_websites
    all_websites = list(set(all_websites))

    # find words for every website
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list_d = list(executor.map(count_words, all_websites))  # list of dicts - word: (website, #occurances)
    
    website_word_counts = defaultdict(list)
    for d in list_d:
        with concurrent.futures.ThreadPoolExecutor() as executor:
            executor.map(lambda kv: website_word_counts[kv[0]].append(kv[1]), d.items())
        
    # sort websites by no of appereances for every word
    for k, v in website_word_counts.items():
        website_word_counts[k] = list(sorted(website_word_counts[k], key=lambda kv: kv[1], reverse=True))
    
    return website_word_counts

In [9]:
%%time
bfs_concurrent = BFS_concurrent(url[0], 1)

Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not FoundIgnored: 
 HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Error:  'ascii' codec can't encode character '\u015b' in position 24: ordinal not in range(128)
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 403: Forbidden
Ignored:  HTTP Error 404: Not Found
Ignored:  HTTP Error 404: Not Found
CPU times: user 9.36 s, sys: 497 ms, total: 9.85 s
Wall time: 26 s


In [10]:
woc = most_occurances(bfs_concurrent)
wwc = most_websites_for_word(bfs_concurrent)

print(f'The most occurances had the word \'{woc[0]}\': {woc[1]}')
print(f'The word \'{wwc[0]}\' appeared on most websites: {wwc[1]}')

The most occurances had the word 'w': 4777
The word 'do' appeared on most websites: 70
