In [5]:
# Standard library
import heapq
import re
from urllib.request import urlopen

# Third-party
import nltk
import numpy as np
import spacy
from lxml import etree

# NLTK data packages: tokenizer models and the stop-word corpus.
# Re-running is cheap: NLTK reports "already up-to-date" when they are present.
nltk.download('punkt')
nltk.download('stopwords')

# spaCy English pipeline, loaded once and shared by every cell below.
nlp = spacy.load('en_core_web_md')




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hugho\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hugho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
def _crawl(url, xpath_expression):
    """Download ``url`` and return the text of the elements matched by ``xpath_expression``.

    Only each matched element's leading text (``elem.text``) is collected;
    elements without leading text are skipped. The pieces are joined with
    single spaces.

    Raises:
        ValueError: if the expression yields no text at all, so the caller
            gets a clear message instead of a failure deep in the summarizer.
    """
    # The context manager closes the HTTP response as soon as lxml has read it.
    with urlopen(url) as response:
        tree = etree.parse(response, etree.HTMLParser())
    root = tree.getroot()
    fragments = [
        elem.text
        for elem in root.xpath(xpath_expression)
        if elem.text is not None
    ]
    text_string = ' '.join(fragments)
    if not text_string.strip():
        raise ValueError(
            f'No text extracted from {url!r} with XPath {xpath_expression!r}'
        )
    return text_string


def _summarize(text_string, num_sentences=7, max_sentence_words=30):
    """Return an extractive summary of ``text_string``.

    Sentences are scored by summing the normalised frequency of their
    non-stop-word tokens; the ``num_sentences`` highest-scoring sentences
    are joined with spaces. Sentences with ``max_sentence_words`` or more
    space-separated words are never selected. Returns ``''`` when the text
    contains no scoreable words.
    """
    # Strip bracketed numeric markers such as "[12]", then collapse whitespace.
    article_text = re.sub(r'\[[0-9]*\]', ' ', text_string)
    article_text = re.sub(r'\s+', ' ', article_text)
    # Letters-only copy used purely for counting words. Replacing punctuation
    # with spaces creates new runs of spaces, hence the second collapse.
    letters_only = re.sub('[^a-zA-Z]', ' ', article_text)
    letters_only = re.sub(r'\s+', ' ', letters_only)

    # Sentences come from the punctuated text so the summary stays readable.
    sentence_list = nltk.sent_tokenize(article_text)
    # A set gives O(1) membership tests instead of scanning a list per word.
    stopwords = set(nltk.corpus.stopwords.words('english'))

    # Count words in lower case: the stop-word list is lower case and the
    # scoring loop below looks words up in lower case, so counting the
    # original-case tokens would let "The" through as a content word and
    # would make every capitalised word unmatchable during scoring.
    word_frequencies = {}
    for word in nltk.word_tokenize(letters_only.lower()):
        if word not in stopwords:
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    if not word_frequencies:
        # Nothing but stop words / non-letters: there is nothing to rank.
        return ''

    # Normalise counts to (0, 1] relative to the most frequent word.
    maximum_frequency = max(word_frequencies.values())
    word_frequencies = {
        word: count / maximum_frequency
        for word, count in word_frequencies.items()
    }

    sentence_scores = {}
    for sent in sentence_list:
        # The length check does not depend on the word, so do it once.
        if len(sent.split(' ')) >= max_sentence_words:
            continue
        matched = [
            word_frequencies[word]
            for word in nltk.word_tokenize(sent.lower())
            if word in word_frequencies
        ]
        # Sentences without any scoreable word are left out entirely;
        # repeated sentences accumulate into the same entry.
        if matched:
            sentence_scores[sent] = sentence_scores.get(sent, 0) + sum(matched)

    # key=sentence_scores.get ranks the sentences (dict keys) by their score.
    top_sentences = heapq.nlargest(
        num_sentences, sentence_scores, key=sentence_scores.get
    )
    return ' '.join(top_sentences)


def similarity(url_1, url_2, xpath_expression, num_sentences=7, max_sentence_words=30):
    """Summarise two web pages and measure how similar their text is.

    For each URL the page is downloaded, the text of the elements matched by
    ``xpath_expression`` is extracted, and an extractive summary is printed.
    The full extracted texts (not the summaries) are then compared with the
    module-level spaCy pipeline ``nlp``.

    Args:
        url_1: URL of the first page.
        url_2: URL of the second page.
        xpath_expression: XPath selecting the elements whose text is used.
        num_sentences: maximum number of sentences in each printed summary.
        max_sentence_words: sentences with this many space-separated words
            or more are excluded from the summaries.

    Returns:
        The value of ``Doc.similarity`` for the two extracted texts. It is
        also printed, as before; earlier versions returned ``None``.

    Raises:
        ValueError: if no text can be extracted from one of the pages.
    """
    content1 = _crawl(url_1, xpath_expression)
    summary1 = _summarize(content1, num_sentences, max_sentence_words)
    print(f'Summary: \n{summary1}\n')

    content2 = _crawl(url_2, xpath_expression)
    summary2 = _summarize(content2, num_sentences, max_sentence_words)
    print(f'Summary: \n{summary2}\n')

    score = nlp(content1).similarity(nlp(content2))
    print(f'Text Similarity: {score}')

    return score




In [35]:
# The two bbc.com articles to compare.
url_1 = 'https://www.bbc.com/future/article/20210507-the-myths-that-hint-at-past-disasters'
url_2 = 'https://www.bbc.com/worklife/article/20210507-why-we-glorify-the-cult-of-burnout-and-overwork'

# Every descendant element of any <div> whose class attribute contains "body"
# and that sits inside an <article> element.
xpath_exp = './/article//div[contains(@class, "body")]//*'

# Prints a summary of each page followed by their text-similarity score.
similarity(url_1, url_2, xpath_exp)



['“One thing that we can learn from these ancient stories is that sea level rise cannot be stopped very easily by sea defences like sea walls,” says Nunn.', '“I imagine that global warming and climate change and rising sea levels might inspire new geomyths,” she says.', 'Sea levels started to rise nearly 15,000 years ago with the end of the last ice age.', 'In this story the bad behaviour of a man named Goonyah caused the sea to flood the land and he then organised the people to stop it.', 'In two of the stories, the city’s elaborate defences imply the inhabitants had been fighting a losing battle against the sea for generations.', "“I'm a conventionally trained geologist and I can tell you that a lot of other conventionally trained geoscientists really don't like this kind of thing.", 'In another, he led the people up a mountain to escape the water where they worked together to roll heated rocks into the sea.']
['People work long hours all over the world, for many different reasons.',