In [117]:
import httplib2
from bs4 import BeautifulSoup



base_url = 'https://en.wikipedia.org/'

speedrun_examples_site = 'https://wikispeedruns.com/'



def get_and_read_page_and_get_all_links(page_url):

    http = httplib2.Http()
    status, response = http.request(page_url)

    if(status.status != 200): # exit if page was not fetched
        return -1

    soup = BeautifulSoup(response , 'html.parser')

    soup = soup.find("div", id = "bodyContent") # filter only by body content to get rid of additionals sections

    url_list = {}
    for link in soup.find_all('a'):
        if link.has_attr('href') and link.has_attr('title'):
            url_list[link['title']] = link['href']

    return url_list

def read_page_and_get_all_links(page):

    soup = BeautifulSoup(page , 'html.parser')

    soup = soup.find("div", id = "bodyContent") # filter only by body content to get rid of additionals sections

    url_list = {}
    for link in soup.find_all('a'):
        if link.has_attr('href') and link.has_attr('title'):
            url_list[link['title']] = link['href']

    # print(url_list)

    return url_list

        

In [24]:
from torchtext.vocab import GloVe
glove = GloVe(name='6B', dim=100)

.vector_cache\glove.6B.zip: 862MB [04:44, 3.03MB/s]                               
100%|█████████▉| 399999/400000 [00:23<00:00, 16720.12it/s]


In [None]:
import torch

word1 = "king"
word2 = "dragon"
cosine_similarity = torch.nn.functional.cosine_similarity(glove[word1], glove[word2], dim=0)
print(f"Cosine similarity between '{word1}' and '{word2}': {cosine_similarity:.4f}")

Cosine similarity between 'king' and 'dragon': 0.4227


In [None]:

from tensorflow import keras
import numpy as np

with open('words.txt') as f: # txt with all english words to create a extensive vocab
    texts = f.read()

tokenizer = keras.preprocessing.text.Tokenizer(char_level=False)
tokenizer.fit_on_texts([texts])

print("Number of unique words in dictionary =", len(tokenizer.word_index))


Number of unique words in dictionary = 422978


In [102]:
import spacy

nlp = spacy.load("en_core_web_sm")  # loads spacy's english core library
sw_spacy = nlp.Defaults.stop_words

def remove_stopwords(text):
    res = []
    for word in text:
        if word not in sw_spacy:
            res.append(word)
    return res

def fetch_sentence_embedding(sentence):
    
    words = sentence.lower().split() # convert to lower case and split into words
    words = remove_stopwords(words)

    tensor_list = []

    for word in words:
        tensor_list.append(glove[word])

    if(len(tensor_list) == 0):
        return -1

    stacked_tensors = torch.stack(tensor_list)
    sentence_vector = torch.mean(stacked_tensors , dim=0) # average out the word vectors to get sentence vector/tensor

    return sentence_vector


def compare_sentences(sentence1 , sentence2):
    embedding_sentence1 = fetch_sentence_embedding(sentence1)
    
    embedding_sentence2 = fetch_sentence_embedding(sentence2)

    if isinstance(embedding_sentence1, torch.Tensor) == False or isinstance(embedding_sentence2, torch.Tensor) == False:
        return -1
    
    cosine_similarity = torch.nn.functional.cosine_similarity(embedding_sentence1, embedding_sentence2, dim=0)
    return cosine_similarity.item()


In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By

import time

chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
# chrome_options.add_argument("--headless=new") # to run it headless

driver = webdriver.Chrome(options=chrome_options)

base_url = 'https://en.wikipedia.org/wiki/'

visited = {}

def speed_run(start , end , timeout):
    
    page_url = base_url + start.replace(' ','_')

    driver.get(page_url)

    if(start.replace('_',' ') == end):
        print(f"Run for {end} complete")
        return
    
    try:
        element_present = EC.presence_of_element_located(
            (By.ID, 'bodyContent'))
        WebDriverWait(driver, timeout).until(element_present)
    except TimeoutException:
        print("Timed out waiting for page to load")
    finally:
        
        visited[start] = page_url
        
        print("Page visited : ",start)

        html = driver.page_source
        links = read_page_and_get_all_links(html)

        

        max_score = -1
        most_promising_link = ''

        all_links = {}

        

        for key , value in links.items():
            cur_score = compare_sentences(key , end)
            all_links[key] = cur_score

            if(key in visited):
                print("cycle detectd")
            
            if(cur_score > max_score and '/wiki/' in value and '.orgSpecial' not in value and value.replace('/wiki/Wikipedia:','').replace('/wiki/','') not in visited):
                max_score = cur_score
                most_promising_link = value.replace('/wiki/Wikipedia:','').replace('/wiki/','')

        print(most_promising_link , max_score)

        speed_run(most_promising_link , end , timeout)


        # print(dict(sorted(all_links.items() , reverse=True)))

start_time = time.perf_counter()
# speed_run('Bob Dylan','Painting', 5)
# speed_run('Mona Lisa','Bob Dylan', 5)
speed_run('Warsaw','Research',5)

# speed_run('Pink salmon','Hatsune Miku',5)

end_time = time.perf_counter()

elapsed_time = end_time - start_time
print(f"Found page in {elapsed_time:.4f} seconds")
        


Page visited :  Warsaw
Children%27s_Memorial_Health_Institute 0.7591485381126404
Page visited :  Children%27s_Memorial_Health_Institute
cycle detectd
Category:Medical_research_institutes_in_Poland 0.810234546661377
Page visited :  Category:Medical_research_institutes_in_Poland
Category:Medical_research_institutes_by_country 0.8696174621582031
Page visited :  Category:Medical_research_institutes_by_country
Category:Medical_research_institutes 0.9207379817962646
Page visited :  Category:Medical_research_institutes
https://commons.wikimedia.orgCategory:Medical_research_institutes 0.9207379817962646
Page visited :  https://commons.wikimedia.orgCategory:Medical_research_institutes
https://en.wiktionary.orgSpecial:Search/Https://commons.wikimedia.orgCategory:Medical_research_institutes 0.9207379817962646
Page visited :  https://en.wiktionary.orgSpecial:Search/Https://commons.wikimedia.orgCategory:Medical_research_institutes
Why_was_the_page_I_created_deleted%3F 0.44333237409591675
Page visit