In [13]:
# NOTE: the original import order is kept on purpose. `from keyword_extractor
# import *` can rebind any name imported above it, and the imports below it
# rebind anything it exported, so reordering could change which object a name
# refers to. `pickle` is imported explicitly because later cells call
# pickle.load and must not depend on the star import leaking it.
import selenium as se
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from nltk import tokenize
from time import sleep
from parsel import Selector
import csv
import pickle
import pandas as pd
from keyword_extractor import *
import requests
from bs4 import BeautifulSoup
from pprint import pprint


In [14]:
# driver = webdriver.Chrome("chrome_driver/chromedriver_linux64/chromedriver")

# driver.get('https://duckduckgo.com/')
# #driver.maximize_window()
# driver.minimize_window()
# sleep(1)

In [15]:
def retrieve_top_urls(driver, num_results=10):
    """Return the hrefs of the first `num_results` links on the results page.

    The page currently loaded in `driver` is queried for result containers
    with ids ``r1-0`` .. ``r1-(num_results - 1)``; the ``result__a`` anchor
    inside each one supplies the URL.

    Args:
        driver: a Selenium WebDriver positioned on a search results page.
        num_results: how many results to read, starting from the top
            (defaults to 10, the value that was previously hard-coded).

    Returns:
        A list of `num_results` href values in ranking order.

    Raises:
        Whatever `driver.find_element` raises when a result with the expected
        id is not present on the page (e.g. fewer results than requested).
    """
    retrieved_urls = []

    for i in range(num_results):
        path = "//div[@id='r1-" + str(i) + "']/div/h2/a[@class='result__a']"
        # "xpath" is the value of selenium's By.XPATH. find_element(by, value)
        # exists in both Selenium 3 and 4, whereas find_element_by_xpath was
        # removed in Selenium 4.3.
        link = driver.find_element("xpath", path)
        retrieved_urls.append(link.get_attribute('href'))

    return retrieved_urls

In [16]:
def execute_query(query, driver):
    """Type `query` into the search box of the current page and submit it.

    Args:
        query: the search text to submit.
        driver: a Selenium WebDriver whose current page has a form field
            named ``q``.

    Returns:
        The list of result URLs produced by `retrieve_top_urls(driver)` after
        the search has been submitted.
    """
    # Look the search box up once and reuse it. "name" is the value of
    # selenium's By.NAME; find_element(by, value) works in Selenium 3 and 4,
    # whereas find_element_by_name was removed in Selenium 4.3.
    search_box = driver.find_element("name", "q")
    search_box.clear()

    # Simulate typing the query, then pause briefly before submitting.
    search_box.send_keys(query)
    sleep(1)

    # Simulate the return key and wait for the results page to render.
    search_box.send_keys(Keys.RETURN)
    sleep(2)

    return retrieve_top_urls(driver)

In [17]:
def process_query(query):
    """Run `query` on DuckDuckGo and extract keywords from the top results.

    Opens a Chrome session, submits the query, prints the initial ranking,
    calls `keywordExtractor(url, 10)` for every retrieved URL and merges the
    resulting keywords into one set. Progress is printed along the way.

    Args:
        query: the search text.

    Returns:
        A tuple ``(initial_ranks, merged_keywords)``: the list of result URLs
        in ranking order and the set of keywords gathered from all of them.
    """
    # NOTE(review): passing the chromedriver path positionally is the
    # Selenium 3 calling convention; newer Selenium releases expect a Service
    # object instead. Confirm against the installed selenium version.
    driver = webdriver.Chrome("chrome_driver/chromedriver_linux64/chromedriver")

    try:
        driver.get('https://duckduckgo.com/')
        #driver.maximize_window()
        driver.minimize_window()
        sleep(1)

        print("Query: " + query + '\n')

        # get initial rankings for the query
        print("Initial rankings")
        initial_ranks = execute_query(query, driver)
        for rank, url in enumerate(initial_ranks, start=1):
            print('Rank ' + str(rank) + ': ' + str(url) + '\n')

        # combined keywords from all retrieved pages
        merged_keywords = set()

        for url in initial_ranks:
            keywords = keywordExtractor(url, 10)
            print(str(keywords) + '\n')
            # Iterating `keywords` yields the keyword strings themselves
            # (its keys, when it is a dict as in the printed output).
            merged_keywords.update(keywords)

        print('Merged Keywords from all urls')
        print(merged_keywords)

        print('------------------------------------------------------------------------------------------------------------------')
        print('\n')

        return initial_ranks, merged_keywords
    finally:
        # Always end the browser session, even when the search or keyword
        # extraction raises. quit() shuts down the whole session, whereas
        # close() only closes the current window.
        driver.quit()

In [None]:
# f = open('queries.txt', 'r')
# queries = f.read().split('\n')

# for query in queries:
#     process_query(query)

In [19]:
# keywords database structure

# keyword_db = {
#     'python' : {
#         'score'    : 255.0, 
#         'urls'     : ['https://docs.python.org/3/tutorial/index.html', ]
#     },
# }

# Load the keywords database from file.
# keyword_db is indexed by keyword; later cells read keyword_db[word]['score'].
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load data files that this project produced itself.

filename = "./data/keyword_db.data"

with open(filename, 'rb') as f:
    keyword_db = pickle.load(f)


In [20]:
# Load the clustering: `partition` is indexed by word and yields the id of
# the cluster that word belongs to (see how the following cells use it).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load data files that this project produced itself.

filename = "./data/partition.data"

with open(filename, 'rb') as f:
    partition = pickle.load(f)


In [21]:
# Group the partitioned words by cluster id. For every cluster we keep the
# sum of its words' scores, the number of words, and the words themselves.
clusters = {}

for word in partition:
    i_cluster, word_score = partition[word], keyword_db[word]['score']

    if i_cluster in clusters:
        # Known cluster: fold this word into its running totals.
        bucket = clusters[i_cluster]
        bucket['cluster_score'] += word_score
        bucket['cluster_size'] += 1
        bucket['words'].append(word)
        continue

    # First word seen for this cluster: start a new record.
    clusters[i_cluster] = {
        'cluster_score' : word_score,
        'cluster_size' : 1,
        'words' : [word]
    }

In [23]:
# Run the query through process_query, which returns the result URLs in
# ranking order together with the set of keywords merged from all of them.

query = 'nasdaq market'
initial_ranks, merged_keywords = process_query(query)

Query: nasdaq market

Initial rankings
Rank 1: https://www.nasdaq.com/

Rank 2: https://money.cnn.com/data/markets/

Rank 3: https://www.marketwatch.com/investing/index/comp

Rank 4: https://www.nasdaq.com/market-activity/indexes

Rank 5: https://www.nasdaq.com/authors/the-market-intelligence-desk-team

Rank 6: https://www.nasdaq.com/news-and-insights/markets

Rank 7: https://www.nasdaq.com/about

Rank 8: https://www.marketwatch.com/investing/nasdaq-stock-market

Rank 9: http://www.nasdaqtrader.com/Trader.aspx?id=TradingUSEquities

Rank 10: https://www.investopedia.com/terms/n/nasdaq-smallcap-market.asp

{'data': 0.002271059982625813, 'nasdaq': 0.003760253085110521, 'market': 0.0038237432647749534, 'commodities': 0.015744093100007895, 'var': 0.018432677503803816, 'republic': 0.021103897422225498, 'quotes': 0.02127410957887517, 'calendar': 0.022034695898317212, 'solutions': 0.022034695898317212, 'islands': 0.022734389975619978}

{'updated': 0.010775619890632157, 'dow': 0.007507719388063

In [28]:
# Display the URLs returned for the query, in their original ranking order.
initial_ranks

['https://www.nasdaq.com/',
 'https://money.cnn.com/data/markets/',
 'https://www.marketwatch.com/investing/index/comp',
 'https://www.nasdaq.com/market-activity/indexes',
 'https://www.nasdaq.com/authors/the-market-intelligence-desk-team',
 'https://www.nasdaq.com/news-and-insights/markets',
 'https://www.nasdaq.com/about',
 'https://www.marketwatch.com/investing/nasdaq-stock-market',
 'http://www.nasdaqtrader.com/Trader.aspx?id=TradingUSEquities',
 'https://www.investopedia.com/terms/n/nasdaq-smallcap-market.asp']

In [25]:
# Find the best matching cluster: every merged keyword that appears in the
# partition adds its keyword_db score to the cluster it belongs to, and the
# cluster with the highest accumulated score wins.

cluster_scores = {}

max_score = 0
best_matching_cluster_id = -1

for word in merged_keywords:
    if word not in partition:
        # Keyword was never clustered; it cannot vote for any cluster.
        continue
    word_score = keyword_db[word]['score']
    i_cluster = partition[word]
    cluster_scores[i_cluster] = cluster_scores.get(i_cluster, 0) + word_score

if cluster_scores:
    # Pick the winner from the final totals. The earlier version stored the
    # cluster *id* in max_score, so later comparisons were made against an
    # id rather than a score and the wrong cluster could be selected.
    best_matching_cluster_id = max(cluster_scores, key=cluster_scores.get)
    max_score = cluster_scores[best_matching_cluster_id]
    best_matching_cluster = clusters[best_matching_cluster_id]['words']
else:
    # No merged keyword is present in the partition: there is no cluster to
    # choose, so use an empty word list instead of failing on clusters[-1].
    best_matching_cluster = []

print(best_matching_cluster)

['stocks', 'stock', 'market', 'ago', 'apr.', 'screener', 'updated', 'apr', 'wed', 'politics', 'essentials', 'investing', 'trading', 'management']


As the output above shows, this step identifies the cluster that best matches the query 'nasdaq market'.
The chosen cluster is dominated by terms related to stocks and markets (e.g. 'stocks', 'market', 'investing', 'trading').

In [26]:
def seed_score(rank):
    """Return the starting score for a URL at position `rank`.

    Placeholder: `rank` is currently ignored and every URL starts at 0.
    """
    return 0

In [None]:
# Re-score every retrieved URL: start from seed_score(position) and add, for
# each word of the best matching cluster, (occurrences in the page text) *
# (that word's keyword_db score). The URLs are then sorted by score.

url_scores = []

headers = {'User-Agent': 'Mozilla/5.0'}

# Text nodes whose parent tag is listed here are left out of the page text.
blacklist = [
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    # there may be more elements you don't want, such as "style", etc.
]


def _fetch_html(url, tries=5, timeout=3):
    """Return the raw body of `url`, or None if every attempt fails."""
    for _ in range(tries):
        try:
            return requests.get(url, headers=headers, timeout=timeout).content
        except requests.RequestException:
            # Network-level failure (timeout, connection error, bad URL):
            # try again until the attempts are used up.
            continue
    return None


def _page_text(html_page):
    """Join the text nodes of `html_page` whose parent tag is not blacklisted."""
    soup = BeautifulSoup(html_page, 'html.parser')
    return ''.join(
        '{} '.format(t)
        for t in soup.find_all(text=True)
        if t.parent.name not in blacklist
    )


# score urls
for i, url in enumerate(initial_ranks):

    score = seed_score(i)

    html_page = _fetch_html(url)
    if html_page is None:
        # Previously a failed download left `res` undefined (first URL) or
        # pointing at the previous URL's response, so the wrong page was
        # scored. Now the URL simply keeps its seed score.
        print('Could not fetch ' + str(url) + '; keeping its seed score only')
        page_text = ''
    else:
        page_text = _page_text(html_page)

    # NOTE: str.count is a case-sensitive substring count, so 'stock' is also
    # counted inside 'stocks'. Kept as-is so the scoring scheme is unchanged.
    for word in best_matching_cluster:
        cnt = page_text.count(word)
        score += cnt * keyword_db[word]['score']

    url_scores.append((url, score))


# sort urls based on scores

url_scores = sorted(url_scores, key=lambda pair: pair[1], reverse=True)

In [63]:
# Print the re-ranked URLs, best score first, as (url, score) pairs.
for rank, scored_url in enumerate(url_scores):
    print('Rank ', rank + 1, ' : ', scored_url)

Rank  1  :  ('https://www.marketwatch.com/investing/nasdaq-stock-market', 10722.780811664628)
Rank  2  :  ('https://www.investopedia.com/terms/n/nasdaq-smallcap-market.asp', 1570.1317290156721)
Rank  3  :  ('https://www.nasdaq.com/authors/the-market-intelligence-desk-team', 1055.9988964431948)
Rank  4  :  ('https://money.cnn.com/data/markets/', 1048.1574068559394)
Rank  5  :  ('https://www.nasdaq.com/about', 1009.7156984127919)
Rank  6  :  ('https://www.nasdaq.com/news-and-insights/markets', 933.8417011189289)
Rank  7  :  ('https://www.nasdaq.com/market-activity/indexes', 607.2179579019202)
Rank  8  :  ('http://www.nasdaqtrader.com/Trader.aspx?id=TradingUSEquities', 476.16341346832644)
Rank  9  :  ('https://www.nasdaq.com/', 240.279998046875)
Rank  10  :  ('https://www.marketwatch.com/investing/index/comp', 165.28534369705386)
