In [1]:
from lib.robots import check_robots_txt
from lib.shingling import *
from lib.renderer import *
from lib.crawler import crawl_url
from lib.data import *
from lib.normalize import normalize_corpus
from engine import *

import json

from web.site_generator import build_site_data, build_sites
from web.publish import Publish

import config as cfg

# Main 

In [2]:
# Dump every public setting of the `config` module (imported as `cfg`) as "name : value".
print('Configuration File:\n')
# A plain loop replaces the earlier list comprehension that existed only for its
# print() side effects and whose result was bound to `_` (IPython's last-output name).
for name in dir(cfg):
    if '__' in name:
        continue  # skip any attribute whose name contains a double underscore
    print(name, ':', vars(cfg)[name])

Configuration File:

bert_weight : 0.7
browser_user_agent : Googlebot
crawler_seed : https://data-science-blog.github.io/Customer-Data-Platform/
embedding_size : 100
i_type : bm25
per_sec_crawl_rate : 1
pr_weight : 0.7
sg_gh_user : jroakes-locomotive
sg_page_template : ---
layout: post
title:  {title}
categories: [{topic}]
---

{content}


sg_save_folder : files
sg_sites : [{'topic': 'python software', 'org_name': 'python-software'}, {'topic': 'data science', 'org_name': 'data-science-blog'}, {'topic': 'search engine optimization', 'org_name': 'search-engine-optimization-blog'}]
sim_weight : 0.7
title_boost : 3
transformer_model : distilbert-base-uncased


## Crawling

In [3]:
# Create the crawler and start crawling from the seed URL.
# The seed is read from the configuration module (`cfg.crawler_seed`) instead of
# repeating the URL literal here, so changing the config changes the crawl target.
crawler = Crawler()
crawler.crawl(cfg.crawler_seed)

Crawling: https://data-science-blog.github.io/Customer-Data-Platform/
Crawling: https://data-science-blog.github.io/Data
Crawling: https://data-science-blog.github.io/categories
Crawling: https://data-science-blog.github.io/about
Crawling: https://data-science-blog.github.io/
Crawling: https://data-science-blog.github.io/Data/
Crawling: https://data-science-blog.github.io/Consistency-Database-Systems/
Crawling: https://data-science-blog.github.io/Big-Data
Crawling: https://python-software.github.io/Eric-Software
Crawling: https://data-science-blog.github.io/Dataintensive-Computing/
Crawling: https://data-science-blog.github.io/Big-Data/
Crawling: https://data-science-blog.github.io/Black-Swan-Data/
Crawling: https://data-science-blog.github.io/Berkeley-Institute-For-Data-Science/
Crawling: https://data-science-blog.github.io/Coding-Bootcamp/
Crawling: https://data-science-blog.github.io/Chief-Data-Officer/
Crawling: https://data-science-blog.github.io/Committee-On-Data-For-Science-And-

In [7]:
# Take an independent deep copy of the `crawler` object created earlier.
# NOTE(review): `newcrawler` is not referenced anywhere else in the visible notebook —
# confirm it is still needed; if it is, `import copy` belongs in the top import cell.
import copy
newcrawler = copy.deepcopy(crawler)

In [1]:
# Display the configuration object of the running IPython kernel.
# NOTE(review): looks like a debugging leftover; the saved output of this cell contains a
# local filesystem path (user profile directory) — consider removing the cell or clearing
# its output before sharing the notebook.
get_ipython().config

{'IPKernelApp': {'connection_file': 'C:\\Users\\jroak\\AppData\\Roaming\\jupyter\\runtime\\kernel-e9c54d4e-dbb3-4ca4-a3a2-a23bce777dfb.json'}}

## Rendering

In [4]:
# Run the crawler's rendering step.
# NOTE(review): the saved output of this cell is
#   "RuntimeError: This event loop is already running".
# Presumably render() drives its own asyncio event loop, which conflicts with the loop the
# Jupyter kernel is already running — TODO confirm in the renderer module and fix there
# (or run this step from a plain Python process), since later cells depend on this one.
crawler.render()

RuntimeError: This event loop is already running

## Indexing

In [None]:
# Wrap the crawler in an Indexer, then run its two build steps in order:
# first the index, then the BERT embeddings.
# What each step computes is defined outside this notebook (Indexer is not defined here).
indexer = Indexer(crawler)
indexer.build_index()
indexer.build_bert_embeddings()

## Searching

In [None]:
# Run a sample query against the indexer built in the previous section.
# The query variable is `search_term`; the call previously passed the undefined name
# `search_query`, which raises NameError on a fresh kernel.
search_term = 'python programming'
df = indexer.search_index(search_term)

# Individual Components

## Crawl Page

In [None]:
# Fetch a single page with crawl_url() and print three parts of the returned mapping:
# the 'status' entry, the 'meta' entry (pretty-printed as JSON) and the 'content' entry.
url = "https://locomotive.agency/"
page_data = crawl_url(url)

rule = "#"*20
# (heading prefix, function producing the text shown under that heading)
sections = (
    ("## Status ", lambda page: page['status']),
    ("\n## Meta ", lambda page: json.dumps(page['meta'], indent=4)),
    ("\n## Content ", lambda page: page['content']),
)

for heading, extract in sections:
    print(heading + rule + "\n")
    print(extract(page_data))

## Shingling

In [None]:
# Sample texts: content1 and content2 differ only in their first word ("Many" vs "Most");
# content3 is a passage on a different subject.
content1 = """Many packages don't create a build for every single release which forces your pip to build from source. If you're happy to use the latest pre-compiled binary version, use --only-binary :all: to allow pip to use an older binary version."""
content2 = """Most packages don't create a build for every single release which forces your pip to build from source. If you're happy to use the latest pre-compiled binary version, use --only-binary :all: to allow pip to use an older binary version."""
content3 = """The C++ Build Tools allow you to build C++ libraries and applications targeting Windows desktop. They are the same tools that you find in Visual Studio 2019, Visual Studio 2017, and Visual Studio 2015 in a scriptable standalone installer. Now you only need to download the MSVC compiler toolset you need to build C++ projects on your build servers."""

# (key, text) pairs registered in the lookup, in insertion order.
samples = (
    ('content1', content1),
    ('content2', content2),
    ('content3', content3),
)
divider = "#"*20

hashdb = HashLookup()
for label, text in samples:
    hashdb.add_hash(label, text)

# First five elements of each stored hash.
print("## Hashes " + divider + "\n")
for label, text in samples:
    print(hashdb.get_hash(label)[:5])

# Size reported by the lookup after the three additions.
print("\n## Length " + divider + "\n")
print(len(hashdb))

# Similarity table computed for content2 against the stored entries.
print("\n## Similarity " + divider + "\n")
print(hashdb.get_similarity_df(content2))

## PageRank

In [None]:
# Toy graph for build_pagerank_df(): five URL labels plus a list of URL pairs
# (the pair ('urlc', 'urlb') appears twice).
url_list = ['url' + suffix for suffix in 'abcde']
link_tuples = [
    ('urla', 'urlb'),
    ('urlc', 'urlb'),
    ('urla', 'urle'),
    ('urle', 'urla'),
    ('urlc', 'urlb'),
    ('urld', 'urle'),
    ('urle', 'urlb'),
]

# Reference url/score table. NOTE(review): it is defined here but never compared with
# the computed frame in this cell — presumably meant as the expected result; confirm.
pr_valid = {
    'url': {0: 'urlb', 1: 'urle', 2: 'urla', 3: 'urld', 4: 'urlc'},
    'score': {
        0: 0.3625498007448575,
        1: 0.2544205750109898,
        2: 0.19976269190396267,
        3: 0.09163346617009499,
        4: 0.09163346617009499,
    },
}
df = build_pagerank_df(url_list, link_tuples)

# Heading, blank line, then the resulting frame.
print("## PageRank " + "#"*20 + "\n", df, sep="\n")

## BERT

In [None]:
from lib.bert import *

# Free-text queries and the candidate phrases they are matched against.
queries = [
    'trim a chrismas tree',
    'jog on a path',
    'kindle ebook',
    'italian restaurant',
    'internet landing page',
]
ngrams = [
    'decorate a tree',
    'run on a road',
    'electric book',
    'cafe in italy',
    'website homepage',
]

# Register the candidate phrases with the BERT helper.
bert = BERT(dims=None)
bert.add_terms(ngrams)

# One report line per query: the query, padding towards a 25-character column,
# the best-matching phrase and its similarity rounded to four decimals.
report_line = "Query: {} {} ===> Best Guess: {} ({})"
for query in queries:
    match, score = bert.get_most_similar(query)
    padding = ' ' * (25 - len(query))
    print(report_line.format(query, padding, match, round(score, 4)))