In [1]:
from lib.robots import check_robots_txt
from lib.shingling import *
from lib.renderer import *
from lib.crawler import crawl_url
from lib.data import *
from lib.normalize import normalize_corpus
from engine import *

import json

from web.site_generator import build_site_data, build_sites
from web.publish import Publish

import config as cfg

# Main 

In [2]:
# Dump every setting exposed by the `config` module so the reader can see
# which parameters this run used.
print('Configuration File:\n')
# A plain loop replaces the earlier throwaway list comprehension: it does not
# build a list of None values, and it does not rebind `_`, which IPython uses
# to hold the previous cell output.
for name in dir(cfg):
    # Same filter as before: skip any name that contains a double underscore.
    if '__' not in name:
        print(name, ':', vars(cfg)[name])

Configuration File:

bert_weight : 0.7
browser_user_agent : Googlebot
crawler_seed : https://data-science-blog.github.io/Customer-Data-Platform/
embedding_size : 100
i_type : bm25
per_sec_crawl_rate : 1
pr_weight : 0.7
sg_gh_user : jroakes-locomotive
sg_page_template : ---
layout: post
title:  {title}
categories: [{topic}]
---

{content}


sg_save_folder : files
sg_sites : [{'topic': 'python software', 'org_name': 'python-software'}, {'topic': 'data science', 'org_name': 'data-science-blog'}, {'topic': 'search engine optimization', 'org_name': 'search-engine-optimization-blog'}]
sim_weight : 0.7
title_boost : 3
transformer_model : distilbert-base-uncased


## Crawling

In [3]:
# Start the crawl from the seed URL defined in the `config` module instead of
# repeating the literal here; the configuration dump in this notebook shows
# `crawler_seed` holding the URL that used to be hard-coded in this cell.
crawler = Crawler()
crawler.crawl(cfg.crawler_seed)

Crawling: https://data-science-blog.github.io/Customer-Data-Platform/
Crawling: https://data-science-blog.github.io/Consistency-Database-Systems/
Crawling: https://data-science-blog.github.io/about
Crawling: https://data-science-blog.github.io/Big-Data
Crawling: https://data-science-blog.github.io/
Crawling: https://python-software.github.io/Eric-Software
Crawling: https://data-science-blog.github.io/Data
Crawling: https://data-science-blog.github.io/categories
Crawling: https://data-science-blog.github.io/Data/
Crawling: https://data-science-blog.github.io/Committee-On-Data-For-Science-And-Technology/
Crawling: https://python-software.github.io/Core-Python-Programming
Crawling: https://python-software.github.io/Intel-Parallel-Studio
Crawling: https://data-science-blog.github.io/Berkeley-Institute-For-Data-Science
Crawling: https://python-software.github.io/History-Of-Python
Crawling: https://search-engine-optimization-blog.github.io/Audio-Search-Engine
Crawling: https://data-science-b

## Rendering

In [4]:
# Run the rendering step on the crawler. The recorded output shows a pyppeteer
# browser session being opened and one "Rendering: <url>" line per URL, for
# the same URLs that were logged during the crawl.
crawler.render()

[I:pyppeteer.launcher] Browser listening on: ws://127.0.0.1:54834/devtools/browser/96ca47b5-f2df-4962-9944-b9e520c41ad4


Rendering: https://data-science-blog.github.io/Customer-Data-Platform/
Rendering: https://data-science-blog.github.io/Consistency-Database-Systems/
Rendering: https://data-science-blog.github.io/about
Rendering: https://data-science-blog.github.io/Big-Data
Rendering: https://data-science-blog.github.io/
Rendering: https://python-software.github.io/Eric-Software
Rendering: https://data-science-blog.github.io/Data
Rendering: https://data-science-blog.github.io/categories
Rendering: https://data-science-blog.github.io/Data/
Rendering: https://data-science-blog.github.io/Committee-On-Data-For-Science-And-Technology/
Rendering: https://python-software.github.io/Core-Python-Programming
Rendering: https://python-software.github.io/Intel-Parallel-Studio
Rendering: https://data-science-blog.github.io/Berkeley-Institute-For-Data-Science
Rendering: https://python-software.github.io/History-Of-Python
Rendering: https://search-engine-optimization-blog.github.io/Audio-Search-Engine
Rendering: https:

[I:pyppeteer.connection] connection closed


## Indexing

In [5]:
# Construct the indexer from the crawler object, then build the index and the
# BERT embeddings. The recorded output for this cell shows the
# distilbert-base-uncased configuration and weights being loaded from a local cache.
indexer = Indexer(crawler)
indexer.build_index()
indexer.build_bert_embeddings()

loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at C:\Users\jroak\.cache\torch\transformers\a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.1ccd1a11c9ff276830e114ea477ea2407100f4a3be7bdc45d37be9e37fa71c7e
Model config {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": null,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "n_heads": 12,
  "n_layers": 6,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torchscript": false,
  "use_bfloat16": false,
  "vocab_size": 30522
}

loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin from cache at C:\Users\jroak\.c

## Searching

In [8]:
# Query the indexer with a free-text search term.
search_term = 'python programming'
df = indexer.search_index(search_term)
# Bare last expression so the notebook renders the result table
# (title, url and description columns in the recorded output).
df

Unnamed: 0,title,url,description
67fb3a56f4a5b4ada650888415692c1c,Demo Site | Demo site generated from Wikipedia...,https://python-software.github.io/,Demo site generated from Wikipedia topics
ccb6272667b7c94201d9f774c6ed2d47,Core Python Programming Python Software | Demo...,https://python-software.github.io/Core-Python-...,Core Python Programming is a textbook on the P...
983c83a5a8b3496d96406d4917e3df92,Eric Software Python Software | Demo Site,https://python-software.github.io/Eric-Software,eric is a free integrated development environm...
6e1416433f99a49c8f4b2830708db8e6,History Of Python Python Software | Demo Site,https://python-software.github.io/History-Of-P...,The programming language Python was conceived ...
8495394a15fb395351dd4bfd823fecb7,Circuitpython Python Software | Demo Site,https://python-software.github.io/Circuitpython/,CircuitPythonis an open source derivative of t...
e68978a1b854f773f72b4b0b23a48fa7,Comparison Of Integrated Development Environme...,https://python-software.github.io/Comparison-O...,The following tables list notable software pac...
798b6e9b7b0bcfdeb5fd901e0d14754f,Activestate Python Software | Demo Site,https://python-software.github.io/Activestate/,ActiveState Software Inc. is a Canadian softwa...
0b6ffb38999298c6c1de11ca9fd6e70f,Biopython Python Software | Demo Site,https://python-software.github.io/Biopython/,The Biopython Project is an open-source collec...
eed0f1a092770413483ff8bab22e17cb,Anaconda Python Distribution Python Software |...,https://python-software.github.io/Anaconda-Pyt...,Anaconda is a free and open-sourcedistribution...
0d12d0e7e810721289a525936b9722d0,Intel Parallel Studio Python Software | Demo Site,https://python-software.github.io/Intel-Parall...,Intel Parallel Studio XE is a software develop...


# Individual Components

## Crawl Page

In [9]:
# Fetch a single page with crawl_url and print three fields of the returned
# record: the status, the meta block (as indented JSON) and the content.
url = "https://locomotive.agency/"
page_data = crawl_url(url)

# 20-character rule shared by every section heading below.
hash_rule = "#" * 20

print("## Status " + hash_rule + "\n")
print(page_data['status'])

print("\n## Meta " + hash_rule + "\n")
print(json.dumps(page_data['meta'], indent=4))

print("\n## Content " + hash_rule + "\n")
print(page_data['content'])

## Status ####################

200

## Meta ####################

{
    "description": "LOCOMOTIVE\u00ae - 2019 U.S. Search Awards \"Best SEO Agency\". We are an agency team of enterprise technical, and on-page SEO specialists: Moving you forward.",
    "lang": "en",
    "keywords": "",
    "favicon": "",
    "canonical": "https://locomotive.agency/",
    "encoding": "utf-8",
    "robots": [
        "max-snippet:-1",
        "max-image-preview:large",
        "max-video-preview:-1"
    ]
}

## Content ####################

Our team was humbled to be a finalist for 3 U.S. Search Awards, and receive one of the top awards of the night for SEO agencies. All awards are judged upon results for clients, as well as agency operations, by a panel of SEO industry veteran judges.


## Shingling

In [10]:
# Three sample passages: content1 and content2 differ only in their first
# word ("Many" vs "Most"); content3 is an unrelated passage.
content1 = """Many packages don't create a build for every single release which forces your pip to build from source. If you're happy to use the latest pre-compiled binary version, use --only-binary :all: to allow pip to use an older binary version."""
content2 = """Most packages don't create a build for every single release which forces your pip to build from source. If you're happy to use the latest pre-compiled binary version, use --only-binary :all: to allow pip to use an older binary version."""
content3 = """The C++ Build Tools allow you to build C++ libraries and applications targeting Windows desktop. They are the same tools that you find in Visual Studio 2019, Visual Studio 2017, and Visual Studio 2015 in a scriptable standalone installer. Now you only need to download the MSVC compiler toolset you need to build C++ projects on your build servers."""

# (key, text) pairs in the order they are registered and reported.
sample_docs = [
    ('content1', content1),
    ('content2', content2),
    ('content3', content3),
]

hashdb = HashLookup()
for sample_id, sample_text in sample_docs:
    hashdb.add_hash(sample_id, sample_text)

print("## Hashes " + "#"*20 + "\n")
# Show only the first five hash values stored for each sample.
for sample_id, sample_text in sample_docs:
    print(hashdb.get_hash(sample_id)[:5])

print("\n## Length " + "#"*20 + "\n")
print(len(hashdb))

print("\n## Similarity " + "#"*20 + "\n")
# Compare content2 against everything registered in the lookup.
print(hashdb.get_similarity_df(content2))

## Hashes ####################

[-2145608475, -2092676559, -2100324990, -2106062289, -2101729913]
[-2145608475, -2092676559, -2100324990, -2106062289, -2101729913]
[-2091386696, -2135102216, -2102983285, -2091906514, -2113156462]

## Length ####################

3

## Similarity ####################

    urlhash       sim
1  content2  1.000000
0  content1  0.886792
2  content3  0.000000


## PageRank

In [11]:
# Toy graph for the PageRank helper: five URLs and the links between them.
url_list = ['urla', 'urlb', 'urlc', 'urld', 'urle']

# Pairs of linked URLs; presumably (from, to) -- confirm against build_pagerank_df.
link_tuples = [
    ('urla', 'urlb'),
    ('urlc', 'urlb'),
    ('urla', 'urle'),
    ('urle', 'urla'),
    ('urlc', 'urlb'),  # this pair appears twice in the input
    ('urld', 'urle'),
    ('urle', 'urlb'),
]

# Reference values for this graph.
# NOTE(review): pr_valid is defined here but never compared against `df` in this cell.
pr_valid = {
    'url': {0: 'urlb', 1: 'urle', 2: 'urla', 3: 'urld', 4: 'urlc'},
    'score': {
        0: 0.3625498007448575,
        1: 0.2544205750109898,
        2: 0.19976269190396267,
        3: 0.09163346617009499,
        4: 0.09163346617009499,
    },
}

df = build_pagerank_df(url_list, link_tuples)

print("## PageRank " + "#"*20 + "\n")
print(df)

## PageRank ####################

            score
urlhash          
urlb     0.362550
urle     0.254421
urla     0.199763
urld     0.091633
urlc     0.091633


# BERT

In [12]:
from lib.bert import *

# Each query loosely paraphrases the n-gram at the same position in `ngrams`.
queries = [
    'trim a chrismas tree',
    'jog on a path',
    'kindle ebook',
    'italian restaurant',
    'internet landing page',
]
ngrams = [
    'decorate a tree',
    'run on a road',
    'electric book',
    'cafe in italy',
    'website homepage',
]

bert = BERT(dims=None)
bert.add_terms(ngrams)

# For every query, report the registered term judged most similar and its score.
for q in queries:
    best, sim = bert.get_most_similar(q)
    padding = ' ' * (25 - len(q))  # pad short queries so the arrows line up
    score = round(sim, 4)
    print("Query: {} {} ===> Best Guess: {} ({})".format(q, padding, best, score))

loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json from cache at C:\Users\jroak\.cache\torch\transformers\a41e817d5c0743e29e86ff85edc8c257e61bc8d88e4271bb1b243b6e7614c633.1ccd1a11c9ff276830e114ea477ea2407100f4a3be7bdc45d37be9e37fa71c7e
Model config {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": null,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "n_heads": 12,
  "n_layers": 6,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torchscript": false,
  "use_bfloat16": false,
  "vocab_size": 30522
}

loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin from cache at C:\Users\jroak\.c

Query: trim a chrismas tree       ===> Best Guess: decorate a tree (0.8258)
Query: jog on a path              ===> Best Guess: run on a road (0.8056)
Query: kindle ebook               ===> Best Guess: electric book (0.6704)
Query: italian restaurant         ===> Best Guess: cafe in italy (0.7463)
Query: internet landing page      ===> Best Guess: website homepage (0.7787)
