In [1]:
from lib.robots import check_robots_txt
from lib.shingling import *
from lib.renderer import *
from lib.crawler import crawl_url
from lib.data import *
import json

from web.site_generator import build_site_data, build_sites
from web.publish import Publish

import config as cfg

To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


## Crawl Page

In [4]:
# Crawl a single URL and print its status, metadata and extracted content,
# each under its own banner line. (The recorded run of this cell got a 404.)
url = "https://locomotive.agency/jfkaljflsjf/"
page_data = crawl_url(url)


def _banner(title):
    """Return `title` followed by a rule of 20 '#' characters and a newline."""
    return title + "#"*20 + "\n"


print(_banner("## Status "))
print(page_data['status'])

print(_banner("\n## Meta "))
# The meta entry is pretty-printed as JSON so nested values stay readable.
meta_as_json = json.dumps(page_data['meta'], indent=4)
print(meta_as_json)

print(_banner("\n## Content "))
print(page_data['content'])

## Status ####################

404

## Meta ####################

{
    "description": "",
    "lang": "en",
    "keywords": "",
    "favicon": "",
    "canonical": null,
    "encoding": "utf-8",
    "robots": [
        "noindex",
        "follow"
    ]
}

## Content ####################

Adapt is now . Same team, new name.


In [7]:
# Sanity-check crawl_url against http://example.com/.
info = crawl_url('http://example.com/')

# Every assert carries a message with the actual value, so a failure names the
# field that diverged instead of surfacing as a bare "AssertionError".
# NOTE(review): the `info` dump recorded elsewhere in this notebook shows
# 'domain': None, which would trip the domain check below -- confirm whether
# crawl_url is expected to populate 'domain'.
assert info['meta']['robots'] == [], \
    "meta.robots: expected [], got {!r}".format(info['meta']['robots'])
# Identity comparison is the idiomatic (PEP 8) way to test for None.
assert info['meta']['canonical'] is None, \
    "meta.canonical: expected None, got {!r}".format(info['meta']['canonical'])
assert info['domain'] == 'example.com', \
    "domain: expected 'example.com', got {!r}".format(info['domain'])
assert info['title'] == 'Example Domain', \
    "title: expected 'Example Domain', got {!r}".format(info['title'])

AssertionError: 

In [4]:
from goose3 import Goose

# Fetch http://example.com/ with Goose directly and run its extractor on the
# raw response body. `g`, `r` and `page` are left in the notebook namespace
# for the inspection cells that follow.
goose_config = {
    'browser_user_agent': cfg.browser_user_agent,
    'parser_class': 'soup',
}
g = Goose(goose_config)

r = g.fetcher.fetch_obj('http://example.com/')
raw_body = r.content
page = g.extract(raw_html=raw_body)

In [5]:
# Debug inspection: show the instance attributes (__dict__) of `r`, the object
# returned by g.fetcher.fetch_obj in the previous cell.
vars(r)

{'_content': b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>

In [3]:
# Debug inspection: display the dict bound to `info` (the cell that assigns it
# from crawl_url('http://example.com/') appears earlier in the notebook).
# NOTE(review): this cell's execution count (3) is lower than the assigning
# cell's (7), so the recorded output may come from an earlier run -- re-run
# the notebook top to bottom to confirm it is current.
info

{'meta': {'description': '',
  'lang': None,
  'keywords': '',
  'favicon': '',
  'canonical': None,
  'encoding': 'utf-8',
  'robots': []},
 'image': None,
 'domain': None,
 'title': 'Example Domain',
 'cleaned_text': 'This domain is for use in illustrative examples in documents. You may use this domain in literature without prior coordination or asking for permission.',
 'opengraph': {},
 'tags': [],
 'tweets': [],
 'movies': [],
 'links': [],
 'authors': [],
 'publish_date': None,
 'final_url': None,
 'status': 200,
 'headers': {'Content-Encoding': 'gzip', 'Accept-Ranges': 'bytes', 'Cache-Control': 'max-age=604800', 'Content-Type': 'text/html; charset=UTF-8', 'Date': 'Wed, 20 Nov 2019 13:28:20 GMT', 'Etag': '"3147526947+gzip"', 'Expires': 'Wed, 27 Nov 2019 13:28:20 GMT', 'Last-Modified': 'Thu, 17 Oct 2019 07:18:26 GMT', 'Server': 'ECS (agb/5386)', 'Vary': 'Accept-Encoding', 'X-Cache': 'HIT', 'Content-Length': '648'},
 'link_hash': '84238dfc8092e5d9c0dac8ef93371a07.1574256501.334681'

In [14]:
# Debug inspection: display `r.text`, the response body as a str (vars(r)
# shows the same markup as bytes under `_content`).
r.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

## Shingling

In [14]:
# Three sample texts: content1 and content2 differ only in their first word,
# content3 is unrelated.
content1 = """Many packages don't create a build for every single release which forces your pip to build from source. If you're happy to use the latest pre-compiled binary version, use --only-binary :all: to allow pip to use an older binary version."""
content2 = """Most packages don't create a build for every single release which forces your pip to build from source. If you're happy to use the latest pre-compiled binary version, use --only-binary :all: to allow pip to use an older binary version."""
content3 = """The C++ Build Tools allow you to build C++ libraries and applications targeting Windows desktop. They are the same tools that you find in Visual Studio 2019, Visual Studio 2017, and Visual Studio 2015 in a scriptable standalone installer. Now you only need to download the MSVC compiler toolset you need to build C++ projects on your build servers."""

# (label, text) pairs, kept in a list so registration and printing order match.
labelled_samples = [
    ('content1', content1),
    ('content2', content2),
    ('content3', content3),
]

hashdb = HashLookup()
for label, text in labelled_samples:
    hashdb.add_hash(label, text)

# First five stored hash values for each label.
print("## Hashes " + "#"*20 + "\n")
for label, text in labelled_samples:
    print(hashdb.get_hash(label)[:5])

# Number of entries held by the lookup.
print("\n## Length " + "#"*20 + "\n")
entry_count = len(hashdb)
print(entry_count)

# Similarity of content2 to every stored entry; in the recorded run it scores
# 1.0 against itself and about 0.887 against content1.
print("\n## Similarity " + "#"*20 + "\n")
similarity_table = hashdb.get_similarity_df(content2)
print(similarity_table)

## Hashes ####################

[-2145608475, -2092676559, -2100324990, -2106062289, -2101729913]
[-2145608475, -2092676559, -2100324990, -2106062289, -2101729913]
[-2091386696, -2135102216, -2102983285, -2091906514, -2113156462]

## Length ####################

3

## Similarity ####################

        url       sim
1  content2  1.000000
0  content1  0.886792
2  content3  0.000000


## PageRank

In [22]:
# Toy link graph: five URLs and directed (source, target) link pairs.
# The ('urlc', 'urlb') pair appears twice, exactly as in the original data.
url_list = ['urla', 'urlb', 'urlc', 'urld', 'urle']
link_tuples = [
    ('urla', 'urlb'),
    ('urlc', 'urlb'),
    ('urla', 'urle'),
    ('urle', 'urla'),
    ('urlc', 'urlb'),
    ('urld', 'urle'),
    ('urle', 'urlb'),
]

# Reference result for this graph, keyed by column and then by row index.
# NOTE(review): pr_valid is defined here but never compared against `df` in
# this cell -- presumably it was meant to back an assertion; confirm.
pr_valid = {
    'url': {0: 'urlb', 1: 'urle', 2: 'urla', 3: 'urld', 4: 'urlc'},
    'score': {
        0: 0.3625498007448575,
        1: 0.2544205750109898,
        2: 0.19976269190396267,
        3: 0.09163346617009499,
        4: 0.09163346617009499,
    },
}

df = build_pagerank_df(url_list, link_tuples)

print("## PageRank ", "#"*20, "\n", sep="")
print(df)

## PageRank ####################

    url     score
0  urlb  0.362550
1  urle  0.254421
2  urla  0.199763
3  urld  0.091633
4  urlc  0.091633


## BERT

In [20]:
from lib.bert import *

# Each query is paired, by position, with an n-gram of similar meaning but
# different wording.
queries = [
    'trim a chrismas tree',
    'jog on a path',
    'kindle ebook',
    'italian restaurant',
    'internet landing page',
]
ngrams = [
    'decorate a tree',
    'run on a road',
    'electric book',
    'cafe in italy',
    'website homepage',
]

bert = BERT(dims=None)
bert.add_terms(ngrams)

# For every query, print the registered term reported as most similar,
# together with the score rounded to 4 decimal places.
for q in queries:
    best, sim = bert.get_most_similar(q)
    # Spaces that bring short queries up to a common width so the arrows align.
    padding = ' '*(25-len(q))
    rounded_sim = round(sim,4)
    print("Query: {} {} ===> Best Guess: {} ({})".format(q, padding, best, rounded_sim))

Query: trim a chrismas tree       ===> Best Guess: decorate a tree (0.5298)
Query: jog on a path              ===> Best Guess: run on a road (0.5744)
Query: kindle ebook               ===> Best Guess: electric book (0.4624)
Query: italian restaurant         ===> Best Guess: cafe in italy (0.4957)
Query: internet landing page      ===> Best Guess: website homepage (0.6674)
