In [1]:
import requests
import os

In [2]:
import pandas as pd
import numpy as np

In [3]:
from bs4 import BeautifulSoup
from pathlib import Path

In [12]:
from cleantext import clean

In [4]:
# Path of the HTML bookmarks file that the preprocessing cell below opens.
filename = 'bookmarks.html'

In [5]:
# Normalise the bookmark markup line by line so the parser sees closed <DT> elements:
#   * drop the "<p>" markers,
#   * append "</DT>" after every "</DL>",
#   * append "</DT>" to each line that starts with "<DT><A ".
# The file is decoded explicitly as UTF-8 rather than with the platform's locale
# default (assumes a UTF-8 export, as browsers normally produce — confirm for your file).
processed = []
with open(filename, encoding="utf-8") as f:
    for line in f:
        line = line.replace("<p>", "").replace("</DL>", "</DL></DT>")
        if line.lstrip().startswith("<DT><A "):
            line = line.rstrip() + "</DT>\n"
        processed.append(line)
filetxt = ''.join(processed)

In [6]:
# Parse the normalised bookmark markup using the 'html.parser' backend.
soup = BeautifulSoup(filetxt, 'html.parser')

In [7]:
def visit_dt(dt, parents, collector):
    """Process one <dt> node of the bookmark tree.

    An <h3> child supplies the folder name and dates, a <dl> child is walked
    recursively with that folder appended to ``parents``, and an <a> child is
    recorded as a bookmark.

    Args:
        dt: Node whose ``name`` must be ``'dt'``.
        parents: List of ``(folder, created, modified)`` tuples describing the
            enclosing folders.
        collector: List that receives one
            ``(text, href, created, updated, parents)`` tuple per <a> child.

    Raises:
        Exception: If ``dt`` is not a <dt> node, or an <a> child has no
            ``href`` or no ``add_date`` attribute.
    """
    if dt.name != 'dt':
        # The message used to interpolate an undefined name (`dl`), which
        # raised NameError instead of this diagnostic.
        raise Exception(f"Unknown node passed as dt: {dt.name}")
    folder = None
    created = None
    modified = None

    for c in dt.children:
        if c.name == 'h3':
            folder = c.string
            created = c['add_date'] if c.has_attr('add_date') else None
            modified = c['last_modified'] if c.has_attr('last_modified') else None
        elif c.name == 'dl':
            # Descend into the folder, extending the ancestry with its metadata.
            p = [*parents, (folder, created, modified)]
            visit_dl(c, p, collector)
        elif c.name == 'a':
            if not c.has_attr('href'):
                raise Exception(c)
            href = c['href']
            if not c.has_attr('add_date'):
                raise Exception(c)
            l_created = c['add_date']
            l_updated = c['last_modified'] if c.has_attr('last_modified') else None
            l_text = c.string
            collector.append((l_text, href, l_created, l_updated, parents))

In [8]:
def visit_dl(dl, parents, collector):
    """Walk a <dl> node and dispatch each child.

    Args:
        dl: Node whose ``name`` must be ``'dl'``.
        parents: Folder ancestry, passed through unchanged to the children.
        collector: List that ``visit_dt`` appends bookmark tuples to.

    Raises:
        Exception: If ``dl`` is not a <dl> node, or a named child is neither
            <dt> nor <dl>.
    """
    if dl.name != 'dl':
        raise Exception(f"Unknown node passed as dl: {dl.name}")
    for n in dl.children:
        if n.name is None:
            # Children without a tag name (e.g. text between tags) are skipped.
            continue
        if n.name == 'dt':
            visit_dt(n, parents, collector)
        elif n.name == 'dl':
            # Recurse into the child; the old code passed `dl` itself and
            # therefore never terminated on a directly nested <dl>.
            visit_dl(n, parents, collector)
        else:
            raise Exception(f"Unknown node {n.name}")

In [9]:
# Walk the tree from the first <dl> with an empty folder ancestry;
# visit_dt appends one tuple per bookmark link to `collector`.
collector = []
visit_dl(soup.dl, [], collector)

In [10]:
# Sanity check: number of bookmark tuples collected.
len(collector)

3341

In [11]:
# Create the download directory with pathlib instead of a shell command, so the
# cell does not depend on a `mkdir -p` binary; no error if it already exists.
Path("outputs").mkdir(parents=True, exist_ok=True)

In [None]:
# Download bookmarked pages one batch at a time; raise `start` to the next
# unprocessed index before re-running the cell.
start = 0
batch = 50

# Slice the batch up front. The previous `i > start + batch` check ran after
# the download and was skipped by `continue`, so a run fetched more than
# `batch` entries.
for i, c in enumerate(collector[start:start + batch], start=start):
    url = c[1]
    if not url.startswith("http"):
        # Only http(s) bookmarks are fetched.
        continue
    try:
        resp = requests.get(url, timeout=1.0)
    except Exception as e:
        # Best effort: report the failure and carry on with the next bookmark.
        print(f"Error while requesting {url}: {e}")
        continue

    # The file name carries the collector index so a page can be mapped back to
    # its bookmark. A separate variable is used so the global `filename`
    # (the bookmarks file) is not overwritten.
    out_path = f"outputs/out_{i:05}.html"
    with open(out_path, "w") as f:
        f.write(resp.text)

In [None]:
import random
import itertools

In [13]:
def cleantxt(txt):
    """Return ``txt`` passed through ``clean`` with this notebook's fixed options.

    Unicode fixing, ASCII conversion, lower-casing and line-break removal are
    switched on; every ``no_*`` option is switched off. The ``replace_with_*``
    tokens are still passed along (presumably unused while the ``no_*`` flags
    are False — confirm against the clean-text documentation).
    """
    options = dict(
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        no_line_breaks=True,
        no_urls=False,
        no_emails=False,
        no_phone_numbers=False,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=False,
        no_punct=False,
        replace_with_punct="",
        replace_with_url=":url:",
        replace_with_email=":email:",
        replace_with_phone_number="",
        replace_with_number=":num:",
        replace_with_digit=":digit:",
        replace_with_currency_symbol="$",
        lang="en",
    )
    return clean(text=txt, **options)

In [None]:
# Names of the entries in outputs/ (the downloaded pages).
files = os.listdir('outputs/')

In [None]:
def prepare_texts(files):
    """Build a short tagged text summary for downloaded pages.

    For every name in ``files`` the page ``outputs/<name>`` is parsed and a
    list of parts is assembled: the cleaned ``<title>``, followed by cleaned
    ``<h1>``/``<h2>``/``<h3>`` texts and the first 200 characters of each
    non-empty ``<p>``, taken in document order from the element with id
    ``main`` or ``content`` (falling back to ``<body>``).

    A summary is emitted as soon as 10 parts have been gathered. Pages without
    a usable root element, or with fewer than 10 parts, contribute nothing, so
    the returned list is not aligned with ``files``.

    Args:
        files: Iterable of file names inside ``outputs/``.

    Returns:
        List of newline-joined summaries, one per page that reached 10 parts.
    """
    summaries = []
    for name in files:
        with open(f"outputs/{name}", "r") as handle:
            markup = handle.read()
        page = BeautifulSoup(markup, "html.parser")

        page_title = page.title.text if page.title else ""
        parts = [f"<title>{cleantxt(page_title)}</title>"]

        # Prefer the main content container; otherwise use the whole body.
        root = page.find(id=['main', 'content']) or page.body
        if not root:
            continue

        for node in root.descendants:
            tag = node.name
            if tag in ('h1', 'h2', 'h3'):
                parts.append(f"<{tag}>{cleantxt(node.text)}</{tag}>")
            elif tag == 'p':
                snippet = cleantxt(node.text[0:200])
                if len(snippet) == 0:
                    continue
                parts.append(f"<p>{snippet}</p>")
            if len(parts) >= 10:
                summaries.append("\n".join(parts))
                break
    return summaries

# Identify Topics

### Create Embeddings

In [14]:
# Read the Cohere API key from the environment so it is never saved in the notebook.
# Falls back to '' (the previous placeholder value) when CO_API_KEY is not set.
api_key = os.environ.get('CO_API_KEY', '')

In [15]:
import cohere
import time

In [16]:
# Cohere client; batch_embed below calls co.embed on it.
co = cohere.Client(api_key)

In [17]:
# Model name passed to co.embed by batch_embed.
model = 'medium'

In [18]:
# Names of the entries in outputs/ (repeats the earlier `files` assignment).
files = os.listdir('outputs/')

In [19]:
batch = 16


def batch_embed(inputs, batch_size=batch):
    """Embed ``inputs`` through ``co.embed``, ``batch_size`` texts per request.

    Args:
        inputs: Sequence of texts to embed.
        batch_size: Number of texts sent per request. The default is taken
            from ``batch`` when the function is defined, so reassigning the
            global ``batch`` afterwards (the download cell uses the same
            name) no longer changes the request size.

    Returns:
        List of the embeddings from all responses, concatenated in request order.
    """
    result = []
    for i in range(0, len(inputs), batch_size):
        response = co.embed(
            texts=inputs[i:i + batch_size],
            model=model,
            truncate="RIGHT"
        )
        result.extend(response.embeddings)
        print(f"batch complete: {i}")
        # Pause between requests (presumably to stay under the API rate limit).
        time.sleep(1.0)
    return result

## Plot the embeddings

In [None]:
import umap
import altair as alt
import re

In [None]:
# NOTE(review): `embeddings` is not assigned anywhere above this cell in the visible
# notebook — confirm which cell is meant to produce it before relying on Run All.
# random_state pins UMAP's stochastic layout so re-runs give the same projection.
reducer = umap.UMAP(n_neighbors=100, random_state=42)
umap_embeds = reducer.fit_transform(embeddings)

In [None]:
# NOTE(review): no top-level `result` is assigned in the visible notebook
# (prepare_texts only has a local list of that name) — presumably a dict keyed
# by output file name; confirm where it comes from.
keys = list(result.keys())

In [None]:
# Strip only the literal "out_" prefix and ".html" suffix and keep all digits.
# The old pattern 'out_0+' consumed every digit of "out_00000.html", leaving ''
# which int() rejects in the next cell; int() copes with the leading zeros kept here.
keys = [re.sub(r'^out_|\.html$', '', k) for k in keys]

In [None]:
# Use each key as an index into `collector` to recover the bookmark tuple.
items = [collector[int(k)] for k in keys]

In [None]:
# Element 0 of each tuple is the link text stored by visit_dt.
titles = [item[0] for item in items]

In [None]:
# Preview the first few titles instead of dumping the whole list into the output.
titles[:10]

In [None]:
# One row per title; the plot coordinates are added as columns further down.
df = pd.DataFrame(titles, columns=['titles'])

In [None]:
# Display the frame to inspect the titles.
df

In [None]:
# Number of projected rows; should equal len(df) for the column assignment below.
len(umap_embeds)

In [None]:
# Store the first two columns of the projection as the plot coordinates.
df['x'] = umap_embeds[:,0]
df['y'] = umap_embeds[:,1]

In [None]:
# Display the frame again, now with the x / y columns.
df

In [None]:

# Scatter plot of the x / y columns with unlabeled axes; the tooltip shows the title.
chart = (
    alt.Chart(df)
    .mark_circle(size=60)
    .encode(
        x=alt.X(
            'x',
            scale=alt.Scale(zero=False),
            axis=alt.Axis(labels=False, ticks=False, domain=False),
        ),
        y=alt.Y(
            'y',
            scale=alt.Scale(zero=False),
            axis=alt.Axis(labels=False, ticks=False, domain=False),
        ),
        tooltip=['titles'],
    )
    .configure(background="#FDF7F0")
    .properties(width=700, height=400, title='Bookmarks')
)

chart.interactive()

# Plot Using Only Titles

In [20]:
# Number of bookmark tuples available for the title-only embedding.
len(collector)

3341

In [21]:
# Link text of every collected bookmark (element 0 of each tuple).
# NOTE(review): visit_dt stores `.string` here, which may be None for some links —
# confirm co.embed accepts such entries.
titles = [c[0] for c in collector]

In [22]:
# Should match len(collector): one title per bookmark.
len(titles)

3341

In [None]:
# Embed every title; batch_embed sends the texts in batches and sleeps between requests.
embeddings = batch_embed(titles)

In [None]:
# Number of embeddings returned; expected to equal len(titles).
len(embeddings)

In [None]:
# Project the title embeddings with UMAP for plotting.
# random_state pins UMAP's stochastic layout so re-runs give the same projection.
reducer = umap.UMAP(n_neighbors=100, random_state=42)
umap_embeds = reducer.fit_transform(embeddings)

In [None]:
# One row per bookmark title; the plot coordinates are added as columns below.
df = pd.DataFrame(titles, columns=['titles'])

In [None]:
# Row count; should equal len(umap_embeds) for the column assignment below.
len(df)

In [None]:
# Store the first two columns of the projection as the plot coordinates.
df['x'] = umap_embeds[:,0]
df['y'] = umap_embeds[:,1]

In [None]:

# Scatter plot of the x / y columns with unlabeled axes; the tooltip shows the title.
chart = (
    alt.Chart(df)
    .mark_circle(size=60)
    .encode(
        x=alt.X(
            'x',
            scale=alt.Scale(zero=False),
            axis=alt.Axis(labels=False, ticks=False, domain=False),
        ),
        y=alt.Y(
            'y',
            scale=alt.Scale(zero=False),
            axis=alt.Axis(labels=False, ticks=False, domain=False),
        ),
        tooltip=['titles'],
    )
    .configure(background="#FDF7F0")
    .properties(width=700, height=400, title='Bookmarks')
)

chart.interactive()