<img src="https://raw.githubusercontent.com/instill-ai/cookbook/main/images/Logo.png" alt="Instill Logo" width="300"/>

# Semantic Web Insights

In [1]:
!pip install instill-sdk==0.12.1 --quiet

In [2]:
from IPython.display import IFrame, display, Markdown

import requests
import concurrent.futures
from google.protobuf.json_format import MessageToDict
import numpy as np
import os

from instill.clients.client import init_pipeline_client
# Shared pipeline client used by the cells below; the API token is read from
# the INSTILL_API_TOKEN environment variable rather than being hardcoded.
pipeline = init_pipeline_client(
    api_token=os.environ["INSTILL_API_TOKEN"],
)

### **[Crawl Website](https://www.instill.tech/docs/component/operator/web#crawl-website)** to Generate High-quality Markdown

In [29]:
# Value forwarded to the crawler pipeline as its "max-k" input.
max_k = 10

# Single-element batch of inputs for the "website-to-markdown" pipeline.
crawler_inputs = [
    {
        "max-k": max_k,
        "url": "https://www.instill.tech/",
    }
]

response_crawler = pipeline.trigger_namespace_pipeline(
    "george_strong", "website-to-markdown", crawler_inputs
)

In [30]:
# Convert the protobuf response to a plain dict, then take the crawled
# content from the first output.
crawler_output = MessageToDict(response_crawler)["outputs"][0]
md_pages = crawler_output["crawled-content"]

# Preview the first crawled entry.
print(md_pages[0])

### Chunk Markdown

In [7]:
# One list of chunk texts per crawled page, in the same order as md_pages.
chunked_pages = []

for page_markdown in md_pages:
    chunk_request = {
        "md-input": page_markdown,
        "chunk-strategy": "Markdown",
        "max-chunk-length": 1200,
        "chunk-overlap": 1,
    }
    chunk_response = MessageToDict(
        pipeline.trigger_namespace_pipeline(
            "george_strong", "chunk-markdown", [chunk_request]
        )
    )

    # Keep only the text of every chunk listed in the first output.
    page_chunks = []
    for item in chunk_response["outputs"][0]["response"]:
        page_chunks.append(item["text"])
    chunked_pages.append(page_chunks)

### Embed Chunks

#### OpenAI Embedding

In [129]:
def process_chunk(chunk):
    """Embed a single chunk through the "embed-chunk" pipeline.

    Args:
        chunk: Value sent as the pipeline's "chunk-input" field.

    Returns:
        The "embed-result" field of the first pipeline output.
    """
    pipeline_inputs = [{"chunk-input": chunk}]
    raw_response = pipeline.trigger_namespace_pipeline(
        "george_strong", "embed-chunk", pipeline_inputs
    )
    first_output = MessageToDict(raw_response)["outputs"][0]
    return first_output["embed-result"]

In [130]:
def embed_chunks(chunks):
    """Embed every chunk in parallel and stack the vectors into one array.

    Row ``i`` of the result is the embedding of ``chunks[i]``, so the output
    can be paired positionally with the input chunks.

    Args:
        chunks: Iterable of chunks; each one is passed to ``process_chunk``.

    Returns:
        A float numpy array with one row per chunk. When ``chunks`` is empty
        the array has shape ``(0, 1536)``.

    Raises:
        Any exception raised by ``process_chunk`` for one of the chunks.
    """
    # Executor.map yields results in input order. Collecting futures with
    # as_completed would order rows by completion time instead, which breaks
    # the chunk <-> embedding alignment.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        vectors = list(executor.map(process_chunk, chunks))

    if not vectors:
        return np.empty((0, 1536), float)

    # Build the array once rather than re-allocating it with np.append for
    # every chunk.
    return np.array(vectors, dtype=float)

In [47]:
# Embed each page's chunks; entry i is the result of embed_chunks for
# chunked_pages[i].
embedded_pages = [embed_chunks(page_chunks) for page_chunks in chunked_pages]

In [None]:
# Sanity check: shape of the first page's embedding array as returned by
# embed_chunks.
embedded_pages[0].shape

#### Jina CLIP V1 Embedding Model

In [8]:
def embed_chunks_jina_clipv1(text_chunks, timeout=120.0):
    """Embed text chunks with the hosted Jina CLIP v1 model.

    All chunks are sent in a single POST request to the model's trigger
    endpoint, authenticated with the INSTILL_API_TOKEN environment variable.

    Args:
        text_chunks: Iterable of strings to embed.
        timeout: Timeout in seconds handed to ``requests.post`` so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list holding the "vector" of every embedding in the first task
        output, an empty list when there is nothing to embed, or ``None``
        when the API answers with a status code other than 200.

    Raises:
        KeyError: If INSTILL_API_TOKEN is not set in the environment.
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    model_inputs = [{"text": text, "type": "text"} for text in text_chunks]
    if not model_inputs:
        # Nothing to embed: skip the network round trip entirely.
        return []

    url = 'https://api.instill.tech/v1alpha/organizations/instill-ai/models/jina-clip-v1/trigger'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {os.environ["INSTILL_API_TOKEN"]}'
    }

    data = {
        "taskInputs": [
            {
                "data": {
                    "embeddings": model_inputs
                }
            }
        ]
    }

    response = requests.post(url, headers=headers, json=data, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        return None

    task_output = response.json()['taskOutputs'][0]
    return [embedding['vector'] for embedding in task_output['data']['embeddings']]

In [9]:
# Embed every page with Jina CLIP v1. NOTE: this rebinds embedded_pages,
# replacing whatever the OpenAI embedding cell assigned to it earlier.
embedded_pages = [
    embed_chunks_jina_clipv1(page_chunks) for page_chunks in chunked_pages
]

### Clustering and Visualization

In [10]:
!pip install umap-learn --quiet
!pip install bokeh --quiet

In [11]:
import umap
from sklearn.cluster import KMeans

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category10

  from .autonotebook import tqdm as notebook_tqdm


#### UMAP Dimensionality Reduction

In [12]:
# Flatten the per-page lists so that chunk i and embedding i share an index.
flattened_chunks = [
    text for page_chunks in chunked_pages for text in page_chunks
]
flattened_embeddings = [
    vector for page_vectors in embedded_pages for vector in page_vectors
]

# Stack the vectors into a single array, one row per chunk.
X = np.array(flattened_embeddings)

# Project the embeddings down to 2 components with a fixed random_state.
umap_model = umap.UMAP(
    n_components=2,
    n_neighbors=40,
    random_state=42,
)
reduced_embeddings = umap_model.fit_transform(X)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


#### K-Means Clustering

In [13]:
# Number of clusters; also used to pick the plotting palette further down.
num_clusters = 5

# Cluster the reduced (UMAP) coordinates with a fixed random_state.
kmeans = KMeans(
    random_state=42,
    n_clusters=num_clusters,
)
cluster_labels = kmeans.fit_predict(reduced_embeddings)

#### Interactive Plot using Bokeh

In [14]:
# Route Bokeh output to the notebook before anything is shown.
output_notebook()

# One palette colour per cluster, looked up by each point's cluster label.
colors = Category10[num_clusters]

# Columns backing the scatter glyph and the hover tooltip.
source = ColumnDataSource(
    data={
        "x": reduced_embeddings[:, 0],
        "y": reduced_embeddings[:, 1],
        "text": flattened_chunks,
        "cluster": cluster_labels,
        "color": [colors[label] for label in cluster_labels],
    }
)

plot = figure(
    title='Visualize Crawled Website Embeddings',
    tools="pan,wheel_zoom,box_zoom,reset",
    x_axis_label='UMAP 1',
    y_axis_label='UMAP 2',
    width=900,
    height=600,
)
plot.scatter('x', 'y', source=source, size=8, color='color', alpha=0.6)

# Hovering a point shows the text column of the data source for that point.
hover_tool = HoverTool(
    tooltips="""
    <div style="width: 400px; white-space: normal;">
        <div><strong>Text Chunk:</strong></div>
        <div>@text</div>
    </div>
"""
)
plot.add_tools(hover_tool)

show(plot)