# Scrape and Prepare data script

___

## Index Site

In [None]:
from bs4 import BeautifulSoup
import requests
from os.path import join
import time

In [None]:
# Full address of the documentation's "about" index page.
# NOTE: `base_url` is rebound to the site root in the following cell, so this
# value is superseded before anything reads it.
base_url = 'https://docs.seldon.io/projects/seldon-core/en/v2/contents/about/index.html'

In [None]:
from bs4 import BeautifulSoup
import urllib.request
from os.path import join
from urllib.parse import urlparse, urljoin


# Site-relative path of the page the crawl starts from.
start_url = '/contents/about/index.html'
# Root of the documentation site; `base_url + start_url` is the first page fetched.
base_url = 'https://docs.seldon.io/projects/seldon-core/en/v2'


def get_links(url):
    """Return the absolute URLs of the navigation links found on a page.

    Fetches ``url``, parses it with BeautifulSoup and collects the ``href`` of
    every ``<a class="md-nav__link">`` element. Anchors without an ``href``
    attribute, same-page fragment links (``#...``) and links starting with
    ``http`` are skipped; every other link is resolved against ``url``.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of absolute URLs in the order they appear on the page. The
        list may contain duplicates; de-duplication is left to the caller.
    """
    time.sleep(0.1)  # brief pause before every request
    print('scraping, ', url)
    # Parse inside the `with` so the HTTP response is always closed.
    with urllib.request.urlopen(url) as html:
        soup = BeautifulSoup(html, 'html.parser')
    urls = []
    for anchor in soup.find_all('a', class_='md-nav__link'):
        href = anchor.get('href')
        # `.get` returns None for an anchor without href; calling
        # `startswith` on it would raise AttributeError.
        if href is None:
            continue
        if href.startswith(('#', 'http')):
            continue
        urls.append(urljoin(url, href))
    return urls

In [None]:

# Crawl the documentation navigation, starting from the start page.
first_page = base_url + start_url
seen = {first_page}      # every URL discovered so far
frontier = [first_page]  # URLs discovered but not fetched yet

while frontier:
    url = frontier.pop()  # pops the most recently added URL (LIFO)
    links = get_links(url)
    # Use a separate name for the inner loop so it does not rebind `url`,
    # the page that was just fetched.
    for link in links:
        if link not in seen:
            seen.add(link)
            frontier.append(link)


In [None]:
# Persist the discovered URLs, one per line.
with open('data/urls.txt', 'w') as f:
    f.writelines(url + '\n' for url in seen)

In [None]:
# Load the URL list back from disk, one URL per line.
with open('data/urls.txt', 'r') as f:
    raw_urls = f.read()
urls = raw_urls.splitlines()


## Extract text content

In [None]:
def get_text_content(url):
    """Fetch a documentation page and return its text keyed by section id.

    Looks up the first ``<article class="md-content__inner md-typeset">``
    element and collects the text of every ``<section>`` inside it.

    Args:
        url: Address of the page to fetch.

    Returns:
        A dict mapping each section's ``id`` attribute (``None`` when the
        section has no id) to the section's text. Sections sharing an id
        overwrite one another, the last one winning. The dict is empty when
        the page has no matching article element.
    """
    time.sleep(0.1)  # brief pause before every request
    # Parse inside the `with` so the HTTP response is always closed.
    with urllib.request.urlopen(url) as html:
        soup = BeautifulSoup(html, 'html.parser')
    article = soup.find('article', class_='md-content__inner md-typeset')
    if article is None:
        # A page without the article element should not abort the whole
        # scrape with an IndexError; report it as having no sections.
        return {}
    return {
        section.get('id'): section.get_text()
        for section in article.find_all('section')
    }

In [None]:
from tqdm import tqdm

# Scrape every URL and index the result under a short key derived from the URL.
data = {}
for url in tqdm(urls):
    sections = get_text_content(url)
    # Strip these fragments from the URL, in this order, to obtain the key.
    key = url
    for fragment in (
        'https://docs.seldon.io/projects/seldon-core/en/v2/',
        'contents/',
        '/index.html',
        '.html',
    ):
        key = key.replace(fragment, '')
    data[key] = {
        'sections': sections,
        'meta': {
            'url': url,
            'key': key,
        },
    }

In [None]:
import json
# Serialise the scraped pages to JSON on disk.
with open('data/scraped-docs.json', 'w') as f:
    f.write(json.dumps(data))

In [None]:
import json
# Read the scraped pages back from disk.
with open('data/scraped-docs.json', 'r') as f:
    data = json.loads(f.read())

## Process Content

In [None]:
def gen_split_overlap(seq, size, overlap):
    """Yield overlapping chunks of a space-separated text.

    The text is split on single spaces and re-joined into windows of at most
    ``size`` words. Consecutive windows start ``size - overlap`` words apart,
    so each window repeats the last ``overlap`` words of the previous one.

    Args:
        seq: Text to split.
        size: Maximum number of words per chunk; must be >= 1.
        overlap: Number of words shared by consecutive chunks; must be >= 0
            and smaller than ``size``.

    Yields:
        Each chunk as a single space-joined string. A text with no more than
        ``overlap`` words is yielded whole as a single chunk.

    Raises:
        ValueError: If ``size`` or ``overlap`` is out of range. Because this
            is a generator, the error surfaces when iteration starts.
    """
    if size < 1 or overlap < 0:
        raise ValueError('size must be >= 1 and overlap >= 0')
    if overlap >= size:
        # A zero or negative step would make range() fail with a cryptic
        # message or silently produce no chunks at all.
        raise ValueError('overlap must be smaller than size')

    words = seq.split(' ')
    step = size - overlap
    # Stopping `overlap` words early avoids a trailing chunk that is entirely
    # contained in the previous one. The floor of 1 guarantees that short
    # texts (len(words) <= overlap) still produce their single chunk.
    stop = max(len(words) - overlap, 1)
    for start in range(0, stop, step):
        yield ' '.join(words[start:start + size])

In [None]:
import uuid

# Flatten the scraped pages into one record per overlapping text chunk.
items = []

for page, page_data in data.items():
    page_meta = page_data['meta']
    for section, text in page_data['sections'].items():
        # Use a separate name for the chunk so the inner loop does not
        # overwrite `text`, the section text being split.
        for chunk in gen_split_overlap(text, 256, 64):
            items.append({
                'key': str(uuid.uuid4()),  # random unique id per chunk
                'page': page,
                'section': section,
                'text': chunk,
                'meta': page_meta,
            })
        

In [None]:
import json
# Serialise the chunk records to JSON on disk.
with open('data/processed-docs.json', 'w') as f:
    f.write(json.dumps(items))

In [None]:
import json
# Read the chunk records back from disk.
with open('data/processed-docs.json', 'r') as f:
    items = json.loads(f.read())

## Vectorise content

In [2]:
import openai
import dotenv
import os
# The OpenAI API key is read from the MLSERVER_MODEL_OPENAI_API_KEY
# environment variable; a missing variable raises KeyError here.
dotenv.load_dotenv()
openai.api_key = os.environ['MLSERVER_MODEL_OPENAI_API_KEY']

# The embedding model is the same for every item, so name it once outside the loop.
model_id = "text-embedding-ada-002"

# Attach an embedding to every chunk record in place (one request per item).
for item in tqdm(items):
    response = openai.Embedding.create(
        input=item['text'],
        model=model_id,
    )
    item['embedding'] = response['data'][0]['embedding']

In [None]:
import json
# Write the records, now including embeddings, where the create_vdb job expects them.
with open('deployment/jobs/create_vdb/vectorized-docs.json', 'w') as f:
    f.write(json.dumps(items))