# Crawl, load, and split Evidence Central

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from datetime import datetime
import json
import os
import requests
import time
from typing import Iterator
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup, Tag
from tqdm import tqdm

from models.crawl_utils import get_page, save_page
from models.load_evidence_central import load_evidence_central
from models.load_utils import Loader, load_docs_from_jsonl, save_docs_to_jsonl
from models.split_model import MarkdownSyntacticEmbeddingSplitter

## Crawl

In [None]:
# Crawl configuration.
# NOTE: despite its name, `host` holds the full sitemap URL, not just a host.
host = 'https://evidencecentral.org/sitemap.xml'
source = "evidence_central"
# Directory that get_path() places the per-URL .json files in.
crawl_dir = f'../data/raw/{source}'
# Seconds to sleep between page requests in the crawl loop.
delay_seconds = 5

# exist_ok=True replaces the check-then-create pattern, which can race if the
# directory appears between the check and the call.
os.makedirs(crawl_dir, exist_ok=True)
    

In [None]:
def extract_links(xml_content):
    """Return the URLs listed in a sitemap document.

    Args:
        xml_content: Sitemap XML, as accepted by ``BeautifulSoup``.

    Returns:
        list[str]: The text of every ``<loc>`` element in document order,
        with surrounding whitespace removed and empty entries dropped.
    """
    soup = BeautifulSoup(xml_content, 'xml')

    # Pretty-printed sitemaps may pad <loc> text with newlines or indentation;
    # strip it so the values can be used directly as URLs and file names.
    stripped = (loc.get_text().strip() for loc in soup.find_all('loc'))

    return [text for text in stripped if text]

In [None]:
def get_path(url, base_dir=None):
    """Map a page URL to the JSON file its crawl result is stored in.

    The file is named after the last non-empty segment of the URL path, so
    URLs that share a final segment map to the same file.

    Args:
        url: Page URL (the query string and fragment are ignored).
        base_dir: Directory to place the file in. Defaults to the
            module-level ``crawl_dir``.

    Returns:
        str: ``<base_dir>/<last path segment>.json``; ``index.json`` is used
        when the URL has no path segment at all.
    """
    if base_dir is None:
        base_dir = crawl_dir
    # Drop trailing slashes first: "/a/b/" would otherwise produce an empty
    # last component, and every such URL would collide on ".json".
    trimmed = urlparse(url).path.rstrip('/')
    name = trimmed.rsplit('/', 1)[-1] or 'index'
    return os.path.join(base_dir, f"{name}.json")

In [None]:
def get_sitemap(host, timeout=30):
    """Fetch the sitemap with a GET request.

    Args:
        host: URL to request (the caller passes the full sitemap URL).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        requests.Response: The raw response. The status code is not checked
        here; callers must inspect it.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    headers = {
        'Content-Type': 'text/xml',
    }
    # requests has no default timeout; without one a stalled connection
    # would block forever.
    return requests.get(host, headers=headers, timeout=timeout)

In [None]:
def get_page_from_post(path, timeout=30):
    """Request a page's rendered HTML from the site's template API.

    Args:
        path: Page URL. The ``https://evidencecentral.org`` prefix is removed
            to obtain the route sent to the API.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        requests.Response: The raw response. The status code is not checked
        here; callers must inspect it.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            ``timeout`` seconds.
    """
    path_route = path.replace("https://evidencecentral.org", "")
    headers = {
        'Content-Type': 'application/json',
    }

    # The API takes `parameters` as a JSON document encoded inside a string.
    # Build it with json.dumps so that quotes or backslashes in the route are
    # escaped instead of corrupting the payload. Compact separators and
    # ensure_ascii=False keep the text identical to the previous hand-built
    # string for ordinary routes.
    parameters = {
        "tblEty": 314,
        "prnFomtplhdrID": 116,
        "path": path_route,
        "templateKey": "PAGES/CONTENT_RECENCY_VIEWER",
        "uiKey": "AF24A18C-F8BD-4742-AAF8-955E7AF86206",
        "dynamicStateGuid": "d0450ef1-aa1e-4240-8ef5-a1721f3e3564",
        "isFormValid": True,
        "fileTemplate": "recency",
    }
    json_data = {
        'tblnm': 'virctnrnyvw',
        'templateId': 116,
        'parameters': json.dumps(parameters, separators=(',', ':'), ensure_ascii=False),
        'formValues': '{"tblEty":314}',
    }
    # NOTE: requests serializes json_data itself, so the bytes on the wire
    # may not match the originally captured request exactly.
    response = requests.post(
        'https://evidencecentral.org/api/fileTemplate/process/html',
        headers=headers,
        json=json_data,
        timeout=timeout,
    )

    return response

In [None]:
# Download the sitemap. Stop here on a non-200 status: continuing would feed
# an error page to the link extractor as if it were the sitemap.
response = get_sitemap(host)
if response.status_code != 200:
    print(f"ERROR {response.status_code}")
    raise RuntimeError(f"Sitemap request failed with status {response.status_code}: {host}")
xml_content = response.content

In [None]:
# Collect every URL listed in the sitemap and report how many were found.
xml_links = extract_links(xml_content)
print(len(xml_links))
# Show one sample entry; guarded so a short sitemap does not raise IndexError.
if len(xml_links) > 2:
    print(xml_links[2])

In [None]:
# Fetch every sitemap URL that has no saved file yet. Because existing files
# are skipped, the loop can be re-run to resume an interrupted crawl.
for url in xml_links:
    path_file = get_path(url)
    print(path_file)
    if os.path.exists(path_file):
        continue
    try:
        response = get_page_from_post(url)
    except requests.RequestException as err:
        # A single network failure should not abort the whole crawl; the URL
        # is retried on the next run because no file was written for it.
        print("Error!", err, url)
        continue
    finally:
        # Throttle after every request, including failed ones, so an erroring
        # server is not hit again immediately.
        time.sleep(delay_seconds)
    if response.status_code != 200:
        print("Error!", response.status_code, url)
        continue
    save_page(path_file, url, response.content.decode('utf-8'))

print("End")

## Load

In [None]:
# Load configuration.
# input_dir is now crawl_dir, and output_dir is now load_dir, and output_filename is now load_filename
load_dir = f'../data/load/{source}/'
# The output file is named after today's date (YYYY-MM-DD).
today = datetime.today().strftime('%Y-%m-%d')
load_filename = os.path.join(load_dir, f"{today}.jsonl")

# exist_ok=True replaces the check-then-create pattern, which can race if the
# directory appears between the check and the call.
os.makedirs(load_dir, exist_ok=True)

In [None]:
# Build a Loader from the Evidence Central load function and the crawl
# directory, then load the documents.
# NOTE(review): presumably this reads the files written by the crawl step;
# confirm against models.load_utils.Loader.
loader = Loader(load_evidence_central, crawl_dir)
docs = loader.load(verbose=True)
# Last expression of the cell: the notebook displays the document count.
len(docs)

In [None]:
# Spot-check the first loaded document; guarded so an empty load prints a
# message instead of raising IndexError.
if docs:
    print("metadata: ", docs[0].metadata)
    print()
    print("content: ", docs[0].page_content)
else:
    print("No documents were loaded")

In [None]:
# Write the loaded documents to today's load file.
save_docs_to_jsonl(docs, load_filename)

## Split

In [None]:
# Split configuration.
# input_path is now load_filename, output_dir is now split_dir, and output filename is now split_filename
split_dir = f'../data/split/{source}/'
# The output file is named after today's date (YYYY-MM-DD).
today = datetime.today().strftime('%Y-%m-%d')
split_filename = os.path.join(split_dir, f"{today}.jsonl")

# exist_ok=True replaces the check-then-create pattern, which can race if the
# directory appears between the check and the call.
os.makedirs(split_dir, exist_ok=True)

In [None]:
# Re-read the documents from the load file, so the split stage can run
# without re-running the load cells as long as load_filename is defined.
docs = load_docs_from_jsonl(load_filename)
# Last expression of the cell: the notebook displays the document count.
len(docs)

In [None]:
# Create the splitter with its default settings (no arguments are passed).
text_splitter = MarkdownSyntacticEmbeddingSplitter()

In [None]:
# Split the loaded documents.
splits = text_splitter.split_documents(docs, verbose=True)
# Last expression of the cell: the notebook displays the number of splits.
len(splits)

In [None]:
# Print the first 25 splits (index, metadata, content) for manual inspection,
# with a visible marker line after each one.
for ix, split in enumerate(splits[:25]):
    print(ix, split.metadata)
    print(split.page_content)
    print("\n!!! SPLIT !!!\n")

In [None]:
# Write the splits to today's split file.
save_docs_to_jsonl(splits, split_filename)