# Crawl, load, and split Taylor and Tyler transcripts

In [None]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import os
from urllib.parse import urlparse

from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.schema.document import Document
import openai
from tqdm import tqdm

from models.load_utils import save_docs_to_jsonl
from models.split_model import SyntacticEmbeddingSplitter
from models.split_utils import get_openai_embedder

In [None]:

# Input directory with the raw Word transcripts and output directory for the
# split documents.
path_dir = '../data/raw/tnt/'
output_dir = '../data/split/tnt/'

# Date stamp (YYYY-MM-DD) of this run.
today = datetime.today().strftime('%Y-%m-%d')

# Make sure both directories exist. exist_ok=True replaces the
# check-then-create pattern, which is race-prone, and keeps the cell safe to
# re-run.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(path_dir, exist_ok=True)

In [None]:
# Collect the path of every entry in the raw directory.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# order of the loaded elements (and of the saved output) reproducible.
file_names = sorted(os.listdir(path_dir))
docxs = [os.path.join(path_dir, file_name) for file_name in file_names]

# Load each Word document in "elements" mode and gather all loaded items into
# one flat list.
file_data = []
for doc in tqdm(docxs):
    loader = UnstructuredWordDocumentLoader(doc, mode="elements")
    file_data.extend(loader.load())


In [None]:
# The input is very large, so embed with OpenAI (faster) rather than the
# default voyageai embeddings.
openai.api_key = os.environ["OPENAI_API_KEY"]  # raises KeyError when the variable is unset
openai_embedder = get_openai_embedder(openai)
text_splitter = SyntacticEmbeddingSplitter(
    embedder=openai_embedder,
    split_threshold=0.80,
)

In [None]:
def load_docx(page_cont: str, page_met: dict, page_title: str) -> Document:
    """Build a Document for one loaded item of a tnt Word transcript.

    Args:
        page_cont: Text stored as the document's ``page_content``.
        page_met: Metadata of the item. It is copied, not modified.
        page_title: Title stored under the ``"title"`` metadata key.

    Returns:
        A Document whose metadata is ``page_met`` plus the ``"title"`` entry.
    """
    # Work on a copy so the caller's metadata dict is not mutated as a side
    # effect of building the document.
    metadata = {**page_met, "title": page_title}
    return Document(page_content=page_cont, metadata=metadata)

In [None]:
# Show how many items were loaded across all Word documents.
len(file_data)

In [None]:
# `docs` starts empty here and is filled with the final splits in a later cell.
docs = []
pages = []
for element in file_data:
    text = element.page_content
    # Items with no text other than whitespace are reported and dropped.
    if not text.strip():
        print('skipping', element.metadata.get('category'))
        continue

    # The page number is optional; a missing one becomes an empty label.
    page_number = element.metadata.get("page_number")
    page_label = "" if page_number is None else str(page_number)

    # Title = file name without ".docx", followed by the page label.
    base_name = element.metadata["filename"].replace(".docx", "")
    pages.append(
        load_docx(text, element.metadata, f"{base_name} - page_number:{page_label}")
    )
len(pages)

In [None]:
# Run the splitter over all page documents; the result is kept in `splits`.
splits = text_splitter.split_documents(pages, verbose=True)

In [None]:
# Preview the first ten splits for a quick manual check.
print(f"Created {len(splits)} splits")
for ix, split in enumerate(splits[:10]):
    # page_number is optional in the metadata, so read it with .get instead of
    # indexing; a missing value prints as None rather than raising KeyError.
    print(ix, split.metadata.get("page_number"), split.metadata["title"])
    print(split.page_content)
    print("\n!!! SPLIT !!!\n")

# Collect the splits exactly once, outside the preview loop. Doing this inside
# the loop appended the whole list once per previewed split and duplicated
# every document in the saved output.
docs.extend(splits)

In [None]:
# Show how many documents will be written to the output file.
len(docs)

In [None]:
# Write the collected documents to "<output_dir>/<today>.jsonl".
filename = os.path.join(output_dir, f"{today}.jsonl")
save_docs_to_jsonl(docs, filename)
