# Crawl, load, and split Taylor and Tyler transcripts

In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Load environment variables from a .env file via the dotenv IPython extension.
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import os
from urllib.parse import urlparse

from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.schema.document import Document
import openai
from tqdm import tqdm

from models.load_utils import create_pages_from_unstructured_elements, save_docs_to_jsonl
from models.split_model import SyntacticEmbeddingSplitter
from models.split_utils import get_openai_embedder

In [None]:
# Directory holding the raw transcript files, and directory receiving the splits.
path_dir = '../data/raw/tnt/'
output_dir = '../data/split/tnt/'

# Date stamp (YYYY-MM-DD) for this run; used to name the output file.
today = datetime.today().strftime('%Y-%m-%d')

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(path_dir, exist_ok=True)

In [None]:
# Collect the transcript files to load. os.listdir() returns entries in
# arbitrary order, so sort them to keep the page (and later split) order
# reproducible between runs.
file_names = sorted(os.listdir(path_dir))
docxs = [
    os.path.join(path_dir, name)
    for name in file_names
    # Skip sub-directories, hidden files (e.g. .DS_Store) and Word lock
    # files ("~$..."); none of these are loadable transcripts.
    if os.path.isfile(os.path.join(path_dir, name))
    and not name.startswith((".", "~$"))
]

# Accumulates the pages produced from every loaded document.
file_data = []
for doc in tqdm(docxs):
    # load file
    loader = UnstructuredWordDocumentLoader(doc, mode="elements")
    data = loader.load()
    # create pages, titled after the file name with ".docx" removed
    title = os.path.basename(doc).replace(".docx", "")
    pages = create_pages_from_unstructured_elements(data, title)
    file_data.extend(pages)

In [None]:
# Number of file paths collected from path_dir.
len(docxs)

In [None]:
# Total number of pages created across all loaded files.
len(file_data)

In [None]:
# Instantiate the project's splitter with its default arguments.
text_splitter = SyntacticEmbeddingSplitter()

In [None]:
# Split all loaded pages into chunks.
# NOTE(review): verbose=True presumably enables progress output — confirm in
# SyntacticEmbeddingSplitter.split_documents.
splits = text_splitter.split_documents(file_data, verbose=True)

In [None]:
# Report how many splits were produced, then show the first ten for a quick
# visual sanity check, each followed by a separator banner.
print(f"Created {len(splits)} splits")
preview = splits[:10]
for position, chunk in enumerate(preview):
    heading = chunk.metadata["title"]
    print(position, heading)
    # Print the body and the banner on separate lines in one call.
    print(chunk.page_content, "\n!!! SPLIT !!!\n", sep="\n")

In [None]:
# Write all splits to a date-stamped JSONL file inside the output directory.
filename = os.path.join(output_dir, today + ".jsonl")
save_docs_to_jsonl(docs := splits, filename) if False else save_docs_to_jsonl(splits, filename)
