# Create segments from pre-processed texts using the segmentation model

In [None]:
# IPython autoreload extension in mode 2, so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# dotenv extension: presumably populates os.environ from a local .env file
# (the OPENAI_ORG / OPENAI_KEY variables are read further down) - confirm the
# .env location.
%load_ext dotenv
%dotenv

In [None]:
from datetime import datetime
import json
import os
import pickle
import re

from IPython.display import display, Markdown
import openai
import pinecone
from sentence_transformers import SentenceTransformer
import spacy
from tqdm.autonotebook import tqdm

from models.data_utils import get_paragraph_texts_and_ids, get_segment_texts_and_ids
from models.segment_train import get_mpnet_embedder, get_openai_embedder,\
    syntactic_paragraph_features, predict_using_features_and_ensemble

In [None]:
# configure
# input_dir: directory whose files are read as JSON talks by the segmentation loop
# segment_model_path: pickled segment classifier, loaded with pickle below
# segment_threshold: passed to predict_using_features_and_ensemble
#   (presumably the classifier decision cutoff - confirm in models.segment_train)
# max_segment_len: passed to get_segment_texts_and_ids
#   (unit - characters/tokens/words - is not visible here; confirm in models.data_utils)
# output_dir / today: the segments are written to <output_dir>/<today>.json
input_dir = '../data/pre_process'
segment_model_path = '../data/segment/model/2023-04-12.pkl'
segment_threshold = 0.55
max_segment_len = 500
output_dir = '../data/segment/output'
today = datetime.today().strftime('%Y-%m-%d')

# spacy pipeline handed to the segmenter as `parser`
parser = spacy.load("en_core_web_sm")

# mpnet embedder for segmentation
mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
mpnet_embedder = get_mpnet_embedder(mpnet)

# openai embedder for segmentation
# credentials come from the environment; a missing variable raises KeyError here
openai.organization = os.environ['OPENAI_ORG']
openai.api_key = os.environ['OPENAI_KEY']
# The result is discarded, so this call presumably serves only as an early
# credentials/connectivity check.
# NOTE(review): `openai.Engine` appears to exist only in pre-1.0 versions of
# the openai package - confirm the pinned version before upgrading.
openai.Engine.list()
openai_embedder = get_openai_embedder(openai)

In [None]:
# Restore the trained segment classifier from its pickle file.
# NOTE: unpickling executes arbitrary code, so segment_model_path must only
# ever point at a trusted, locally produced model file.
with open(segment_model_path, mode='rb') as model_file:
    clf = pickle.load(model_file)

## Create segments

In [None]:
# Build the segmenter: a callable that is later applied to the list of
# paragraph texts of a single talk. It is assembled from the syntactic
# feature extractor, both embedders, the spaCy parser, the trained classifier
# and segment_threshold.
predictor = predict_using_features_and_ensemble(
    syntactic_paragraph_features,
    openai_embedder,
    mpnet_embedder,
    parser,
    clf,
    segment_threshold,
)

In [None]:
# read conf talks and split each one into segments
#
# Every regular file in input_dir is parsed as a JSON object; the keys
# 'content', 'year', 'month', 'url', 'title' and 'author' are read below.
segments = []

# Build the file list up front and sort it: os.listdir() returns entries in
# arbitrary order, so sorting keeps the segment order reproducible between
# runs, and a list (unlike a generator) lets tqdm display the total.
talk_paths = sorted(
    path
    for path in (os.path.join(input_dir, name) for name in os.listdir(input_dir))
    if os.path.isfile(path)
)

for filename in tqdm(talk_paths):
    # Decode explicitly as UTF-8 (the same encoding the segments are saved
    # with) instead of relying on the platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # get paragraphs: each entry is indexable, with the paragraph text first
    paragraph_texts_and_ids = get_paragraph_texts_and_ids(data['content'])
    paragraphs = [text_and_id[0] for text_and_id in paragraph_texts_and_ids]

    # get segments: predict the segmentation over the paragraph texts, then
    # group the paragraphs accordingly (max_segment_len is forwarded as-is)
    segmentation = predictor(paragraphs)
    segment_texts_and_ids = get_segment_texts_and_ids(
        paragraph_texts_and_ids,
        segmentation,
        max_segment_len=max_segment_len,
    )

    # create one record per segment, carrying the talk-level metadata along;
    # index 0 is the segment text and index 1 is used as its anchor
    segments.extend(
        {
            'year': data['year'],
            'month': data['month'],
            'url': data['url'],
            'anchor': segment_text_and_id[1],
            'title': data['title'],
            'author': data['author'],
            'text': segment_text_and_id[0],
        }
        for segment_text_and_id in segment_texts_and_ids
    )
len(segments)

In [None]:
# display the first segment record as a quick sanity check of its structure
segments[0]

## Save segments

In [None]:
# Make sure the output directory exists; open(..., 'w') would otherwise raise
# FileNotFoundError when it is missing.
os.makedirs(output_dir, exist_ok=True)

# One file per run date: re-running on the same day overwrites that day's file.
filename = os.path.join(output_dir, f"{today}.json")
# ensure_ascii=False writes non-ASCII characters verbatim, hence the explicit
# UTF-8 encoding on the file handle.
with open(filename, 'w', encoding='utf-8') as f:
    json.dump(segments, f, ensure_ascii=False, indent=2)