In [8]:
import re

def parse_text_file(file_path):
    """Parse a collated text file into ``(line_number, code, content)`` tuples.

    Each useful line has the form ``"<number> <code> | <content>"``. Lines that
    do not contain the ``" | "`` separator are skipped.

    Args:
        file_path: Path of the text file to read. The file is decoded as UTF-8
            regardless of the platform's default encoding.

    Returns:
        A list of ``(int line_number, str code, str content)`` tuples in file
        order; ``content`` has its trailing whitespace removed.

    Raises:
        ValueError: If the text before the separator is not exactly
            ``"<integer> <code>"``. The message names the offending file line.
    """
    list_of_tuples = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for position, line in enumerate(file, start=1):
            # Split on the first separator only, so the content may itself contain " | ".
            parts = line.split(" | ", 1)
            if len(parts) != 2:
                continue

            header_fields = parts[0].split()
            if len(header_fields) != 2:
                raise ValueError(
                    f"{file_path}, line {position}: expected '<number> <code>' before ' | ', "
                    f"got {parts[0]!r}"
                )
            number, code = header_fields
            try:
                line_number = int(number)
            except ValueError:
                raise ValueError(
                    f"{file_path}, line {position}: line number {number!r} is not an integer"
                ) from None

            list_of_tuples.append((line_number, code, parts[1].rstrip()))

    return list_of_tuples

In [9]:
def super_clean_text(input_string):
    """Reduce a string to its lowercase ASCII letters and digits.

    The input is lowercased and every character outside ``a-z`` / ``0-9``
    (spaces, punctuation, accented letters, ...) is removed.
    """
    # Lowercase first so that uppercase ASCII letters survive the filter.
    return re.sub(r'[^a-z0-9]', '', input_string.lower())


def get_index_of_sentence(s, text):
    """Find the line of ``text`` that a sentence most likely came from.

    Five overlapping slices are taken around the middle of ``s`` (each about
    half of its length, shifted left and right in steps of ``delta``). Every
    slice is normalised with ``super_clean_text`` and searched for in the
    equally normalised lines of ``text``; the first line containing a slice
    wins.

    Args:
        s: The sentence to locate.
        text: Multi-line text to search.

    Returns:
        The zero-based position of the matching line in ``text.splitlines()``
        (blank lines are counted, so the index lines up with any per-line
        metadata the caller holds), or ``-1`` when no slice matches.
    """
    # Blank lines are kept (they normalise to "" and can never match) so that
    # the returned index is the true line position, not a position among the
    # non-blank lines only.
    lines_clean = [super_clean_text(line) for line in text.splitlines()]

    median = len(s)//2
    quartile_size = len(s)//4
    delta = max(quartile_size//2,1)
    chunks = (
        s[max(0,median-quartile_size-(2*delta)):median+quartile_size-(2*delta)],
        s[median-quartile_size-(1*delta):median+quartile_size-(1*delta)],
        s[median-quartile_size:median+quartile_size],
        s[median-quartile_size+(1*delta):median+quartile_size+(1*delta)],
        s[median-quartile_size+(2*delta):median+quartile_size+(2*delta)],
    )
    for chunk in chunks:
        s_clean = super_clean_text(chunk)
        if not s_clean:
            # An empty needle is "in" every string; without this guard a very
            # short or punctuation-only slice would always report line 0.
            continue
        for i, lc in enumerate(lines_clean):
            if s_clean in lc:
                return i
    return -1

In [10]:
def split_text_into_sentences(text, line_numbers, codes):
    """Split a block of text into sentences and attribute each one to a source line.

    The text is normalised (tabs, ``*``/``#`` and single newlines removed, runs
    of newlines turned into paragraph markers), abbreviations and enumerators
    are shielded with a ``###NO_SPLIT###`` placeholder, and the result is split
    at sentence boundaries, before ``[YYYY-MM-DD`` timestamps, after timestamp
    headers and at paragraph markers.

    Args:
        text: Raw multi-line text to split.
        line_numbers: Sequence of line numbers, indexed by the line position
            that ``get_index_of_sentence`` reports for ``text``.
        codes: Sequence of codes, parallel to ``line_numbers``.

    Returns:
        ``(sentences, line_number_for_each_sentence, code_for_each_sentence)``,
        three parallel lists. A sentence that cannot be located in ``text``
        yields index ``-1`` and is therefore attributed to the LAST entry of
        ``line_numbers`` / ``codes``.
    """
    original_text = text
    # Remove extra tabs
    text = re.sub(r'\t+', ' ', text)

    # Remove all asterisks and pound signs
    text = re.sub(r'[\*#]', '', text)

    # Replace multiple newlines with a placeholder to identify paragraphs
    text = re.sub(r'\n{2,}', r'###PARAGRAPH###', text)

    # Replace single newlines with spaces
    text = re.sub(r'\n+', ' ', text)

    # Remove consecutive spaces
    text = re.sub(r'\s{2,}', ' ', text)

    # Special handling for "v." in legal citations and other exceptions
    text = re.sub(r'\b(v\.)\s+', r'\1###NO_SPLIT###', text)

    # Protect abbreviations, titles, section names, numbering like "1\.", and Roman numerals like "II.", "Pp."
    text = re.sub(r'\b(Dr|Mr|Ms|Mrs|U\.S|Jr|Sr|Cf|cf|art|Art|Pp|pp|[0-9]+|I{1,3}|IV|V|VI|VII|VIII|IX|X|XI|XII|Ibid)\.\s+', r'\1.###NO_SPLIT###', text)

    # Protect single-letter section names like "A.", "B."
    # The period is written back explicitly: the group captures only the
    # letter, so omitting it here silently turned "A. Foo" into "A Foo".
    text = re.sub(r'\b([A-Z])\.\s+', r'\1.###NO_SPLIT###', text)

    # Protect section numbers like "(1)", "(2)", "(a)"
    text = re.sub(r'\((\d+|[a-zA-Z])\)\s+', r'(\1)###NO_SPLIT###', text)

    def split_after_pattern(text, pattern):
        """Split ``text`` after each match of ``pattern`` (which must contain one capture group).

        Returns the matched headers and the text following them, stripped.
        Whatever precedes the first match is discarded; when nothing matches,
        ``text`` is returned unchanged as a single item.
        """
        # With a capturing group re.split keeps the matches: [before, match, after, ...]
        parts = re.split(pattern, text)

        sentences = []
        for i in range(1, len(parts)):  # skip parts[0], the text before the first match
            sentences.append(parts[i].strip())

        if len(parts)==1:
            sentences = [text]

        return sentences

    # Sentence-boundary pattern. Alternatives are tried in order at each position.
    sentence_endings = re.compile(
        r'(?<!###NO_SPLIT###)\.\s+(?=[A-Z])|'   # Period + whitespace before a capital (the period is consumed by the split)
        r'(?<=\.\")\s+(?=[A-Z])|'               # Period followed by quote and space
        r'(?<=[!?])\s+(?=[A-Z])|'              # Exclamation or question mark followed by space
        r'(?<=\))\s+(?=[A-Z])|'                 # Closing parenthesis followed by space
        r'(?<=\])\s+(?=[A-Z])|'                 # Closing bracket (e.g. footnote marker) followed by space and a capital
        r'(?<=\.\s)(?=\(?[A-Z])|'              # Zero-width split after ". " before a capital or "(" + capital
        r'(?<=\.\s)(?=[A-Z]\w+ v\.\s[A-Z])|'   # Handling for "Nixon v. Fitzgerald"
        r'(?=\[\d{4}-\d{2}-\d{2})|'             # Split before a timestamp pattern "[2024-07-23..."
        r'(?<=\.\s)(?=Pp\.\s[0-9]+)'           # Handling for "Pp. 24-28"
    )

    # Split the text into sentences
    sentences = sentence_endings.split(text.strip())

    # Split after the timestamp metadata of an assistant or tool message
    # ("[date time UTC] (Role) - Model: xxxxx: ").
    pattern = r'(\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} UTC\] \([A-Za-z]+\) - Model: [^\:]{5,6}: )'
    temp_sentences = []
    for s in sentences:
        temp_sentences.extend(split_after_pattern(s, pattern))
    sentences = temp_sentences

    # Split after the timestamp metadata of a user message
    pattern = r'(\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} UTC\] \(User\): )'
    temp_sentences = []
    for s in sentences:
        temp_sentences.extend(split_after_pattern(s, pattern))
    sentences = temp_sentences

    # Turn every protection placeholder back into the single space it replaced
    sentences = [s.replace("###NO_SPLIT###", " ") for s in sentences]

    # Collect stand-alone footnote markers ("[^12]"): consecutive markers are
    # concatenated and emitted as one item just before the next regular sentence.
    merged_sentences = []
    temp_sentence = ""
    for sentence in sentences:
        if re.match(r'^\[\^\d+\]$', sentence):
            temp_sentence += sentence
        else:
            if temp_sentence:
                merged_sentences.append(temp_sentence.strip())
                temp_sentence = ""
            merged_sentences.append(sentence.strip())

    # Ensure any remaining temp_sentence is added
    if temp_sentence:
        merged_sentences.append(temp_sentence.strip())

    # Split based on paragraph boundaries
    final_sentences_with_paragraphs = []
    for sentence in merged_sentences:
        final_sentences_with_paragraphs.extend(sentence.split('###PARAGRAPH###'))

    # Remove the unnecessary colons
    final_sentences_with_paragraphs = [re.sub(r'^: ', '', s).strip() for s in final_sentences_with_paragraphs]

    # Drop empty fragments
    cleaned_sentences = [sentence.strip() for sentence in final_sentences_with_paragraphs if sentence.strip()]

    # Attribute each sentence to the source line it was found on.
    line_number_for_each_sentence, code_for_each_sentence = [], []
    for s in cleaned_sentences:
        i = get_index_of_sentence(s, original_text)  # -1 (not found) selects the last entry
        line_number_for_each_sentence.append(line_numbers[i])
        code_for_each_sentence.append(codes[i])

    return cleaned_sentences, line_number_for_each_sentence, code_for_each_sentence

In [11]:
def sliding_window(list_of_tuples, n):
    """Build overlapping windows of ``n`` consecutive entries plus a wider context buffer.

    Args:
        list_of_tuples: Sequence of ``(line_number, code, content)`` tuples.
        n: Window size; must be at least 1.

    Returns:
        ``(whole_trajectory, extra_buffer_for_whole_trajectory)``, two parallel
        lists with one item per window start:

        * ``whole_trajectory[i]`` is ``(line_numbers, codes, content)`` where
          the first two are tuples of the window's values and ``content`` is
          the window's contents joined by newlines, right-stripped.
        * ``extra_buffer_for_whole_trajectory[i]`` is the same content (not
          stripped, each entry newline-terminated) preceded by up to two
          earlier contents and followed by up to two later contents.

        Both lists are empty when ``n`` exceeds ``len(list_of_tuples)``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"window size n must be at least 1, got {n!r}")

    whole_trajectory = []
    extra_buffer_for_whole_trajectory = []
    for i in range(len(list_of_tuples) - n + 1):
        window = list_of_tuples[i:i + n]
        cumulative_line_numbers = tuple(entry[0] for entry in window)
        cumulative_codes = tuple(entry[1] for entry in window)
        cumulative_content = "".join(entry[2] + "\n" for entry in window)
        whole_trajectory.append((cumulative_line_numbers, cumulative_codes, cumulative_content.rstrip()))

        # Slicing clamps at the ends of the list, so no exception handling is
        # needed for windows near the start or the end.
        following = [entry[2] for entry in list_of_tuples[i + n:i + n + 2]]
        preceding = [entry[2] for entry in list_of_tuples[max(0, i - 2):i]]
        temp = (
            "".join(content + "\n" for content in preceding)
            + cumulative_content
            + "\n".join(following)
        )
        extra_buffer_for_whole_trajectory.append(temp)
    return whole_trajectory, extra_buffer_for_whole_trajectory

In [12]:
from functools import cmp_to_key
import re


def check_if_string_is_contained_by_a_dict_item(string, dict_of_items):
    """Report whether ``string`` is a substring of any key's third field.

    Keys of ``dict_of_items`` are tuples whose element ``[2]`` is text.

    Returns:
        ``(True, value)`` for the first key (in dict order) whose text contains
        ``string``, otherwise ``(False, None)``.
    """
    not_found = object()
    hit = next(
        (value for entry, value in dict_of_items.items() if string in entry[2]),
        not_found,
    )
    if hit is not_found:
        return False, None
    return True, hit


def check_if_string_contains_a_dict_item(string, dict_of_items):
    """Report whether any key's third field is a substring of ``string``.

    The first such key (in dict order) is REMOVED from ``dict_of_items``; the
    caller is expected to insert the larger entry in its place.

    Returns:
        ``(True, removed_value)`` when a key was found and removed, otherwise
        ``(False, None)``.
    """
    for entry in dict_of_items:
        if entry[2] in string:
            # Returning immediately makes it safe to shrink the dict mid-iteration.
            return True, dict_of_items.pop(entry)
    return False, None


def handle_potential_overlap_between_candidate_and_pool(candidate_sentence, line_number, code, pool_dict, index):
    """Insert a candidate sentence into the pool unless a pool entry already covers it.

    * If an existing entry's text contains the candidate, nothing changes.
    * If the candidate contains an existing entry's text, that entry is
      replaced by the candidate, which inherits the old entry's value.
    * Otherwise the candidate is added with value ``index``.

    Args:
        candidate_sentence: Sentence text to insert.
        line_number: Line number stored in the new key.
        code: Code stored in the new key.
        pool_dict: Dict keyed by ``(line_number, code, sentence)``; mutated in place.
        index: Next unused value.

    Returns:
        The next unused value: ``index + 1`` when a brand-new entry was added,
        otherwise ``index`` unchanged.
    """
    already_covered, _ = check_if_string_is_contained_by_a_dict_item(candidate_sentence, pool_dict)
    if already_covered:
        return index

    new_key = (line_number, code, candidate_sentence)
    absorbs_existing, inherited_index = check_if_string_contains_a_dict_item(candidate_sentence, pool_dict)
    if absorbs_existing:
        # The shorter entry was removed by the check above; take over its value.
        pool_dict[new_key] = inherited_index
        return index

    pool_dict[new_key] = index
    return index + 1


def combine_sentence_lists(list1, list2, list1_line_numbers, list1_codes, list2_line_numbers, list2_codes):
    """Merge two sentence lists, preferring the longer sentence where one contains the other.

    Every pair ``(s1, s2)`` is compared: when one sentence strictly contains
    the other only the longer one is offered to the pool, when they are equal
    ``s1`` is offered, and when they are unrelated both are offered. Offers go
    through ``handle_potential_overlap_between_candidate_and_pool``, which also
    de-duplicates against what is already pooled.

    Args:
        list1, list2: Sentence strings.
        list1_line_numbers, list1_codes: Metadata parallel to ``list1``.
        list2_line_numbers, list2_codes: Metadata parallel to ``list2``.

    Returns:
        Dict mapping ``(line_number, code, sentence)`` to the value assigned
        when the entry was first added.
    """
    combined_sentences_dict = dict()
    index = 0

    if not list1 or not list2:
        # The pairwise loop below never executes when either list is empty,
        # which used to discard every sentence of the non-empty list.
        for i, s1 in enumerate(list1):
            index = handle_potential_overlap_between_candidate_and_pool(s1, list1_line_numbers[i], list1_codes[i], combined_sentences_dict, index)
        for j, s2 in enumerate(list2):
            index = handle_potential_overlap_between_candidate_and_pool(s2, list2_line_numbers[j], list2_codes[j], combined_sentences_dict, index)
        return combined_sentences_dict

    for i, s1 in enumerate(list1):
        for j, s2 in enumerate(list2):
            if (s1 in s2) and (s2 not in s1):  # s2 strictly contains s1: keep s2
                index = handle_potential_overlap_between_candidate_and_pool(s2, list2_line_numbers[j], list2_codes[j], combined_sentences_dict, index)
            elif s2 in s1:  # s1 strictly contains s2, or they are equal: keep s1
                index = handle_potential_overlap_between_candidate_and_pool(s1, list1_line_numbers[i], list1_codes[i], combined_sentences_dict, index)
            else:  # unrelated sentences: keep both
                index = handle_potential_overlap_between_candidate_and_pool(s1, list1_line_numbers[i], list1_codes[i], combined_sentences_dict, index)
                index = handle_potential_overlap_between_candidate_and_pool(s2, list2_line_numbers[j], list2_codes[j], combined_sentences_dict, index)
    return combined_sentences_dict


def determine_sentence_order(essay, sentence_container, sentence_prime_container):
    """Three-way comparison of two sentence tuples by where they first occur in ``essay``.

    Each container's element ``[2]`` is the sentence text. Essay and sentences
    are normalised with ``super_clean_text``; the last two characters of each
    sentence are dropped before normalising. A sentence that is not found has
    position ``-1``.

    Returns:
        ``-1`` if the first sentence occurs earlier, ``1`` if it occurs later,
        ``0`` if both resolve to the same position.
    """
    essay_c = super_clean_text(essay)

    def position_of(container):
        # burn the last 2 chars before normalising
        return essay_c.find(super_clean_text(container[2][:-2]))

    index_sentence = position_of(sentence_container)
    index_sentence_prime = position_of(sentence_prime_container)

    # bool arithmetic yields -1 / 0 / 1
    return (index_sentence > index_sentence_prime) - (index_sentence < index_sentence_prime)

def sort_sentences_by_order(sentence_list, essay):
    """Return the sentence tuples sorted by their order of appearance in ``essay``.

    Ordering is delegated to ``determine_sentence_order``; the input list is
    not modified.
    """
    def by_position_in_essay(left, right):
        return determine_sentence_order(essay, left, right)

    # cmp_to_key adapts the two-argument comparison to sorted()'s key protocol.
    return sorted(sentence_list, key=cmp_to_key(by_position_in_essay))


def grab_col_from_list_of_tuples(list_of_tuples, col_number):
    """Return element ``col_number`` of every tuple, in order, as a list."""
    column = []
    for row in list_of_tuples:
        column.append(row[col_number])
    return column


In [None]:
# Load the collated file into (line_number, code, content) tuples.
# NOTE(review): hardcoded absolute path; presumably a Colab upload location, confirm.
file_path = '/content/collated-unredacted.txt'
list_of_tuples = parse_text_file(file_path)
for item in list_of_tuples[:5]:  # Print the first 5 tuples for verification
    print(item)

# Build overlapping windows of 4 consecutive entries, plus a wider context buffer per window.
whole_trajectory2, extra_buffer_for_whole_trajectory = sliding_window(list_of_tuples, 4)

In [14]:
# Merge the sentence lists of consecutive windows into `full_merged`, one ordered
# list of (line_number, code, sentence) tuples.
# This cell indexes windows 0 and 1 directly, so at least two windows are required.
# It calls the three-argument split_text_into_sentences; a later cell redefines that
# name with a one-argument version, so this cell must run before that one.
full_merged = []

prev_line_numbers, prev_codes, prev_content = whole_trajectory2[0]
prev_extra_buffer = extra_buffer_for_whole_trajectory[0]

line_numbers, codes, content = whole_trajectory2[1]
extra_buffer = extra_buffer_for_whole_trajectory[1]

prev_as_sentences, prev_line_numbers_each_sentence, prev_codes_each_sentence = split_text_into_sentences(prev_content, prev_line_numbers, prev_codes)
as_sentences, line_numbers_each_sentence, codes_each_sentence = split_text_into_sentences(content, line_numbers, codes)

# Seed the result with the first two windows.
temp_dict = combine_sentence_lists(prev_as_sentences, as_sentences, prev_line_numbers_each_sentence, prev_codes_each_sentence, line_numbers_each_sentence, codes_each_sentence)
temp_merged = list(temp_dict.keys())
temp_merged_sorted = sort_sentences_by_order(temp_merged, extra_buffer)
full_merged += list(temp_merged_sorted)

for i, window in enumerate(whole_trajectory2[2:], start=2):
    line_numbers, codes, content = window
    extra_buffer = extra_buffer_for_whole_trajectory[i]

    nextchunk_as_sentences, nextchunk_line_numbers, nextchunk_codes = split_text_into_sentences(content, line_numbers, codes)

    # Re-merge only as many trailing sentences as the new window contributes.
    snorf_amount = min(len(nextchunk_as_sentences), len(full_merged))
    if snorf_amount == 0:
        # full_merged[-0:] is the WHOLE list and full_merged[:-0] is empty, so the
        # normal path below would throw away everything merged so far. Either this
        # window produced no sentences (nothing to add) or nothing has been merged
        # yet (take the window's sentences as they are).
        fresh_tuples = list(zip(nextchunk_line_numbers, nextchunk_codes, nextchunk_as_sentences))
        full_merged += sort_sentences_by_order(fresh_tuples, extra_buffer)
        continue

    input1 = full_merged[(-1*snorf_amount):]       #aliquot the last chunk
    full_merged = full_merged[:(-1*snorf_amount)]  #yes really remove it

    input1_line_numbers = grab_col_from_list_of_tuples(input1, 0)
    input1_codes = grab_col_from_list_of_tuples(input1, 1)
    input1_as_sentences = grab_col_from_list_of_tuples(input1, 2)

    temp_dict = combine_sentence_lists(input1_as_sentences, nextchunk_as_sentences, input1_line_numbers, input1_codes, nextchunk_line_numbers, nextchunk_codes)
    temp_merged = list(temp_dict.keys())
    temp_merged_sorted = sort_sentences_by_order(temp_merged, extra_buffer)
    full_merged += list(temp_merged_sorted)


In [None]:
# Print every merged (line_number, code, sentence) tuple for manual inspection.
for s in full_merged:
    print(s)


In [19]:
from datetime import datetime, timezone

def collect_all_timestamp_sentences(sentenceTuples_list):
    """Extract the timestamp-header sentences and attach a parsed UTC datetime to each.

    A tuple qualifies when its third field starts with
    ``[YYYY-MM-DD HH:MM:SS UTC]``, e.g. ``'[2024-08-19 00:03:56 UTC] (User):'``.

    Args:
        sentenceTuples_list: Iterable of tuples whose element ``[2]`` is text.

    Returns:
        A list of ``(t[0], t[1], t[2], datetime)`` tuples sorted by ``t[0]``,
        where ``datetime`` is timezone-aware (UTC).
    """
    header_pattern = r'^\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} UTC\]'
    headers = [entry for entry in sentenceTuples_list if re.match(header_pattern, entry[2])]
    # Order by the first tuple field
    headers.sort(key=lambda x: x[0])

    parsed_headers = []
    for entry in headers:
        # Keep only the leading "[...]" part and parse it
        bracketed = re.search(r'(\[.*?\])', entry[2]).group(1)
        moment = datetime.strptime(bracketed, '[%Y-%m-%d %H:%M:%S UTC]').replace(tzinfo=timezone.utc)
        parsed_headers.append((entry[0], entry[1], entry[2], moment))
    return parsed_headers

# Timestamp headers found in the merged sentences, each with a parsed UTC datetime appended.
all_timestamp_sentences = collect_all_timestamp_sentences(full_merged)


In [None]:
# Print every timestamp tuple for manual inspection.
for s in all_timestamp_sentences:
    print(s)

In [None]:
# Display the first merged tuple as the cell output.
full_merged[0]

In [None]:
import torch
# Sanity check that PyTorch can see a CUDA device before loading the embedding model.
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.current_device())  # Should return 0 (the index of the GPU)
print(torch.cuda.get_device_name(0))  # Should return "Tesla T4"


In [None]:
!pip install sentence-transformers

In [None]:
import time
from sentence_transformers import SentenceTransformer, util

# Rough throughput check: time how long the model takes to encode 1000 copies of one sentence.
model = SentenceTransformer('all-mpnet-base-v2', device='cuda')
sentences = ["This is a test sentence"] * 1000

start_time = time.time()
embeddings = model.encode(sentences, convert_to_tensor=True)
end_time = time.time()

print(f"Time taken: {end_time - start_time} seconds")

In [22]:
def compute_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a plain number.

    Both embeddings are moved to the ``'cuda'`` device before comparison.
    Assumes each argument is a single embedding so the result holds exactly
    one element (TODO confirm against callers).
    """
    first_on_gpu = embedding1.to('cuda')
    second_on_gpu = embedding2.to('cuda')
    similarity_matrix = util.pytorch_cos_sim(first_on_gpu, second_on_gpu)
    return similarity_matrix.item()


def get_best_hit(query_sentence, subject_pool):
    """Find the pool entry whose sentence is most similar to the query.

    Args:
        query_sentence: Text to compare against the pool.
        subject_pool: Iterable of tuples whose element ``[2]`` is sentence text.

    Returns:
        ``(best_tuple, similarity)``. When the pool is empty (or no similarity
        exceeds ``-1.0``) the result is ``(None, -1.0)``. On ties the earliest
        entry wins because only a strictly greater similarity replaces it.
    """
    best_hit = (None, -1.0)
    query_sentence_embedding, _ = get_message_embedding(query_sentence)
    for candidate_sentence_tuple in subject_pool:
        candidate_embedding, _ = get_message_embedding(candidate_sentence_tuple[2])
        similarity = compute_similarity(query_sentence_embedding, candidate_embedding)
        if similarity > best_hit[1]:
            best_hit = (candidate_sentence_tuple, similarity)
    return best_hit


# Maps each merged sentence tuple to the timestamp tuple found for it; filled in by a later cell.
SENTENCE_TIMESTAMPS = {}


def get_timestamp_just_before(index_of_sentence_tuple, ts_sentences_tuple_list):
    """Return the last timestamp tuple located at or before the given index.

    Args:
        index_of_sentence_tuple: Position value of the sentence (compared
            against element ``[0]`` of the timestamp tuples).
        ts_sentences_tuple_list: Timestamp tuples sorted ascending by ``[0]``.

    Returns:
        The last tuple whose ``[0]`` is not greater than
        ``index_of_sentence_tuple``, or ``None`` when the sentence precedes
        every timestamp (or the list is empty).
    """
    timestamp_just_before = None
    for ts_sentence_tuple in ts_sentences_tuple_list:
        if ts_sentence_tuple[0] > index_of_sentence_tuple:
            # First timestamp past the sentence: whatever was seen so far is the answer.
            # (Indexing with i-1 here used to wrap around to the LAST timestamp when i == 0.)
            break
        timestamp_just_before = ts_sentence_tuple
    # Falling off the end means the sentence follows the final timestamp, which
    # is then the right answer (previously None was returned in that case).
    return timestamp_just_before

In [23]:
# Record, for every merged sentence tuple, the timestamp tuple returned by
# get_timestamp_just_before for the sentence's first field.
for s in full_merged:
    index_of_sentence_tuple = s[0]
    timestamp_just_before = get_timestamp_just_before(index_of_sentence_tuple, all_timestamp_sentences)
    SENTENCE_TIMESTAMPS[s] = timestamp_just_before

In [24]:
from sentence_transformers import SentenceTransformer, util
import hashlib

# Embedding model used by get_message_embedding (the same model name is also loaded in an earlier cell).
model = SentenceTransformer('all-mpnet-base-v2', device='cuda')
# NOTE(review): SIMILARITY_THRESHOLD is not referenced by any code visible in this notebook; confirm it is still needed.
SIMILARITY_THRESHOLD = 0.15
# Cache of embeddings keyed by the SHA-256 hex digest of the embedded text.
EMBEDDING_TRACKER_DICT = {}

def get_message_embedding(message_text):
    """Return ``(embedding, sha256_hex)`` for a text, computing the embedding at most once.

    The SHA-256 hex digest of the UTF-8 encoded text is the cache key in
    ``EMBEDDING_TRACKER_DICT``. On a cache miss the text is encoded with the
    module-level ``model`` on the ``'cuda'`` device and the result is cached.
    """
    digest = hashlib.sha256(message_text.encode('utf-8')).hexdigest()
    try:
        return EMBEDDING_TRACKER_DICT[digest], digest
    except KeyError:
        pass  # not cached yet; compute it below

    fresh_embedding = model.encode(message_text, convert_to_tensor=True, device='cuda')
    EMBEDDING_TRACKER_DICT[digest] = fresh_embedding
    return fresh_embedding, digest

def get_sentences_before(ts, source_pool, ts_map_dict):
    """Keep the pool entries whose mapped timestamp is strictly earlier than ``ts``.

    Args:
        ts: Cut-off value, compared with ``<``.
        source_pool: Iterable of hashable entries that are keys of ``ts_map_dict``.
        ts_map_dict: Maps each entry to ``None`` or to a tuple whose element
            ``[3]`` is the entry's timestamp.

    Returns:
        A list, in pool order, of the entries with a timestamp earlier than
        ``ts``. Entries mapped to ``None`` are left out.
    """
    return [
        candidate_tuple
        for candidate_tuple in source_pool
        if ts_map_dict[candidate_tuple] is not None and ts_map_dict[candidate_tuple][3] < ts
    ]

In [None]:
import pickle

# Embed every merged sentence (filling EMBEDDING_TRACKER_DICT) and print each
# tuple's first field as a progress indicator.
for sentence_tuple in full_merged:
    sentence = sentence_tuple[2]
    sentence_embedding, sentence_hash = get_message_embedding(sentence)
    # get_message_embedding has already cached this value under the same key; the assignment repeats it.
    EMBEDDING_TRACKER_DICT[sentence_hash] = sentence_embedding
    print(sentence_tuple[0])


# Persist the cache so the embeddings do not have to be recomputed.
with open('EMBEDDING_TRACKER_DICT.pickle', 'wb') as handle:
    # Move embeddings to CPU before pickling
    EMBEDDING_TRACKER_DICT_CPU = {k: v.cpu() for k, v in EMBEDDING_TRACKER_DICT.items()}
    pickle.dump(EMBEDDING_TRACKER_DICT_CPU, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# for reloading the pickle file
# SECURITY: pickle.load can execute arbitrary code; only load a file this notebook wrote itself.
# Relies on `pickle` having been imported by the cell that writes the file.
with open('EMBEDDING_TRACKER_DICT.pickle', 'rb') as handle:
    EMBEDDING_TRACKER_DICT = pickle.load(handle)
    # Move back to CUDA
    EMBEDDING_TRACKER_DICT = {k: v.to('cuda') for k, v in EMBEDDING_TRACKER_DICT.items()}


In [53]:
def get_ancestry(query_string, subject_pool, ts_map_dict, outfile=None, min_similarity=0.1):
    """Trace a chain of most-similar earlier sentences, printing one line per link.

    Starting from ``query_string``, the most similar entry of ``subject_pool``
    is reported; that entry's sentence becomes the next query and the pool is
    narrowed to entries timestamped strictly before it. The chain ends when the
    pool yields no hit, when the hit has no timestamp, or when the similarity
    is not greater than ``min_similarity``.

    Args:
        query_string: Text to start from.
        subject_pool: Iterable of ``(line_number, code, sentence)`` tuples.
        ts_map_dict: Maps each pool tuple to ``None`` or to a tuple whose
            element ``[3]`` is a datetime.
        outfile: Optional writable text file; each reported line is also
            written to it. ``None`` (default) prints only.
        min_similarity: The chain continues only while the similarity is
            strictly greater than this value.

    Returns:
        None. Output is produced via ``print`` and ``outfile``.
    """
    # Iterative rather than recursive so a long chain cannot hit the recursion limit.
    while True:
        best_tuple, similarity = get_best_hit(query_string, subject_pool)
        if not best_tuple:
            return

        timestamp_entry = ts_map_dict[best_tuple]
        if timestamp_entry is None:
            # The hit cannot be placed in time (this used to raise TypeError), so the chain ends here.
            return
        ts_of_best = timestamp_entry[3]

        ts = ts_of_best.strftime('%Y-%m-%d %H:%M:%S UTC')
        line_number_constant_width = str(best_tuple[0]).rjust(6)
        line_category = best_tuple[1]
        content = best_tuple[2]
        formatted_line = f"{ts}: {line_number_constant_width} {line_category} {similarity:.8f} {content}"
        print(formatted_line)
        if outfile is not None:
            outfile.write(formatted_line + '\n')  # Write to file with a newline character

        if not similarity > min_similarity:
            return

        # Continue from the hit, looking only at strictly earlier sentences.
        query_string = content
        subject_pool = get_sentences_before(ts_of_best, subject_pool, ts_map_dict)


In [None]:
import os

query = "My spoon is too big."
current_datetime_utc = datetime.now(timezone.utc)
ts = current_datetime_utc.strftime('%Y-%m-%d %H:%M:%S UTC')
print(f"{ts}:                       {query}")
# Pass an explicit outfile (get_ancestry's fourth parameter). Every line is already
# printed by get_ancestry, so the file copy is sent to the null device here.
with open(os.devnull, 'w') as discard:
    result = get_ancestry(query, full_merged, SENTENCE_TIMESTAMPS, discard)
#print(result)

In [None]:
!pip install odfpy
!sudo apt-get install pandoc

In [None]:
import subprocess
import os
import re


def convert_odt_to_txt(odt_file_path, txt_file_path):
    """Convert an ODT document to a text file with pandoc and return the text.

    Runs ``pandoc -s <odt_file_path> -o <txt_file_path>`` and then reads the
    output file as UTF-8.

    Raises:
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    subprocess.run(['pandoc', '-s', odt_file_path, '-o', txt_file_path], check=True)

    with open(txt_file_path, 'r', encoding='utf-8') as converted:
        return converted.read()


def split_text_into_sentences(text):
    """Split a block of text into a list of sentences.

    The text is normalised (tabs, ``*``/``#`` and single newlines removed, runs
    of newlines turned into paragraph markers), abbreviations and enumerators
    are shielded with a ``###NO_SPLIT###`` placeholder, and the result is split
    at sentence boundaries, before ``[YYYY-MM-DD`` timestamps, after timestamp
    headers and at paragraph markers. Empty fragments are dropped.

    NOTE: this definition has the same name as the three-argument version
    defined earlier in the notebook and replaces it once this cell has run;
    cells that call the three-argument form must be executed before this one.

    Args:
        text: Raw multi-line text to split.

    Returns:
        List of stripped, non-empty sentence strings in document order.
    """
    # Remove extra tabs
    text = re.sub(r'\t+', ' ', text)

    # Remove all asterisks and pound signs
    text = re.sub(r'[\*#]', '', text)

    # Replace multiple newlines with a placeholder to identify paragraphs
    text = re.sub(r'\n{2,}', r'###PARAGRAPH###', text)

    # Replace single newlines with spaces
    text = re.sub(r'\n+', ' ', text)

    # Remove consecutive spaces
    text = re.sub(r'\s{2,}', ' ', text)

    # Special handling for "v." in legal citations and other exceptions
    text = re.sub(r'\b(v\.)\s+', r'\1###NO_SPLIT###', text)

    # Protect abbreviations, titles, section names, numbering like "1\.", and Roman numerals like "II.", "Pp."
    text = re.sub(r'\b(Dr|Mr|Ms|Mrs|U\.S|Jr|Sr|Cf|cf|art|Art|Pp|pp|[0-9]+|I{1,3}|IV|V|VI|VII|VIII|IX|X|XI|XII|Ibid)\.\s+', r'\1.###NO_SPLIT###', text)

    # Protect single-letter section names like "A.", "B."
    # The period is written back explicitly: the group captures only the
    # letter, so omitting it here silently turned "A. Foo" into "A Foo".
    text = re.sub(r'\b([A-Z])\.\s+', r'\1.###NO_SPLIT###', text)

    # Protect section numbers like "(1)", "(2)", "(a)"
    text = re.sub(r'\((\d+|[a-zA-Z])\)\s+', r'(\1)###NO_SPLIT###', text)

    def split_after_pattern(text, pattern):
        """Split ``text`` after each match of ``pattern`` (which must contain one capture group).

        Returns the matched headers and the text following them, stripped.
        Whatever precedes the first match is discarded; when nothing matches,
        ``text`` is returned unchanged as a single item.
        """
        # With a capturing group re.split keeps the matches: [before, match, after, ...]
        parts = re.split(pattern, text)

        sentences = []
        for i in range(1, len(parts)):  # skip parts[0], the text before the first match
            sentences.append(parts[i].strip())

        if len(parts)==1:
            sentences = [text]

        return sentences

    # Sentence-boundary pattern. Alternatives are tried in order at each position.
    sentence_endings = re.compile(
        r'(?<!###NO_SPLIT###)\.\s+(?=[A-Z])|'   # Period + whitespace before a capital (the period is consumed by the split)
        r'(?<=\.\")\s+(?=[A-Z])|'               # Period followed by quote and space
        r'(?<=[!?])\s+(?=[A-Z])|'              # Exclamation or question mark followed by space
        r'(?<=\))\s+(?=[A-Z])|'                 # Closing parenthesis followed by space
        r'(?<=\])\s+(?=[A-Z])|'                 # Closing bracket (e.g. footnote marker) followed by space and a capital
        r'(?<=\.\s)(?=\(?[A-Z])|'              # Zero-width split after ". " before a capital or "(" + capital
        r'(?<=\.\s)(?=[A-Z]\w+ v\.\s[A-Z])|'   # Handling for "Nixon v. Fitzgerald"
        r'(?=\[\d{4}-\d{2}-\d{2})|'             # Split before a timestamp pattern "[2024-07-23..."
        r'(?<=\.\s)(?=Pp\.\s[0-9]+)'           # Handling for "Pp. 24-28"
    )

    # Split the text into sentences
    sentences = sentence_endings.split(text.strip())

    # Split after the timestamp metadata of an assistant or tool message
    # ("[date time UTC] (Role) - Model: xxxxx: ").
    pattern = r'(\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} UTC\] \([A-Za-z]+\) - Model: [^\:]{5,6}: )'
    temp_sentences = []
    for s in sentences:
        temp_sentences.extend(split_after_pattern(s, pattern))
    sentences = temp_sentences

    # Split after the timestamp metadata of a user message
    pattern = r'(\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} UTC\] \(User\): )'
    temp_sentences = []
    for s in sentences:
        temp_sentences.extend(split_after_pattern(s, pattern))
    sentences = temp_sentences

    # Turn every protection placeholder back into the single space it replaced
    sentences = [s.replace("###NO_SPLIT###", " ") for s in sentences]

    # Collect stand-alone footnote markers ("[^12]"): consecutive markers are
    # concatenated and emitted as one item just before the next regular sentence.
    merged_sentences = []
    temp_sentence = ""
    for sentence in sentences:
        if re.match(r'^\[\^\d+\]$', sentence):
            temp_sentence += sentence
        else:
            if temp_sentence:
                merged_sentences.append(temp_sentence.strip())
                temp_sentence = ""
            merged_sentences.append(sentence.strip())

    # Ensure any remaining temp_sentence is added
    if temp_sentence:
        merged_sentences.append(temp_sentence.strip())

    # Split based on paragraph boundaries
    final_sentences_with_paragraphs = []
    for sentence in merged_sentences:
        final_sentences_with_paragraphs.extend(sentence.split('###PARAGRAPH###'))

    # Remove the unnecessary colons
    final_sentences_with_paragraphs = [re.sub(r'^: ', '', s).strip() for s in final_sentences_with_paragraphs]

    # Drop empty fragments
    cleaned_sentences = [sentence.strip() for sentence in final_sentences_with_paragraphs if sentence.strip()]

    return cleaned_sentences



# NOTE(review): hardcoded absolute paths; presumably Colab upload locations, confirm.
odt_file_path = "/content/PerfectLegitimateBribe-anonymized.odt"
txt_file_path = "/content/PerfectLegitimateBribe-anonymized.txt"

# Convert the ODT file to a TXT file and read the content
extracted_text = convert_odt_to_txt(odt_file_path, txt_file_path)

# Uses the one-argument split_text_into_sentences defined in this cell.
sentences_list = split_text_into_sentences(extracted_text)


# Print every sentence with its position for manual inspection.
for i, sentence in enumerate(sentences_list):
    print(f'sentence #{i}: {sentence}')

In [119]:
# For every sentence of the converted document, log the sentence and then its
# ancestry chain, both to the notebook output and to 'ancestry-unredacted.txt'.
# The file is opened in append mode, so re-running this cell adds to earlier output.
# `ts` is computed once, so every sentence header carries the same timestamp.
current_datetime_utc = datetime.now(timezone.utc)
ts = current_datetime_utc.strftime('%Y-%m-%d %H:%M:%S UTC')
with open('ancestry-unredacted.txt', 'a') as file:
    for i, sentence in enumerate(sentences_list):
        i_constant_width = str(i).rjust(4)
        formatted_line = f"{ts}:    sentence #{i_constant_width}:    {sentence}"
        print(formatted_line)
        file.write(formatted_line + '\n')  # Write to file with a newline character

        # get_ancestry has no return statement with a value, so `result` is always None.
        result = get_ancestry(sentence, full_merged, SENTENCE_TIMESTAMPS, file)

        print("------")
        file.write("------\n")  # Write the separator to the file