In [7]:
import json
import re
import os

SCRIPT = '../scripts'
SPM = '../spms'
project_id = 11
script_path = os.path.join(SCRIPT, f"{project_id}_transcription.json")

# Load the JSON data from the files
with open(os.path.join(SCRIPT, 'test_segment_gpt.json'), 'r') as file:
    segment_times = json.load(file)

script = ''
for segment in segment_times:
    script += segment['text']

script = script.strip()

with open(script_path, 'w') as file:
    json.dump(script, file)
    



In [8]:
import base64
import requests
import os
import json
from PIL import Image
from pdf2image import convert_from_path

PDF = '../pdfs'
TOC = '../tocs'
IMAGE = '../images'
SCRIPT = '../scripts'
SPM = '../spms'

# OpenAI API Key
api_key = "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"
   
# Function to encode the image
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

# Read script file
def read_script(script_path):
    with open(script_path, "r") as script_file:
        script_content = script_file.read()
        return json.loads(script_content)

# Path to your image and script
image_directory = os.path.join(IMAGE, f"{str(project_id)}")
script_path = os.path.join(SCRIPT, f"{project_id}_transcription.json")

image_paths = sorted([os.path.join(image_directory, f) for f in os.listdir(image_directory) if f.lower().endswith('.png')])

# print(image_paths)
encoded_images = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(image)}"}} for image in image_paths]

# Read the script
script_content = read_script(script_path)

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Creating the content for the messages
content = [
    {
        "type": "text",
        "text": (
            "Given the following lecture notes images and the lecture script, "
            "please distribute the script content accurately to each page of the lecture notes. "
            "The output should be in the format: {\"1\": \"script\", \"2\": \"script\", ...}. "
            "Each key should correspond to the page number in the lecture notes where the script content appears, "
            "and the value should be the first sentence of the script content for that page. "
            "The value corresponding to a larger key must be a sentence that appears later in the script."
            f"The number of dictionary keys must be equal to {len(encoded_images)}. "
            "The original format of the script, including uppercase and lowercase letters, punctuation marks such as periods and commas, must be preserved without any alterations. "
            f"Lecture script: {script_content} "
        )
    }
] + encoded_images

payload = {
    "model": "gpt-4o",
    "response_format": {"type": "json_object"},
    "messages": [
        {
            "role": "system", 
            "content": "You are a helpful assistant designed to output JSON."
        },
        {
            "role": "user",
            "content": content
        }
    ],
    "max_tokens": 2000,
}

response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

# Get the response
response_data = response.json()

# Check if 'choices' key exists in the response
if 'choices' in response_data and len(response_data['choices']) > 0:
    # Parse the table of contents from the response
    script_text = response_data['choices'][0]['message']['content']

    print("Raw script text:", script_text)
    
    # Convert the script text to JSON format
    try:
        script_data = json.loads(script_text)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        script_data = {"error": "Failed to decode JSON"}
else:
    print("Error: 'choices' key not found in the response")
    print(response_data)
    script_data = {"error": "Failed to retrieve scripts"}

# Save the script data as a JSON file
script_json_path = os.path.join(SPM, f"{project_id}_spm.json")
with open(script_json_path, "w") as json_file:
    json.dump(script_data, json_file, indent=4)

print(f"Script saved to {script_json_path}")




Raw script text: {
  "1": "Okay, so welcome to Lecture 2 of CS231N.",
  "2": "And it's sort of easy for emails to get lost in the shuffle if you just send to the course list.",
  "3": "Probably that doesn't affect those of you who are sitting in the room right now, but for those students listening on SCPD.",
  "4": "We're just reshuffling it a little bit to make it, like for example, upgrading to work with Python 3 rather than Python 2.7, and some of these minor cosmetic changes.",
  "5": "NumPy lets you write these very efficient vectorized operations that let you do quite a lot of computation in just a couple lines of code.",
  "6": "Because Google has very generously supported this course, we'll be able to distribute to each of you coupons that let you use Google Cloud credits for free for the class.",
  "7": "So the last lecture we talked a little bit about this task of image classification, which is really a core task in computer vision.",
  "8": "But this is actually a really, re

In [14]:
import re
from difflib import SequenceMatcher

def preprocess_text(text):
    # Remove punctuation and lowercase the text
    return re.sub(r'[^\w\s]', '', text).lower()

def find_best_match(script_content, sentence):
    # Initialize the SequenceMatcher with the script content and the sentence
    matcher = SequenceMatcher(None, script_content, sentence)
    match = matcher.find_longest_match(0, len(script_content), 0, len(sentence))
    
    if match.size > 0:
        return match.a  # Start index of the match in the script content
    else:
        return -1  # No match found

def match_paragraphs_2(script_content, first_sentences):
    matched_paragraphs = {}
    page_numbers = sorted(first_sentences.keys(), key=int)
    
    script_content_processed = preprocess_text(script_content)

    for i, page in enumerate(page_numbers):
        current_sentence = preprocess_text(first_sentences[page])
        next_sentence = (
            preprocess_text(first_sentences[str(int(page) + 1)])
            if i < len(page_numbers) - 1
            else None
        )

        try:
            start_index = find_best_match(script_content_processed, current_sentence)
            if start_index == -1:
                print(f"Match not found for page {page}")
                continue

            if i == len(page_numbers) - 1:  # Last page
                matched_paragraphs[page] = script_content[start_index:].strip()
                print(page, start_index)
            else:
                end_index = find_best_match(script_content_processed, next_sentence)
                if end_index == -1:
                    matched_paragraphs[page] = script_content[start_index:].strip()
                else:
                    matched_paragraphs[page] = script_content[start_index:end_index].strip()
                print(page, start_index, end_index)
        except Exception as e:
            print(f"Error occurred at page {page}: {str(e)}")

    return matched_paragraphs


In [61]:
import os
import json
import re

SCRIPT = '../scripts'
SPM = '../spms'

# Load the first sentences JSON file
def load_first_sentences(json_path):
    with open(json_path, "r") as json_file:
        return json.load(json_file)

# Read script file
def read_script(script_path):
    with open(script_path, "r") as script_file:
        script_content = script_file.read()
        return json.loads(script_content)
    
# Match the paragraphs to the first sentences
def match_paragraphs(script_content, first_sentences):
    matched_paragraphs = {}
    page_numbers = sorted(first_sentences.keys(), key=int)

    # # Function to remove punctuation and lowercase the text
    # def preprocess_text(text):
    #     return re.sub(r'[^\w\s]', '', text).lower()

    script_content_processed = script_content
    prev_start_index = 0
    for i, page in enumerate(page_numbers):
        current_sentence = first_sentences[page]
        prev_page = str(int(page) - 1)
        next_sentence = (
            first_sentences[str(int(page) + 1)]
            if i < len(page_numbers) - 1
            else None
        )

        try:
            start_index = script_content_processed.find(current_sentence)
            if start_index == -1:
                start_index = find_best_match(script_content_processed, current_sentence)
            if i == len(page_numbers) - 1:  # Last page
                end_index = len(script_content)
            else:
                end_index = script_content_processed.find(next_sentence)
                if end_index == -1:
                    end_index = find_best_match(script_content_processed, next_sentence)
            
            # if start_index > end_index:
            #     start_index, end_index = end_index, start_index
            #     matched_paragraphs[prev_page] = script_content[prev_start_index:start_index].strip()
            #     print(prev_page, prev_start_index, start_index)
            
            print(page, start_index, end_index)

            matched_paragraphs[page] = script_content[start_index:end_index].strip()
            prev_start_index = start_index

        except Exception as e:
            print(f"Error occurred at page {page}: {str(e)}")

    return matched_paragraphs


project_id = 80
# Read the script content
script_path = os.path.join(SCRIPT, f"10_transcription.json")
script_content = read_script(script_path)

# Load the first sentences
first_sentences_path = os.path.join(SPM, f"{project_id}_spm.json")
first_sentences = load_first_sentences(first_sentences_path)

print(first_sentences)

# Match the paragraphs
matched_paragraphs = match_paragraphs(script_content, first_sentences)

# Save the matched paragraphs to a JSON file
matched_paragraphs_json_path = os.path.join(SPM, f"{project_id}_matched_paragraphs.json")
with open(matched_paragraphs_json_path, "w") as json_file:
    json.dump(matched_paragraphs, json_file, indent=4)

print(f"Matched paragraphs saved to {matched_paragraphs_json_path}")


{'1': 'Okay, so welcome to Lecture 2 of CS231N.', '2': 'One is Piazza, so I saw, when I checked yesterday,', '3': 'Assignment one will be up later today, probably sometime this afternoon,', '4': "So in this assignment, you'll be implementing your own k-nearest neighbor classifier,", '5': "The other thing I wanted to talk about is that we're happy to announce that we got,", '6': 'So the last lecture we talked a little bit about this task of image classification,', '7': 'And the job of the computer is to look at the picture and assign it one of these fixed category labels.', '8': 'So for example, if we took this same cat, and if the cat happened to sit still and not even twitch,', '9': 'Objects can also deform.', '10': 'I think cats are maybe among the more deformable of animals that you might see out there.', '11': 'There can also be problems of occlusion, where you might only see part of a cat, like just the face,', '12': "There's also this problem of inter-class variation, that this o

In [60]:
matched_paragraphs

{'1': "Okay, so welcome to Lecture 2 of CS231N. On Tuesday, we, just recall, we sort of gave you the big picture view of what is computer vision, what is the history, and a little bit of the overview of the class. And today, we're really going to dive in for the first time into the details, and we'll start to see in much more depth exactly how some of these learning algorithms actually work in practice. So the first lecture of the class is probably the sort of the largest big picture vision, and the majority of the lectures in this class will be much more detail-oriented, and much more focused on the specific mechanics of these different algorithms. So today, we'll see our first learning algorithm, and that'll be really exciting, I think. But before we get to that, I wanted to talk about a couple administrative issues.",
 '2': "One is Piazza, so I saw, when I checked yesterday, it seemed like we had maybe 500 students signed up on Piazza, which means that there are several hundred of y

In [70]:
import os
import json

SCRIPT = '../scripts'
SPM = '../spms'

# Load the first sentences JSON file
def load_first_sentences(json_path):
    with open(json_path, "r") as json_file:
        return json.load(json_file)

# Read script file
def read_script(script_path):
    with open(script_path, "r") as script_file:
        script_content = script_file.read()
        return json.loads(script_content)

def find_best_match(script_content, sentence):
    # Initialize the SequenceMatcher with the script content and the sentence
    matcher = SequenceMatcher(None, script_content, sentence)
    match = matcher.find_longest_match(0, len(script_content), 0, len(sentence))
    
    if match.size > 0:
        return match.a  # Start index of the match in the script content
    else:
        return -1  # No match found

# Function to find start indices
def find_start_indices(script_content, first_sentences):
    start_indices = {}
    page_numbers = sorted(first_sentences.keys(), key=int)

    for page in page_numbers:
        current_sentence = first_sentences[page]
        start_index = script_content.find(current_sentence)
        if start_index == -1:
            start_index = find_best_match(script_content, current_sentence)
        start_indices[page] = start_index
    
    return start_indices

# Match the paragraphs to the first sentences
def match_paragraphs(script_content, first_sentences):
    # Step 1: Find all start indices
    start_indices = find_start_indices(script_content, first_sentences)
    
    # Step 2: Sort the pages by start index
    sorted_pages = sorted(start_indices, key=lambda page: start_indices[page])
    print(sorted_pages)
    # Step 3: Create the matched paragraphs
    matched_paragraphs = {}
    for i, page in enumerate(sorted_pages):
        start_index = start_indices[page]
        if i < len(sorted_pages) - 1:
            next_page = sorted_pages[i + 1]
            end_index = start_indices[next_page]
        else:
            end_index = len(script_content)  # Last page
        
        matched_paragraphs[str(i+1)] = script_content[start_index:end_index].strip()

        # print(page, start_index, end_index)
        print(str(i+1), start_index, end_index)
 

    return matched_paragraphs

project_id = 11
# Read the script content
script_path = os.path.join(SCRIPT, f"{project_id}_transcription.json")
script_content = read_script(script_path)

# Load the first sentences
first_sentences_path = os.path.join(SPM, f"{project_id}_spm.json")
first_sentences = load_first_sentences(first_sentences_path)

print(first_sentences)

# Match the paragraphs
matched_paragraphs = match_paragraphs(script_content, first_sentences)

# Save the matched paragraphs to a JSON file
matched_paragraphs_json_path = os.path.join(SPM, f"{project_id}_matched_paragraphs.json")
with open(matched_paragraphs_json_path, "w") as json_file:
    json.dump(matched_paragraphs, json_file, indent=4)

print(f"Matched paragraphs saved to {matched_paragraphs_json_path}")


{'1': 'Okay, so welcome to Lecture 2 of CS231N.', '2': 'One is Piazza, so I saw, when I checked yesterday, it seemed like we had maybe 500 students signed up on Piazza, which means that there are several hundred of you who are not yet there.', '3': 'The next administrative issue is about assignment one.', '4': "So in this assignment, you'll be implementing your own k-nearest neighbor classifier, which we're going to talk about in this lecture.", '5': 'But this is actually pretty important.', '6': "And you'll get a lot of practice with this on the first assignment.", '7': 'So Google Cloud is somewhat similar to Amazon AWS.', '8': "And because Google has very generously supported this course, we'll be able to distribute to each of you coupons that let you use Google Cloud credits for free for the class.", '9': 'Yeah, so those are kind of the major administrative issues I wanted to talk about today.', '10': 'So the last lecture we talked a little bit about this task of image classificatio

In [97]:
import os
import json
from difflib import SequenceMatcher

SCRIPT = '../scripts'
SPM = '../spms'

# Load the first sentences JSON file
def load_first_sentences(json_path):
    with open(json_path, "r") as json_file:
        return json.load(json_file)

# Read script file
def read_script(script_path):
    with open(script_path, "r") as script_file:
        script_content = script_file.read()
        return json.loads(script_content)

# Function to remove the first word from a sentence
def remove_first_word(sentence):
    words = sentence.split()
    return ' '.join(words[1:])

# Function to find the best match using difflib
def find_best_match(script_content, sentence, min_index=0):
    matcher = SequenceMatcher(None, script_content, sentence)
    match = matcher.find_longest_match(min_index, len(script_content), 0, len(sentence))
    if match.size > 0:
        return match.a  # Return the start index of the match
    return -1  # No match found

# Function to find start indices with a fallback strategy
def find_start_indices(script_content, first_sentences):
    start_indices = {}
    page_numbers = sorted(first_sentences.keys(), key=int)
    last_index = 0  # Keep track of the last found index to ensure we find subsequent matches after this

    script_content_lower = script_content.lower()

    for page in page_numbers:
        current_sentence = first_sentences[page]
        current_sentence_lower = current_sentence.lower()

        # Attempt 1: Direct search after last_index
        start_index = script_content_lower.find(current_sentence_lower, last_index)
        
        # Attempt 2: Search after removing the first word
        if start_index == -1:
            modified_sentence = remove_first_word(current_sentence_lower)
            start_index = script_content_lower.find(modified_sentence, last_index)
        
        # Attempt 3: Fallback to best match if still not found
        if start_index == -1:
            start_index = find_best_match(script_content_lower, current_sentence_lower, min_index=last_index)
        
        
        start_indices[page] = start_index
        last_index = start_index  # Update last_index to the current start_index
    
    return start_indices

# Match the paragraphs to the first sentences
def match_paragraphs(script_content, first_sentences):
    # Step 1: Find all start indices
    start_indices = find_start_indices(script_content, first_sentences)
    print(start_indices)
    # Step 2: Sort the pages by start index
    sorted_pages = sorted(start_indices, key=lambda page: start_indices[page])
    print(sorted_pages)
    # Step 3: Create the matched paragraphs
    matched_paragraphs = {}
    for i, page in enumerate(sorted_pages):
        start_index = start_indices[page]
        if i < len(sorted_pages) - 1:
            next_page = sorted_pages[i + 1]
            end_index = start_indices[next_page]
        else:
            end_index = len(script_content)  # Last page
        
        matched_paragraphs[str(i+1)] = script_content[start_index:end_index].strip()

        # print(page, start_index, end_index)
        print(str(i+1), start_index, end_index)
 

    return matched_paragraphs

project_id = 11
# Read the script content
script_path = os.path.join(SCRIPT, f"{project_id}_transcription.json")
script_content = read_script(script_path)

# Load the first sentences
first_sentences_path = os.path.join(SPM, f"{project_id}_spm.json")
first_sentences = load_first_sentences(first_sentences_path)

# Match the paragraphs
matched_paragraphs = match_paragraphs(script_content, first_sentences)

# Save the matched paragraphs to a JSON file with sorted keys
matched_paragraphs_json_path = os.path.join(SPM, f"{project_id}_matched_paragraphs.json")
with open(matched_paragraphs_json_path, "w") as json_file:
    json.dump(matched_paragraphs, json_file, indent=4)

print(f"Matched paragraphs saved to {matched_paragraphs_json_path}")


{'1': 0, '2': 824, '3': 1940, '4': 2553, '5': 3098, '6': 3458, '7': 3912, '8': 4352, '9': 5016, '10': 5143, '11': 5872, '12': 6724, '13': 7158, '14': 8107, '15': 8744, '16': 9460, '17': 9462, '18': 9778, '19': 11090, '20': 12185, '21': 12940, '22': 13329, '23': 14188, '24': 14647, '25': 15375, '26': 17021, '27': 17837, '28': 18444, '29': 21104, '30': 23840}
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30']
1 0 824
2 824 1940
3 1940 2553
4 2553 3098
5 3098 3458
6 3458 3912
7 3912 4352
8 4352 5016
9 5016 5143
10 5143 5872
11 5872 6724
12 6724 7158
13 7158 8107
14 8107 8744
15 8744 9460
16 9460 9462
17 9462 9778
18 9778 11090
19 11090 12185
20 12185 12940
21 12940 13329
22 13329 14188
23 14188 14647
24 14647 15375
25 15375 17021
26 17021 17837
27 17837 18444
28 18444 21104
29 21104 23840
30 23840 25062
Matched paragraphs saved to ../spms/11_matched_paragraphs.json


In [95]:
matched_paragraphs

{'1': "Okay, so welcome to Lecture 2 of CS231N. On Tuesday, we, just recall, we sort of gave you the big picture view of what is computer vision, what is the history, and a little bit of the overview of the class. And today, we're really going to dive in for the first time into the details, and we'll start to see in much more depth exactly how some of these learning algorithms actually work in practice. So the first lecture of the class is probably the sort of the largest big picture vision, and the majority of the lectures in this class will be much more detail-oriented, and much more focused on the specific mechanics of these different algorithms. So today, we'll see our first learning algorithm, and that'll be really exciting, I think. But before we get to that, I wanted to talk about a couple administrative issues.",
 '2': "One is Piazza, so I saw, when I checked yesterday, it seemed like we had maybe 500 students signed up on Piazza, which means that there are several hundred of y

In [90]:
import os
import json
import re
from difflib import SequenceMatcher

SCRIPT = '../scripts'
SPM = '../spms'

# Load the first sentences JSON file
def load_first_sentences(json_path):
    with open(json_path, "r") as json_file:
        return json.load(json_file)

# Read script file
def read_script(script_path):
    with open(script_path, "r") as script_file:
        script_content = script_file.read()
        return json.loads(script_content)

# Function to remove punctuation and lowercase the text
def preprocess_text(text):
    return re.sub(r'[^\w\s]', '', text).lower()

# Function to remove the first word from a sentence
def remove_first_word(sentence):
    words = sentence.split()
    return ' '.join(words[1:])

# Function to find the best match using difflib
def find_best_match(script_content, sentence, min_index=0):
    matcher = SequenceMatcher(None, script_content, sentence)
    match = matcher.find_longest_match(min_index, len(script_content), 0, len(sentence))
    if match.size > 0:
        return match.a  # Return the start index of the match
    return -1  # No match found

# Function to find start indices with a fallback strategy
def find_start_indices(script_content, first_sentences):
    start_indices = {}
    page_numbers = sorted(first_sentences.keys(), key=int)
    last_index = 0  # Keep track of the last found index to ensure we find subsequent matches after this

    script_content_processed = preprocess_text(script_content)

    print(script_content_processed)

    for page in page_numbers:
        current_sentence = first_sentences[page]
        current_sentence_processed = preprocess_text(current_sentence)

        print(current_sentence_processed)
        
        # Attempt 1: Direct search after last_index
        start_index = script_content_processed.find(current_sentence_processed, last_index)
        
        # Attempt 2: Search after removing the first word
        if start_index == -1:
            modified_sentence = remove_first_word(current_sentence_processed)
            start_index = script_content_processed.find(modified_sentence, last_index)
        
        # Attempt 3: Fallback to best match if still not found
        if start_index == -1:
            start_index = find_best_match(script_content_processed, current_sentence_processed, min_index=last_index)
        
        # # Ensure start_index is greater than last_index
        # if start_index <= last_index:
        #     start_index = last_index + 1
        
        start_indices[page] = start_index
        last_index = start_index  # Update last_index to the current start_index
    
    return start_indices

# Match the paragraphs to the first sentences
def match_paragraphs(script_content, first_sentences):
    # Step 1: Find all start indices
    start_indices = find_start_indices(script_content, first_sentences)
    
    # Step 2: Sort the pages by start index
    sorted_pages = sorted(start_indices, key=lambda page: start_indices[page])
    
    # Step 3: Create the matched paragraphs
    matched_paragraphs = {}
    for i, page in enumerate(sorted_pages):
        start_index = start_indices[page]
        if i < len(sorted_pages) - 1:
            next_page = sorted_pages[i + 1]
            end_index = start_indices[next_page]
        else:
            end_index = len(script_content)  # Last page
        
        matched_paragraphs[str(i+1)] = script_content[start_index:end_index].strip()

        print(str(i+1), start_index, end_index)

    return matched_paragraphs

project_id = 11
# Read the script content
script_path = os.path.join(SCRIPT, f"{project_id}_transcription.json")
script_content = read_script(script_path)

# Load the first sentences
first_sentences_path = os.path.join(SPM, f"{project_id}_spm.json")
first_sentences = load_first_sentences(first_sentences_path)

# Match the paragraphs
matched_paragraphs = match_paragraphs(script_content, first_sentences)

# Save the matched paragraphs to a JSON file with sorted keys
matched_paragraphs_json_path = os.path.join(SPM, f"{project_id}_matched_paragraphs.json")
with open(matched_paragraphs_json_path, "w") as json_file:
    json.dump(matched_paragraphs, json_file, indent=4, sort_keys=True)

print(f"Matched paragraphs saved to {matched_paragraphs_json_path}")


okay so welcome to lecture 2 of cs231n on tuesday we just recall we sort of gave you the big picture view of what is computer vision what is the history and a little bit of the overview of the class and today were really going to dive in for the first time into the details and well start to see in much more depth exactly how some of these learning algorithms actually work in practice so the first lecture of the class is probably the sort of the largest big picture vision and the majority of the lectures in this class will be much more detailoriented and much more focused on the specific mechanics of these different algorithms so today well see our first learning algorithm and thatll be really exciting i think but before we get to that i wanted to talk about a couple administrative issues one is piazza so i saw when i checked yesterday it seemed like we had maybe 500 students signed up on piazza which means that there are several hundred of you who are not yet there so we really want pi

In [93]:
import os
import json
import re
from difflib import SequenceMatcher

SCRIPT = '../scripts'
SPM = '../spms'

# Load the first sentences JSON file
def load_first_sentences(json_path):
    with open(json_path, "r") as json_file:
        return json.load(json_file)

# Read script file
def read_script(script_path):
    with open(script_path, "r") as script_file:
        script_content = script_file.read()
        return json.loads(script_content)

# Function to remove punctuation and lowercase the text
def preprocess_text_and_map(text):
    original_to_processed_map = []
    processed_text = []
    for i, char in enumerate(text):
        if char.isalnum() or char.isspace():
            processed_text.append(char.lower())
            original_to_processed_map.append(i)
    return ''.join(processed_text), original_to_processed_map

# Function to remove the first word from a sentence
def remove_first_word(sentence):
    words = sentence.split()
    return ' '.join(words[1:])

# Function to find the best match using difflib
def find_best_match(script_content, sentence, min_index=0):
    matcher = SequenceMatcher(None, script_content, sentence)
    match = matcher.find_longest_match(min_index, len(script_content), 0, len(sentence))
    if match.size > 0:
        return match.a  # Return the start index of the match
    return -1  # No match found

# Function to find start indices with a fallback strategy
def find_start_indices(script_content, first_sentences):
    start_indices = {}
    page_numbers = sorted(first_sentences.keys(), key=int)
    last_index = 0  # Keep track of the last found index to ensure we find subsequent matches after this

    # Preprocess script content and create a mapping to the original text
    script_content_processed, original_to_processed_map = preprocess_text_and_map(script_content)

    for page in page_numbers:
        current_sentence = first_sentences[page]
        current_sentence_processed, _ = preprocess_text_and_map(current_sentence)
        
        # Attempt 1: Direct search after last_index
        start_index_processed = script_content_processed.find(current_sentence_processed, last_index)
        
        # Attempt 2: Search after removing the first word
        if start_index_processed == -1:
            modified_sentence = remove_first_word(current_sentence_processed)
            start_index_processed = script_content_processed.find(modified_sentence, last_index)
        
        # Attempt 3: Fallback to best match if still not found
        if start_index_processed == -1:
            start_index_processed = find_best_match(script_content_processed, current_sentence_processed, min_index=last_index)
        
        # Ensure start_index is greater than last_index
        if start_index_processed <= last_index:
            start_index_processed = last_index + 1
        
        # Convert the processed text index back to the original text index
        if start_index_processed < len(original_to_processed_map):
            start_index = original_to_processed_map[start_index_processed]
        else:
            start_index = last_index 

        start_indices[page] = start_index
        last_index = start_index  # Update last_index to the current start_index
    
    return start_indices

# Match the paragraphs to the first sentences
def match_paragraphs(script_content, first_sentences):
    # Step 1: Find all start indices
    start_indices = find_start_indices(script_content, first_sentences)
    print(start_indices)
    
    # Step 2: Sort the pages by start index
    sorted_pages = sorted(start_indices, key=lambda page: start_indices[page])
    print(sorted_pages)
    
    # Step 3: Create the matched paragraphs
    matched_paragraphs = {}
    for i, page in enumerate(sorted_pages):
        start_index = start_indices[page]
        if i < len(sorted_pages) - 1:
            next_page = sorted_pages[i + 1]
            end_index = start_indices[next_page]
        else:
            end_index = len(script_content)  # Last page
        
        matched_paragraphs[str(i+1)] = script_content[start_index:end_index].strip()

        print(str(i+1), start_index, end_index)

    return matched_paragraphs

project_id = 11
# Read the script content
script_path = os.path.join(SCRIPT, f"{project_id}_transcription.json")
script_content = read_script(script_path)

# Load the first sentences
first_sentences_path = os.path.join(SPM, f"{project_id}_spm.json")
first_sentences = load_first_sentences(first_sentences_path)

# Match the paragraphs
matched_paragraphs = match_paragraphs(script_content, first_sentences)

# Save the matched paragraphs to a JSON file with sorted keys
matched_paragraphs_json_path = os.path.join(SPM, f"{project_id}_matched_paragraphs.json")
with open(matched_paragraphs_json_path, "w") as json_file:
    json.dump(matched_paragraphs, json_file, indent=4, sort_keys=True)

print(f"Matched paragraphs saved to {matched_paragraphs_json_path}")


{'1': 1, '2': 824, '3': 1940, '4': 2553, '5': 3098, '6': 3458, '7': 3912, '8': 4352, '9': 5016, '10': 5146, '11': 5872, '12': 6724, '13': 7158, '14': 8107, '15': 8744, '16': 9460, '17': 9700, '18': 20638, '19': 21163, '20': 21704, '21': 22258, '22': 24580, '23': 24580, '24': 24580, '25': 24580, '26': 24580, '27': 24580, '28': 24580, '29': 24580, '30': 24580}
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30']
1 1 824
2 824 1940
3 1940 2553
4 2553 3098
5 3098 3458
6 3458 3912
7 3912 4352
8 4352 5016
9 5016 5146
10 5146 5872
11 5872 6724
12 6724 7158
13 7158 8107
14 8107 8744
15 8744 9460
16 9460 9700
17 9700 20638
18 20638 21163
19 21163 21704
20 21704 22258
21 22258 24580
22 24580 24580
23 24580 24580
24 24580 24580
25 24580 24580
26 24580 24580
27 24580 24580
28 24580 24580
29 24580 24580
30 24580 25062
Matched paragraphs saved to ../spms/11_matched_paragraphs.json


In [53]:
# Load the JSON data from the files
project_id = 11
with open(os.path.join(SCRIPT, 'test_word_gpt.json'), 'r') as file:
    word_times = json.load(file)

with open(os.path.join(SCRIPT, 'test_segment_gpt.json'), 'r') as file:
    segment_times = json.load(file)

with open(os.path.join(SPM, f"{project_id}_matched_paragraphs.json"), 'r') as file:
    paragraphs = json.load(file)

def get_script_times(script_text, word_timestamp):
    # Remove punctuation from the script_text and split into words

    words = re.findall(r"\b[\w\']+\b", script_text.lower())

    start_time = None
    end_time = None

    if len(words) >= 5:
        for i in range(len(word_timestamp) - 4):
            if (
                word_timestamp[i]["word"].lower() == words[0]
                and word_timestamp[i + 1]["word"].lower() == words[1]
                and word_timestamp[i + 2]["word"].lower() == words[2]
                and word_timestamp[i + 3]["word"].lower() == words[3]
                and word_timestamp[i + 4]["word"].lower() == words[4]
            ):
                # Set start time from the first word
                start_time = word_timestamp[i]["start"]

                # Find end time from the last word in words
                for j in range(i + 4, len(word_timestamp)):
                    if word_timestamp[j]["word"].lower() == words[-1]:
                        end_time = word_timestamp[j]["end"]
                        break
                break

    return start_time, end_time

def get_script_times_by_segment(script_text, segment_timestamp, start_time):
    script_text_without_punctuation = re.sub(r'[^\w\s]', '', script_text.lower().strip())
    print(script_text_without_punctuation)

    # start_time = None

    for segment in segment_timestamp:
        segment_text = re.sub(r'[^\w\s]', '',segment["text"].lower().strip())
        print(segment_text)
        if segment_text in script_text_without_punctuation:
            if segment["start"] > start_time:
                start_time = segment["start"] 
            break

    return start_time

# Create the structured output
output = {}
offset = 0
start_time = 0
for para_id, paragraph_text in paragraphs.items():
    # start_time, end_time = get_script_times(paragraph_text, word_times)
    words = paragraph_text.split()
    start_time = word_times[offset]["start"]
    end_time = word_times[offset + len(words) - 1]["end"] if int(para_id) < len(paragraphs) else word_times[-1]["end"]
    # if int(para_id) > 1:
    #     start_time = get_script_times_by_segment(paragraph_text, segment_times, start_time)
    
    # output[para_id] = {}
    # output[para_id]["start"] = start_time
    # if int(para_id) > 1:
    #     if int(para_id) == len(paragraphs):
    #         output[para_id]["end"] = segment_times[-1]["end"]
    #     output[str(int(para_id)-1)]["end"] = start_time 
    # output[para_id]["script"] = paragraph_text
    output[para_id] = {
        "start": start_time,
        "end": end_time,
        "script": paragraph_text
    }
    
    offset += len(words) 

# Save the output to a JSON file
with open(os.path.join(SPM, "test_page_info.json"), 'w') as file:
    json.dump(output, file, indent=4)

# Print the output (optional)
print(json.dumps(output, indent=4))

print(offset)
print(len(word_times))


{
    "1": {
        "start": 0.0,
        "end": 40.0,
        "script": "Okay, so welcome to Lecture 2 of CS231N. On Tuesday, we, just recall, we sort of gave you the big picture view of what is computer vision, what is the history, and a little bit of the overview of the class. And today, we're really going to dive in for the first time into the details, and we'll start to see in much more depth exactly how some of these learning algorithms actually work in practice. So the first lecture of the class is probably the sort of the largest big picture vision, and the majority of the lectures in this class will be much more detail-oriented, and much more focused on the specific mechanics of these different algorithms. So today, we'll see our first learning algorithm, and that'll be really exciting, I think. But before we get to that, I wanted to talk about a couple administrative issues."
    },
    "2": {
        "start": 40.47999954223633,
        "end": 95.62000274658203,
        "scr

In [42]:
words = "Okay, so welcome to Lecture 2 of CS231N. On Tuesday, we, just recall, we sort of gave you the big picture view of what is computer vision, what is the history, and a little bit of the overview of the class. And today, we're really going to dive in for the first time into the details, and we'll start to see in much more depth exactly how some of these learning algorithms actually work in practice. So the first lecture of the class is probably the sort of the largest big picture vision, and the majority of the lectures in this class will be much more detail-oriented, and much more focused on the specific mechanics of these different algorithms. So today, we'll see our first learning algorithm, and that'll be really exciting, I think. But before we get to that, I wanted to talk about a couple administrative issues."

words = re.findall(r"\b[\w\']+\b", words.lower())

for word in words:
    print(word)


okay
so
welcome
to
lecture
2
of
cs231n
on
tuesday
we
just
recall
we
sort
of
gave
you
the
big
picture
view
of
what
is
computer
vision
what
is
the
history
and
a
little
bit
of
the
overview
of
the
class
and
today
we're
really
going
to
dive
in
for
the
first
time
into
the
details
and
we'll
start
to
see
in
much
more
depth
exactly
how
some
of
these
learning
algorithms
actually
work
in
practice
so
the
first
lecture
of
the
class
is
probably
the
sort
of
the
largest
big
picture
vision
and
the
majority
of
the
lectures
in
this
class
will
be
much
more
detail
oriented
and
much
more
focused
on
the
specific
mechanics
of
these
different
algorithms
so
today
we'll
see
our
first
learning
algorithm
and
that'll
be
really
exciting
i
think
but
before
we
get
to
that
i
wanted
to
talk
about
a
couple
administrative
issues


In [54]:
with open(os.path.join(SPM, f"10_page_info.json"), 'r') as file:
    page_info = json.load(file)

with open(os.path.join(SPM, f"test_page_info.json"), 'r') as file:
    update_info = json.load(file)

for page_id, page_data in page_info["pages"].items():
    page_data["start"] = update_info[page_id]["start"]
    page_data["end"] = update_info[page_id]["end"]
    page_data["script"] = update_info[page_id]["script"]

with open(os.path.join(SPM, f"10_page_info.json"), 'w') as file:
    json.dump(page_info, file, indent=4)




In [29]:
project_id = 11
image_directory = os.path.join(IMAGE, f"{str(project_id)}")
script_path = os.path.join(SCRIPT, f"{project_id}_transcription.json")

image_paths = sorted([os.path.join(image_directory, f) for f in os.listdir(image_directory) if f.lower().endswith('.png')])

# print(image_paths)
encoded_images = [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(image)}"}} for image in image_paths]

matched_paragraphs_path = os.path.join(SPM, f"{project_id}_matched_paragraphs.json")
with open(matched_paragraphs_path, 'r') as file:
    matched_paragraphs = json.load(file)


In [31]:
GPT_MODEL = "gpt-4o"
gpt_api_key = "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"

def filter_script_data(script_data):
    allowed_keys = {"keyword", "formal"}
    return {key: value for key, value in script_data.items() if key in allowed_keys}

def bbox_api_request(script_segment, encoded_image):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {gpt_api_key}",
    }

    content = [
        {
            "type": "text",
            "text": (
                # "Given the following lecture notes image and the corresponding lecture script, "
                # "please provide bounding box information for each relevant script sentence. "
                "Tell me specifically where each sentence in the script describes in the image. "
                "If there is a corresponding part on the image, please tell me the bounding box information of that area. "
                "The output should be in JSON format with the structure: "
                '{"bboxes": [{"script": "string", "bbox": [x, y, w, h]}]} '
                "Ensure that the value for script in the JSON response should be a sentence from the provided script, and the value must never be text extracted from the image. "
                f"script: {script_segment} "
                # "The bounding box is given in the form [x,y,w,h]. x and y represent the center position of the bounding box, "
                # "while w and h represent the width and height of the bounding box, respectively. "
                # "The coordinates are based on the top-left corner of the image being (0,0), with the x direction being vertical and the y direction being horizontal. "
                # "If a sentence in the script is highly relevant to a specific part of the image, "
                # "use the sentence as the key and provide the bounding box information of the specific part of the image as the value. "
                # "Only find bounding boxes for the sentences that are highly relevant to specific parts of the image. "
                # "The original format of the script, including uppercase and lowercase letters, punctuation marks such as periods and commas, must be preserved without any alterations."
            ),
        },
        # {"type": "text", "text": script_segment},
        encoded_image
    ]

    payload = {
        "model": GPT_MODEL,
        "response_format": {"type": "json_object"},
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant designed to output JSON.",
            },
            {"role": "user", "content": content},
        ],
        "max_tokens": 2000,
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
    )
    return response.json()

def keyword_api_request(script_segment):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {gpt_api_key}",
    }
    content = [
        {
            "type": "text",
            "text": (
                "Given the lecture script, identify at least one important keywords. "
                "Next, transform the script into a more formal tone, breaking it down into a bullet point structure where appropriate. "
                "The output should be in JSON format with the following structure: "
                '{"keyword": ["string", "string", ...], "formal": "string"} '
                f"lecture script: {script_segment} "
            ),
        },
    ]

    payload = {
        "model": "gpt-4o",
        "response_format": {"type": "json_object"},
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant designed to output JSON.",
            },
            {"role": "user", "content": content},
        ],
        "max_tokens": 2000,
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions", headers=headers, json=payload
    )
    return response.json()

BBOX = '../bboxs'
KEYWORD = '../keywords'
project_id = 11
bbox_dir = os.path.join(BBOX, str(project_id))
keyword_dir = os.path.join(KEYWORD, str(project_id))

for i, encoded_image in enumerate(encoded_images):
    page_number = str(i + 1)
    script_segment = matched_paragraphs[page_number]
    response_data_bbox = bbox_api_request(script_segment, encoded_image)

    # Process the response data for bbox
    if "choices" in response_data_bbox and len(response_data_bbox["choices"]) > 0:
        script_text = response_data_bbox["choices"][0]["message"]["content"]
        # Convert the script text to JSON format
        try:
            script_data = json.loads(script_text)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for page {page_number}: {e}")
            script_data = {"error": "Failed to decode JSON"}
    else:
        print(
            f"Error: 'choices' key not found in the response for page {page_number}"
        )
        script_data = {"error": "Failed to retrieve scripts"}
    # Save the script data as a JSON file
    bbox_path = os.path.join(bbox_dir, f"{page_number}_spm.json")
    with open(bbox_path, "w") as json_file:
        json.dump(script_data, json_file, indent=4)

    response_data_keyword = keyword_api_request(script_segment)

    # Process the response data for keyword
    if (
        "choices" in response_data_keyword
        and len(response_data_keyword["choices"]) > 0
    ):
        script_text = response_data_keyword["choices"][0]["message"]["content"]

        # Convert the script text to JSON format
        try:
            script_data = json.loads(script_text)
            script_data = filter_script_data(script_data)
            script_data["original"] = script_segment
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for page {page_number}: {e}")
            script_data = {"error": "Failed to decode JSON"}
    else:
        print(
            f"Error: 'choices' key not found in the response for page {page_number}"
        )
        script_data = {"error": "Failed to retrieve scripts"}

    keyword_path = os.path.join(keyword_dir, f"{page_number}_spm.json")
    with open(keyword_path, "w") as json_file:
        json.dump(script_data, json_file, indent=4)


In [34]:
SCRIPT = '../scripts'   
def timestamp_for_bbox(project_id, word_timestamp):
    for page_num in range(1, len(os.listdir(os.path.join(BBOX, str(project_id)))) + 1):
        bbox_path = os.path.join(BBOX, str(project_id), f"{page_num}_spm.json")

        # Load the bbox data for the current page
        with open(bbox_path, "r") as file:
            bboxes = json.load(file)

        updated_bboxes = []
        for item in bboxes["bboxes"]:
            bbox = item["bbox"]
            
            if bbox and isinstance(bbox[0], list):
                bbox = bbox[0]    
            if not bbox or bbox[2] == 0 or bbox[3] == 0:
                continue

            start_time, end_time = get_script_times(item["script"], word_timestamp)
            if start_time is not None:
                item["start"] = start_time
                item["end"] = end_time
                updated_bboxes.append(item)

        # Save the updated data back to the JSON file
        bboxes["bboxes"] = updated_bboxes
        with open(bbox_path, "w") as file:
            json.dump(bboxes, file, indent=4)


with open(os.path.join(SCRIPT, f"{project_id}_timestamp.json"), "r") as file:
    word_timestamp = json.load(file)
    

timestamp_for_bbox(project_id, word_timestamp)


In [4]:
import os
IMAGE = '../images'
project_id = 10
image_directory = os.path.join(IMAGE, str(project_id), 'raw')

image_paths = sorted(
        [
            os.path.join(image_directory, f)
            for f in os.listdir(image_directory)
            if f.lower().endswith(".png")
        ]
    )
print(image_paths)


['../images/10/raw/page_0001.png', '../images/10/raw/page_0002.png', '../images/10/raw/page_0003.png', '../images/10/raw/page_0004.png', '../images/10/raw/page_0005.png', '../images/10/raw/page_0006.png', '../images/10/raw/page_0007.png', '../images/10/raw/page_0008.png', '../images/10/raw/page_0009.png', '../images/10/raw/page_0010.png', '../images/10/raw/page_0011.png', '../images/10/raw/page_0012.png', '../images/10/raw/page_0013.png', '../images/10/raw/page_0014.png', '../images/10/raw/page_0015.png', '../images/10/raw/page_0016.png', '../images/10/raw/page_0017.png', '../images/10/raw/page_0018.png', '../images/10/raw/page_0019.png', '../images/10/raw/page_0020.png', '../images/10/raw/page_0021.png', '../images/10/raw/page_0022.png', '../images/10/raw/page_0023.png', '../images/10/raw/page_0024.png', '../images/10/raw/page_0025.png', '../images/10/raw/page_0026.png', '../images/10/raw/page_0027.png', '../images/10/raw/page_0028.png', '../images/10/raw/page_0029.png', '../images/10

In [57]:
def fill_missing_pages(toc_data, total_pages):
    """
    누락된 페이지를 확인하고, 적절한 위치에 포함시키는 함수.

    :param toc_data: 생성된 TOC 데이터 (JSON 형식)
    :param total_pages: PDF 파일의 전체 페이지 수
    :return: 누락된 페이지가 포함된 TOC 데이터
    """

    # 모든 페이지 번호를 추출
    included_pages = set()
    for section in toc_data["table_of_contents"]:
        for subsection in section["subsections"]:
            included_pages.update(subsection["page"])

    # 누락된 페이지를 찾기
    missing_pages = sorted(set(range(1, total_pages + 1)) - included_pages)

    # 첫 페이지와 마지막 페이지 처리
    if 1 in missing_pages:
        toc_data["table_of_contents"][0]["subsections"][0]["page"].insert(0, 1)
        missing_pages.remove(1)

    if total_pages in missing_pages:
        toc_data["table_of_contents"][-1]["subsections"][-1]["page"].append(total_pages)
        missing_pages.remove(total_pages)

    # 나머지 누락된 페이지를 적절한 위치에 포함시키기
    for missing_page in missing_pages:
        placed = False

        # 각 섹션 및 하위 섹션 간에서 누락된 페이지를 삽입할 위치 찾기
        for section in toc_data["table_of_contents"]:
            subsections = section["subsections"]

            for j in range(len(subsections)):
                current_pages = subsections[j]["page"]

                if j < len(subsections) - 1:
                    next_pages = subsections[j + 1]["page"]

                    # 현재 subsection과 다음 subsection 사이에 누락된 페이지가 있는 경우
                    if current_pages[-1] < missing_page < next_pages[0]:
                        if len(current_pages) <= len(next_pages):
                            current_pages.append(missing_page)
                            current_pages.sort()
                        else:
                            next_pages.insert(0, missing_page)
                        placed = True
                        break

            if placed:
                break

        # main section 간에 누락된 페이지가 있는지 확인
        if not placed:
            for i in range(len(toc_data["table_of_contents"]) - 1):
                current_section = toc_data["table_of_contents"][i]
                next_section = toc_data["table_of_contents"][i + 1]

                current_pages = current_section["subsections"][-1]["page"]
                next_pages = next_section["subsections"][0]["page"]

                if current_pages[-1] < missing_page < next_pages[0]:
                    if len(current_pages) <= len(next_pages):
                        current_pages.append(missing_page)
                        current_pages.sort()
                    else:
                        next_pages.insert(0, missing_page)
                    placed = True
                    break

    return toc_data


In [59]:
TOC = '../tocs'

with open(os.path.join(TOC, f"113_toc.json"), "r") as file:
    toc_data = json.load(file)

total_pages = 30
updated_toc_data = fill_missing_pages(toc_data, total_pages)

with open(os.path.join(TOC, f"113_toc_updated.json"), "w") as file:
    json.dump(updated_toc_data, file, indent=4)