In [36]:
import mimetypes
from pathlib import Path

import fitz
from openai import OpenAI
from tqdm import tqdm

from utils import encode_image_to_base64

In [37]:

def analyze_image_gpt4o(prompt, image_path):
    """
    Analyze an image using the GPT-4o model and return a description.

    The image is sent inline as a base64 data URL. The MIME type in that URL
    is derived from the file extension (e.g. ``image/png`` for ``.png``) and
    falls back to ``image/jpeg`` when the extension is not a recognised image
    type.

    :param prompt: The text prompt for the model
    :param image_path: Local path to the image file
    :return: Model-generated description, or a string starting with
        ``"Error: "`` if the API request raises
    """
    client = OpenAI()

    # Encode the image. This happens before the try block, so an encoding
    # failure propagates to the caller instead of becoming an "Error: ..." string.
    base64_image = encode_image_to_base64(image_path)

    # Label the payload with its real type instead of always claiming JPEG;
    # this notebook renders PDF pages to PNG files.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                                "detail": "high"
                            }
                        }
                    ]
                }
            ],
            max_tokens=500
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: callers receive the failure as text so a
        # multi-slide run is not aborted by one failed request.
        return f"Error: {str(e)}"


In [38]:
def convert_pdf_to_images(pdf_path, output_dir):
    """
    Convert each page of a PDF file to an image and save them to the specified output directory.

    Pages are written as ``page_<n>.png`` (``n`` starts at 1) inside a
    sub-directory of ``output_dir`` named after the PDF file's stem.

    :param pdf_path: Path to the PDF file
    :param output_dir: Directory to save the converted images
    """
    # Create the output directory if it doesn't exist
    pdf_name = Path(pdf_path).stem
    image_dir = Path(output_dir) / pdf_name
    image_dir.mkdir(parents=True, exist_ok=True)

    # Scale factor for rendering at 300 DPI; built once because it is the
    # same for every page.
    zoom = 300 / 72
    render_matrix = fitz.Matrix(zoom, zoom)

    # Open the PDF in a context manager so the document is closed even if
    # rendering or saving a page raises.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # Convert the page to an image
            pix = page.get_pixmap(matrix=render_matrix)

            # Save the image
            image_filename = f"page_{page_num}.png"
            pix.save(image_dir / image_filename)

In [39]:
# Lecture deck to process; later cells build their paths from pdf_name.
pdf_name = "L15-nearest-neighbor-10-17"
pdf_path = "./data/test_pdfs/" + pdf_name + ".pdf"
output_dir = "./data/test_images/"

# Render every page of the PDF into a sub-folder of output_dir named after the PDF.
convert_pdf_to_images(pdf_path=pdf_path, output_dir=output_dir)

In [40]:
import cv2
import numpy as np
from pathlib import Path
import os

def calculate_similarity(img1, img2):
    """
    Calculate the similarity between two images using ORB feature matching.

    The score is the number of cross-checked descriptor matches divided by the
    larger of the two keypoint counts. Cross-checked matches are one-to-one,
    so the ratio stays within [0, 1].

    :param img1: First image
    :param img2: Second image
    :return: A similarity score between 0 and 1; 0.0 when either image yields
        no ORB features (e.g. a blank page)
    :raises ValueError: If either image is None
    """
    # cv2.imread signals failure by returning None; fail with a clear message
    # rather than an opaque OpenCV error further down.
    if img1 is None or img2 is None:
        raise ValueError(
            "calculate_similarity received None instead of an image "
            "(did cv2.imread fail to load the file?)"
        )

    # Initialize ORB detector
    orb = cv2.ORB_create()

    # Find the keypoints and descriptors with ORB
    kp1, des1 = orb.detectAndCompute(img1, None)
    kp2, des2 = orb.detectAndCompute(img2, None)

    # Without descriptors on either side there is nothing to match, and the
    # ratio below would divide by zero or hand None to the matcher.
    if des1 is None or des2 is None or not kp1 or not kp2:
        return 0.0

    # Create BFMatcher object; Hamming distance suits ORB's binary descriptors
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)

    # Match descriptors. Only the match count is used, so no sorting is needed.
    matches = bf.match(des1, des2)

    # Calculate similarity score
    return len(matches) / max(len(kp1), len(kp2))

def _page_number(filename):
    """Return the integer after the last underscore of the file stem (``page_12.png`` -> 12), or None."""
    stem = os.path.splitext(filename)[0]
    _, sep, tail = stem.rpartition('_')
    return int(tail) if sep and tail.isdecimal() else None


def _read_image(path):
    """Load an image with OpenCV, raising ValueError if the file cannot be decoded."""
    img = cv2.imread(path)
    if img is None:
        raise ValueError(f"Could not read image: {path}")
    return img


def merge_similar_images(image_dir, output_dir, similarity_threshold=0.7):
    """
    Merge similar consecutive images in a directory while maintaining the original order.

    Files are ordered by the page number embedded in their name
    (``page_2.png`` before ``page_10.png``). Each run of consecutive images
    whose pairwise similarity reaches the threshold is stacked vertically and
    saved as ``merged_<NNN>.png``, where ``NNN`` is the page number of the
    first image in the run.

    :param image_dir: Directory containing the images
    :param output_dir: Directory to save the merged images
    :param similarity_threshold: Threshold for considering images as similar
    :raises ValueError: If an image file cannot be read
    """
    # Ensure output directory exists
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    def page_order(name):
        # Numbered files first in numeric order; anything else afterwards by name.
        number = _page_number(name)
        return (number is None, number or 0, name)

    # A plain string sort would place page_10 before page_2 and break the
    # "consecutive pages" comparison, so sort on the numeric page index.
    image_files = sorted(
        (f for f in os.listdir(image_dir) if f.endswith('.png')),
        key=page_order,
    )

    if not image_files:
        print(f"No PNG images found in {image_dir}; nothing to merge")
        return

    merged_groups = []
    current_group = [image_files[0]]

    # Carry the previous image forward so each file is decoded once for the
    # comparison pass instead of twice.
    previous_img = _read_image(os.path.join(image_dir, image_files[0]))
    for name in image_files[1:]:
        current_img = _read_image(os.path.join(image_dir, name))

        if calculate_similarity(previous_img, current_img) >= similarity_threshold:
            current_group.append(name)
        else:
            merged_groups.append(current_group)
            current_group = [name]

        previous_img = current_img

    # Add the last group
    merged_groups.append(current_group)

    # Merge and save images
    for group in merged_groups:
        images = [_read_image(os.path.join(image_dir, f)) for f in group]
        if len(images) == 1:
            merged = images[0]
        else:
            # np.vstack needs equal widths: stretch each image to the widest
            # one while keeping its own height.
            max_width = max(img.shape[1] for img in images)
            merged = np.vstack([cv2.resize(img, (max_width, img.shape[0])) for img in images])

        # Use the first image's number in the group for naming; fall back to
        # its stem when the name carries no page number.
        first_num = _page_number(group[0])
        if first_num is not None:
            merged_name = f'merged_{first_num:03d}.png'
        else:
            merged_name = f'merged_{os.path.splitext(group[0])[0]}.png'
        cv2.imwrite(os.path.join(output_dir, merged_name), merged)

    print(f"Merged images saved to {output_dir}")

# Read page images from the per-PDF folder and write the merged images to a
# sibling folder with the "_merged" suffix.
image_dir = "./data/test_images/" + pdf_name
output_dir = image_dir + "_merged"
merge_similar_images(image_dir=image_dir, output_dir=output_dir, similarity_threshold=0.7)

Merged images saved to ./data/test_images/L15-nearest-neighbor-10-17_merged


In [41]:
# Load the per-slide prompt. The encoding is pinned to UTF-8 so the read does
# not depend on the platform default and matches the UTF-8 files this
# notebook writes.
lecture_prompt_path = './prompts/slide_prompt'
with open(lecture_prompt_path, 'r', encoding='utf-8') as file:
    lecture_prompt = file.read()

In [42]:
def generate_lecture_from_images(image_dir, prompt):
    """
    Generate a complete lecture by analyzing images in sequence, maintaining context.

    Images are processed in sorted filename order. Each request carries the
    text generated for the previous two slides so the model can avoid
    repeating itself.

    :param image_dir: Directory containing the merged images
    :param prompt: The base prompt to use for image analysis
    :return: Complete lecture content, one "--- Slide: <file> ---" section per image
    """
    image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.png'))
    sections = []
    context = []

    # Iterate the list directly (not an enumerate generator) so tqdm knows the
    # total and can show a real progress bar with an ETA.
    for image_file in tqdm(image_files, desc="Analyzing slides"):
        image_path = os.path.join(image_dir, image_file)

        # Create a context-aware prompt
        context_prompt = f"{prompt}\n\nContext from previous slides:\n{' '.join(context)}\n\nAnalyze the current slide in the context of what has been discussed before. remember do not repeat the same information."

        slide_content = analyze_image_gpt4o(context_prompt, image_path)
        sections.append(f"\n\n--- Slide: {image_file} ---\n{slide_content}")

        # Update context, keeping only the two most recent slides
        context.append(slide_content)
        del context[:-2]

    return "".join(sections)
# Folder holding the merged slide images for the current PDF.
merged_image_dir = "./data/test_images/" + pdf_name + "_merged"

In [43]:
# Display the active PDF name to confirm which deck the next cell will process.
pdf_name

'L15-nearest-neighbor-10-17'

In [44]:
complete_lecture = generate_lecture_from_images(merged_image_dir, lecture_prompt)
print(complete_lecture)

# Save the lecture to a file. The target folder is created first so the write
# cannot fail on a missing directory after the API calls have been made.
lecture_output_path = Path(f"./data/generated_lectures/{pdf_name}_lecture.txt")
lecture_output_path.parent.mkdir(parents=True, exist_ok=True)
with open(lecture_output_path, "w", encoding="utf-8") as f:
    f.write(complete_lecture)


19it [02:10,  6.86s/it]



--- Slide: merged_001.png ---
Now, let's dive deeper into the exciting world of Nearest Neighbor algorithms and Metric Learning. We've touched on data representation and feature extraction earlier, and these concepts tie directly into what we're about to explore.

Nearest Neighbor, often abbreviated as NN, is a simple, yet powerful, classification algorithm. It works by identifying the closest data points in a dataset to make predictions about new, unseen data points. Imagine you have a scatterplot of vegetables, each designated by its color and size. If you want to classify a new vegetable, you simply find its closest neighbors and determine its type based on majority rule.

But, how do you measure the "closeness" of data points? This is where the concept of a "metric" comes in. A metric is a mathematical function used to define a distance between any two points in a space. The most common example is the Euclidean distance, which measures the straight line distance between two point




In [45]:
# Load the summary prompt. The encoding is pinned to UTF-8 so the read does
# not depend on the platform default and matches the UTF-8 files this
# notebook writes.
final_slide_prompt_path = './prompts/final_slide_prompt'
with open(final_slide_prompt_path, 'r', encoding='utf-8') as file:
    final_slide_prompt = file.read()




In [46]:
from openai import OpenAI

def summarize_lecture(complete_lecture, final_slide_prompt):
    """
    Ask the gpt-4o-mini model to respond to a lecture followed by a closing prompt.

    The lecture text and the prompt are joined with a blank line and sent as
    a single user message after a generic system message.

    :param complete_lecture: Full lecture text
    :param final_slide_prompt: Instructions appended after the lecture text
    :return: Message content of the first choice in the model's response
    """
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {
        "role": "user",
        "content": '\n\n'.join((complete_lecture, final_slide_prompt)),
    }

    completion = OpenAI().chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


summary = summarize_lecture(complete_lecture, final_slide_prompt)

# Write the complete lecture and its summary to a text file
report_parts = [
    "Complete Lecture:\n\n",
    complete_lecture,
    "\n\nSummary:\n\n",
    summary,
]
with open('lecture_and_summary.txt', 'w', encoding='utf-8') as f:
    f.writelines(report_parts)

print("Lecture content and summary have been saved to 'lecture_and_summary.txt'")

Lecture content and summary have been saved to 'lecture_and_summary.txt'
