diff --git a/backend/main.py b/backend/main.py index acf66da..3d78027 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1,7 +1,6 @@ # Copyright (C) 2025 Intel Corporation # SPDX-License-Identifier: Apache-2.0 - -from fastapi import FastAPI, HTTPException, File, UploadFile +from fastapi import BackgroundTasks, FastAPI, HTTPException, File, UploadFile from pydantic import BaseModel import fitz # PyMuPDF from pathlib import Path @@ -12,11 +11,15 @@ from generate_image_embedding import generate_image_embedding from fastapi.responses import FileResponse, JSONResponse from generate_pptx import create_pptx +from generate_pptx import create_pptx from starlette.background import BackgroundTask import tempfile import imagehash from PIL import Image import io +import uuid +from typing import Dict +import json app = FastAPI() @@ -26,22 +29,10 @@ OUTPUT_DIR.mkdir(parents=True, exist_ok=True) -@app.post("/parse") -async def parse_pdf(file: UploadFile = File(...)): - """ - Endpoint to parse a PDF file uploaded via multipart/form-data. - Extracts images, generates captions and embeddings, and returns the data. 
@app.post("/upload")
async def upload_file(
    file: UploadFile = File(...), background_tasks: BackgroundTasks = None
):
    """Store an uploaded PDF in the temp directory and queue it for parsing.

    The upload is written to ``<tempdir>/<job_id>.pdf`` and
    ``process_pdf_to_file`` is scheduled as a background task; that task
    deletes the PDF when it finishes.

    Args:
        file: The multipart upload. Its client-supplied ``filename`` is only
            passed through as display metadata, never used to build a path.
        background_tasks: Task queue the processing job is added to.

    Returns:
        ``{"jobID": <uuid4 string>}`` to be used with ``GET /result/{job_id}``.

    Raises:
        HTTPException: 500 if the upload cannot be stored or scheduled.
    """
    job_id = str(uuid.uuid4())
    # The client-controlled filename is untrusted (it may contain path
    # separators or ".." segments), so the on-disk name is derived from the
    # server-generated job ID only.
    tmp_path = os.path.join(tempfile.gettempdir(), f"{job_id}.pdf")

    try:
        with open(tmp_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        # Schedule background PDF processing
        background_tasks.add_task(process_pdf_to_file, job_id, tmp_path, file.filename)
    except Exception as e:
        # Nothing will ever process (and therefore delete) this file, so
        # remove the partial upload here instead of leaking it.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError as cleanup_err:
            print(f"Warning: Failed to remove temporary PDF {tmp_path}: {cleanup_err}")
        raise HTTPException(
            status_code=500, detail=f"Error uploading file: {e}"
        ) from e

    return {"jobID": job_id}


@app.get("/result/{job_id}")
def get_result(job_id: str):
    """Return the parsed result for a job, or 202 while it is not available.

    Args:
        job_id: The ``jobID`` value returned by ``POST /upload``.

    Returns:
        The JSON document written by ``process_pdf_to_file``, or a 202
        ``JSONResponse`` when no complete result file exists yet.

    Raises:
        HTTPException: 400 if ``job_id`` is not a valid UUID.
    """
    # job_id arrives from the URL and ends up in a filesystem path, so only
    # accept something that parses as a UUID and use its canonical form.
    try:
        canonical_id = str(uuid.UUID(job_id))
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.") from None

    result_path = os.path.join(tempfile.gettempdir(), f"{canonical_id}.json")

    # Open directly rather than checking os.path.exists first: the file can
    # appear between the check and the open, and it can also exist while the
    # background task is still writing it, in which case parsing fails and
    # the job is reported as not finished yet.
    # NOTE(review): a job whose processing raised never writes a result file
    # (the worker only prints the error), so such jobs report 202 forever.
    try:
        with open(result_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return JSONResponse(
            status_code=202, content={"message": "PDF processing not complete yet."}
        )
prepareSourceContent( + selectedSources, + topicName, + courseInfo, + ) // Ensure assistant content fits within context window const assistantMessage: CoreMessage = { @@ -112,7 +117,9 @@ export async function generateCourseContent( const metadataUserMessage: CoreMessage = { role: 'user', - content: `Generate the title, learning outcomes, and at least 5-10 key terms for a ${difficultyLevel} level ${contentType} on "${topicName}" based STRICTLY on the provided source materials above.`, + content: sourceMetadata.usingCourseContext + ? `Generate the title, learning outcomes, and at least 5-10 key terms for a ${difficultyLevel} level ${contentType} on "${topicName}" based on standard academic knowledge and best practices for this subject area.` + : `Generate the title, learning outcomes, and at least 5-10 key terms for a ${difficultyLevel} level ${contentType} on "${topicName}" based STRICTLY on the provided source materials above.`, } const metadataMessages = [metadataSystemMessage, assistantMessage, metadataUserMessage] @@ -137,10 +144,17 @@ export async function generateCourseContent( const introSystemPrompt = `You are an expert educational content developer. Continue creating a ${difficultyLevel} level ${contentType} on "${topicName}" designed for a ${sessionLength}-minute session. IMPORTANT INSTRUCTIONS: -1. You MUST base your content ENTIRELY on the source materials provided. +${ + sourceMetadata.usingCourseContext + ? `1. Since no specific source materials were provided, base your content on standard academic knowledge for the topic. +2. Draw from established educational practices and common curriculum content for this subject area. +3. Create content appropriate for the specified difficulty level and session length. +4. Ensure the introduction provides context and importance of the topic based on general knowledge.` + : `1. You MUST base your content ENTIRELY on the source materials provided. 2. 
Extract key concepts, terminology, examples, and explanations directly from the source materials. 3. Do not introduce concepts or information that is not present in the source materials. -4. Create an engaging introduction that provides context and importance of the topic. +4. Create an engaging introduction that provides context and importance of the topic.` +} RESPONSE FORMAT: Your response MUST be a valid JSON object with EXACTLY these fields: @@ -157,7 +171,9 @@ CRITICAL: Your response MUST be valid JSON only. Do not include any text, markdo const introUserMessage: CoreMessage = { role: 'user', - content: `Generate an engaging introduction for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, + content: sourceMetadata.usingCourseContext + ? `Generate an engaging introduction for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based on standard academic knowledge and best practices for this subject area.` + : `Generate an engaging introduction for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, } const introMessages = [introSystemMessage, assistantMessage, introUserMessage] @@ -179,9 +195,15 @@ CRITICAL: Your response MUST be valid JSON only. Do not include any text, markdo const specialSlidesSystemPrompt = `You are an expert educational content developer. Continue creating a ${difficultyLevel} level ${contentType} on "${topicName}" designed for a ${sessionLength}-minute session. IMPORTANT INSTRUCTIONS: -1. You MUST base your content ENTIRELY on the source materials provided. +${ + sourceMetadata.usingCourseContext + ? `1. Since no specific source materials were provided, base your content on standard academic knowledge for the topic. +2. 
Draw from established educational practices and common curriculum content for this subject area. +3. Create content appropriate for the specified difficulty level and session length.` + : `1. You MUST base your content ENTIRELY on the source materials provided. 2. Extract key concepts, terminology, examples, and explanations directly from the source materials. -3. Do not introduce concepts or information that is not present in the source materials. +3. Do not introduce concepts or information that is not present in the source materials.` +} 4. Create ONLY the following special slides: - Introduction slide (first slide that introduces the topic) - Agenda/Overview slide (outlines what will be covered) @@ -228,7 +250,9 @@ CRITICAL: Your response MUST be valid JSON only. Do not include any text, markdo const specialSlidesUserMessage: CoreMessage = { role: 'user', - content: `Generate the introduction, agenda, assessment, and conclusion slides for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, + content: sourceMetadata.usingCourseContext + ? `Generate the introduction, agenda, assessment, and conclusion slides for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based on standard academic knowledge and best practices for this subject area.` + : `Generate the introduction, agenda, assessment, and conclusion slides for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, } const specialSlidesMessages = [ @@ -291,9 +315,15 @@ CRITICAL: Your response MUST be valid JSON only. Do not include any text, markdo const contentSlidesSystemPrompt = `You are generating content slides ${startSlideNum} through ${endSlideNum} of a total of ${totalContentSlidesNeeded} content slides. Ensure all slides are unique. 
IMPORTANT INSTRUCTIONS: -1. You MUST base your content ENTIRELY on the source materials provided. +${ + sourceMetadata.usingCourseContext + ? `1. Since no specific source materials were provided, base your content on standard academic knowledge for the topic. +2. Draw from established educational practices and common curriculum content for this subject area. +3. Create content appropriate for the specified difficulty level and session length.` + : `1. You MUST base your content ENTIRELY on the source materials provided. 2. Extract key concepts, terminology, examples, and explanations directly from the source materials. -3. Do not introduce concepts or information that is not present in the source materials. +3. Do not introduce concepts or information that is not present in the source materials.` +} 4. Create detailed teaching slides with substantial content on each slide. 5. Focus ONLY on core teaching content slides. 6. Each slide should have comprehensive speaker notes with additional details and examples. @@ -324,7 +354,11 @@ CRITICAL: Your response MUST be valid JSON only. Do not include any text, markdo const contentSlidesUserMessage: CoreMessage = { role: 'user', - content: `Generate content slides ${startSlideNum} through ${endSlideNum} for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above. + content: sourceMetadata.usingCourseContext + ? `Generate content slides ${startSlideNum} through ${endSlideNum} for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based on standard academic knowledge and best practices for this subject area. + +DO NOT create introduction, agenda, assessment, or conclusion slides. 
Focus ONLY on core teaching content slides.` + : `Generate content slides ${startSlideNum} through ${endSlideNum} for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above. DO NOT create introduction, agenda, assessment, or conclusion slides. Focus ONLY on core teaching content slides.`, } @@ -381,7 +415,9 @@ DO NOT create introduction, agenda, assessment, or conclusion slides. Focus ONLY const activitiesUserMessage: CoreMessage = { role: 'user', - content: `Generate the activities for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, + content: sourceMetadata.usingCourseContext + ? `Generate the activities for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based on standard academic knowledge and best practices for this subject area.` + : `Generate the activities for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, } const activitiesMessages = [activitiesSystemMessage, assistantMessage, activitiesUserMessage] @@ -417,7 +453,9 @@ DO NOT create introduction, agenda, assessment, or conclusion slides. Focus ONLY const assessmentUserMessage: CoreMessage = { role: 'user', - content: `Generate assessment ideas (without example questions) for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, + content: sourceMetadata.usingCourseContext + ? 
`Generate assessment ideas (without example questions) for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based on standard academic knowledge and best practices for this subject area.` + : `Generate assessment ideas (without example questions) for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, } const assessmentMessages = [assessmentSystemMessage, assistantMessage, assessmentUserMessage] @@ -479,7 +517,9 @@ DO NOT create introduction, agenda, assessment, or conclusion slides. Focus ONLY const readingsUserMessage: CoreMessage = { role: 'user', - content: `Generate further reading suggestions for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, + content: sourceMetadata.usingCourseContext + ? `Generate further reading suggestions for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based on standard academic knowledge and best practices for this subject area.` + : `Generate further reading suggestions for a ${difficultyLevel} level ${contentType} on "${topicName}" with title "${metadataResponse.title}" based STRICTLY on the provided source materials above.`, } const readingsMessages = [readingsSystemMessage, assistantMessage, readingsUserMessage] diff --git a/frontend/src/app/api/slide/route.ts b/frontend/src/app/api/slide/route.ts index 0ae893c..2d7086d 100644 --- a/frontend/src/app/api/slide/route.ts +++ b/frontend/src/app/api/slide/route.ts @@ -139,6 +139,7 @@ export async function POST(req: Request) { sessionLength, difficultyLevel, topicName, + courseInfo, } = requestData console.log('Data from request:', { @@ -149,6 +150,7 @@ export async function POST(req: Request) { sessionLength, difficultyLevel, topicName, + courseInfo, }) // Generate course 
content @@ -160,6 +162,7 @@ export async function POST(req: Request) { sessionLength, difficultyLevel, topicName, + courseInfo, ) return NextResponse.json(generatedContent) diff --git a/frontend/src/app/api/slide/types.ts b/frontend/src/app/api/slide/types.ts index 021f3a5..52aee57 100644 --- a/frontend/src/app/api/slide/types.ts +++ b/frontend/src/app/api/slide/types.ts @@ -4,6 +4,7 @@ // Type definitions for course content generation import { ClientSource } from '@/lib/types/client-source' +import type { CourseInfo } from '@/lib/types/course-info-types' export interface LectureSlide { title: string @@ -120,6 +121,7 @@ export interface CourseContentRequest { sessionLength: number difficultyLevel: string topicName: string + courseInfo?: CourseInfo action?: string content?: LectureContent } diff --git a/frontend/src/app/api/slide/utils.ts b/frontend/src/app/api/slide/utils.ts index 34d5b69..dc5557c 100644 --- a/frontend/src/app/api/slide/utils.ts +++ b/frontend/src/app/api/slide/utils.ts @@ -3,6 +3,7 @@ import type { ContextChunk } from '@/lib/types/context-chunk' import type { ClientSource } from '@/lib/types/client-source' +import type { CourseInfo } from '@/lib/types/course-info-types' import { getStoredChunks } from '@/lib/chunk/get-stored-chunks' import type { AssessmentQuestion } from './types' import { fallbackDiscussionIdeas } from './fallback-content' @@ -95,14 +96,66 @@ export function extractAndParseJSON(text: string) { } // Prepare source content for the AI model -export async function prepareSourceContent(selectedSources: ClientSource[]) { +export async function prepareSourceContent( + selectedSources: ClientSource[], + topicName?: string, + courseInfo?: CourseInfo, +) { try { + // Check if we have any selected sources + const selectedSourcesFiltered = selectedSources?.filter((source) => source.selected) || [] + // If no sources are selected, create course-based content + if (selectedSourcesFiltered.length === 0) { + console.log('No sources selected, 
using course context for content generation') + const courseContent = `COURSE CONTEXT:\n\n` + let structuredContent = courseContent + if (courseInfo) { + structuredContent += `Course: ${courseInfo.courseCode || ''} ${courseInfo.courseName || 'Academic Course'}\n` + structuredContent += `Semester: ${courseInfo.semester || 'Current Semester'}\n` + structuredContent += `Academic Year: ${courseInfo.academicYear || 'Current Academic Year'}\n\n` + } + structuredContent += `Topic: ${topicName || 'Course Topic'}\n\n` + structuredContent += `GENERAL KNOWLEDGE CONTEXT:\n` + structuredContent += `Since no specific source materials were provided, this content should be generated based on:\n` + structuredContent += `1. Standard academic knowledge for the topic "${topicName}"\n` + structuredContent += `2. Common educational practices and pedagogical approaches\n` + structuredContent += `3. Typical curriculum content for this subject area\n` + structuredContent += `4. Best practices in educational content development\n\n` + const sourceMetadata = { + sourceCount: 0, + chunkCount: 0, + tokenEstimate: countTokens(structuredContent), + sourceNames: [], + usingCourseContext: true, + } + return { content: structuredContent, metadata: sourceMetadata } + } + // Use the getStoredChunks function to retrieve chunks from Payload CMS - const retrievedChunks = await getStoredChunks(selectedSources) + const retrievedChunks = await getStoredChunks(selectedSourcesFiltered) console.log('Retrieved chunks:', retrievedChunks.length) if (retrievedChunks.length === 0) { - throw new Error('No content found in the selected sources.') + // If we have selected sources but no chunks found, fallback to course context + console.log('No content found in selected sources, falling back to course context') + const courseContent = `COURSE CONTEXT (Source Fallback):\n\n` + let structuredContent = courseContent + if (courseInfo) { + structuredContent += `Course: ${courseInfo.courseCode || ''} 
${courseInfo.courseName || 'Academic Course'}\n` + structuredContent += `Semester: ${courseInfo.semester || 'Current Semester'}\n` + structuredContent += `Academic Year: ${courseInfo.academicYear || 'Current Academic Year'}\n\n` + } + structuredContent += `Topic: ${topicName || 'Course Topic'}\n\n` + structuredContent += `GENERAL KNOWLEDGE CONTEXT:\n` + structuredContent += `Content should be generated based on standard academic knowledge for "${topicName}"\n\n` + const sourceMetadata = { + sourceCount: 0, + chunkCount: 0, + tokenEstimate: countTokens(structuredContent), + sourceNames: [], + usingCourseContext: true, + } + return { content: structuredContent, metadata: sourceMetadata } } // Process chunks to create a more structured context @@ -184,6 +237,7 @@ export async function prepareSourceContent(selectedSources: ClientSource[]) { chunkCount: retrievedChunks.length, tokenEstimate: countTokens(structuredContent), sourceNames: Array.from(sourceGroups.keys()), + usingCourseContext: false, } return { content: structuredContent, metadata: sourceMetadata } @@ -196,6 +250,7 @@ export async function prepareSourceContent(selectedSources: ClientSource[]) { chunkCount: 0, tokenEstimate: 0, sourceNames: [], + usingCourseContext: false, }, } } diff --git a/frontend/src/components/slide/ConfigView.tsx b/frontend/src/components/slide/ConfigView.tsx index c850e4c..1b7e2d1 100644 --- a/frontend/src/components/slide/ConfigView.tsx +++ b/frontend/src/components/slide/ConfigView.tsx @@ -103,11 +103,12 @@ export function ConfigView({ {selectedSources.filter((source) => source.selected).length === 0 && (

- No sources selected. Please select at least one source document from the - sidebar. + No sources selected. Content will be generated using course context and + general knowledge.

- The generated content will be based ONLY on your selected sources. + You can optionally select source documents to focus content on specific + materials.

)} diff --git a/frontend/src/components/slide/CourseContentGenerator.tsx b/frontend/src/components/slide/CourseContentGenerator.tsx index 6cda010..376de90 100644 --- a/frontend/src/components/slide/CourseContentGenerator.tsx +++ b/frontend/src/components/slide/CourseContentGenerator.tsx @@ -7,6 +7,8 @@ import { toast } from 'sonner' import { ContextRequirementMessage } from '@/components/context-requirement-message' import { useContextAvailability } from '@/lib/hooks/use-context-availability' import { useSourcesStore } from '@/lib/store/sources-store' +import { usePersonaStore } from '@/lib/store/persona-store' +import { useCourses } from '@/lib/hooks/use-courses' import type { LectureContent, View } from '@/lib/types/slide' import { WelcomeView } from './WelcomeView' import { ConfigView } from './ConfigView' @@ -37,6 +39,8 @@ export default function CourseContentGenerator() { const { getActiveContextModelName } = useContextAvailability() const selectedModel = getActiveContextModelName() const selectedSources = useSourcesStore((state) => state.selectedSources) + const { selectedCourseId } = usePersonaStore() + const { data: coursesData } = useCourses() const generateCourseContent = async () => { if (!selectedModel) { @@ -50,8 +54,11 @@ export default function CourseContentGenerator() { } const selectedSourcesCount = selectedSources.filter((source) => source.selected).length - if (selectedSourcesCount === 0 || selectedSourcesCount >= 2) { - toast.error('Please select EXACTLY one source.') + // Allow generation with no sources selected (will use course context) + if (selectedSourcesCount > 1) { + toast.error( + 'Please select at most one source. 
You can also generate content without selecting any sources.', + ) return } @@ -62,6 +69,20 @@ export default function CourseContentGenerator() { setExpandedQuestions({}) try { + // Get course information from context - using proven method from assessment page + const selectedCourse = coursesData?.docs.find((course) => course.id === selectedCourseId) + const courseDescription = selectedCourse?.description || '' + const courseInfo = selectedCourse + ? { + courseCode: selectedCourse.code || '', + courseName: selectedCourse.name || '', + courseDescription: courseDescription, + // Add semester and academic year if available in the course data + semester: selectedCourse.tag || '', + academicYear: '', // This might need to be added to the Course type if not available + } + : undefined + const response = await fetch('/api/slide', { method: 'POST', headers: { @@ -75,6 +96,7 @@ export default function CourseContentGenerator() { sessionLength, difficultyLevel, topicName, + courseInfo, }), }) @@ -92,7 +114,14 @@ export default function CourseContentGenerator() { setGenerationError(data._error) toast.warning('Content generation had issues. 
/**
 * Normalizes a text chunk before it is sent to the embedding model.
 *
 * Steps, in order:
 *  1. Remove zero-width spaces and control characters, except the
 *     whitespace controls U+0009–U+000D (tab, newline, etc.).
 *  2. Replace every run of whitespace with a single space.
 *  3. Collapse runs of three or more identical `.`, `-`, `_` or `*`
 *     characters to a single character.
 *  4. Trim leading and trailing whitespace.
 *
 * @param text - The raw chunk text.
 * @returns The cleaned chunk; an empty string if nothing remains.
 */
function sanitizeChunk(text: string): string {
  return (
    text
      // Strip zero-width and control characters. Tab/newline/CR are left
      // out of this class on purpose: deleting them outright would fuse the
      // words on either side ("hello\nworld" -> "helloworld").
      .replace(/[\u0000-\u0008\u000E-\u001F\u007F-\u009F\u200B]/g, '')
      // Turn every whitespace run (including single tabs/newlines) into one space
      .replace(/\s+/g, ' ')
      // Collapse long runs of periods (..... -> .)
      .replace(/([.])\1{2,}/g, '$1')
      // Collapse long runs of dashes, underscores and asterisks
      .replace(/([-_*])\1{2,}/g, '$1')
      .trim()
  )
}
ollama.embedding(modelName), + value: sanitized, + }) + completedCount++ + const completionPercentage = ((completedCount / totalChunks) * 100).toFixed(2) + + const successLog = ` +Successful embedding for chunk ${index + 1}/${totalChunks} +Length: ${chunk.length}, Tokens: ${tokens.length} +Preview: ${preview} +Completion: ${completionPercentage}% (${completedCount}/${totalChunks}) +-------` + console.log(successLog) + return { + order: index + 1, + chunk: sanitized, + embedding, + sourceType: 'user' as const, + } + } catch (err: unknown) { + let message: string + if (err instanceof Error) { + message = err.message + } else if (typeof err === 'string') { + message = err + } else { + message = JSON.stringify(err) + } + + const errorLog = ` +Attempt ${attempt}/${maxRetries} failed for chunk ${index + 1}/${totalChunks} +Length: ${chunk.length}, Tokens: ${tokens.length} +Preview: ${preview} +Error: ${message} +-------` + console.error(errorLog) + if (attempt < maxRetries) { + const jitter = randomInt(0, 100) + const delay = 500 * 2 ** (attempt - 1) + jitter + await new Promise((resolve) => { + setTimeout(() => resolve(), delay) + }) + } } - } catch (error) { - throw new Error( - `Failed to generate embedding for chunk ${index + 1}/${totalChunks}: ${error}`, - ) } - }) - const results = await Promise.all(embeddingPromises) + const finalErrorLog = ` +Failed permanently for chunk ${index + 1}/${totalChunks} +Length: ${chunk.length}, Tokens: ${tokens.length} +Preview: ${preview} +-------` + console.error(finalErrorLog) + return null + } + + const embeddingPromises = chunks.map((chunk, index) => embedChunk(chunk, index)) + const settled = await Promise.all(embeddingPromises) + + const results = settled.filter((r): r is EmbeddingChunk => r !== null) + const endTime = Date.now() const totalTimeTakenMs = endTime - startTime const totalTimeTakenSec = (totalTimeTakenMs / 1000).toFixed(2) console.log( - `Generated ${chunks.length} embeddings in ${totalTimeTakenMs}ms 
/**
 * Polls GET /result/{jobID} until the backend returns the parsed PDF data.
 *
 * The backend answers 202 while no result is available. It also answers 202
 * indefinitely for a job whose processing failed, so the wait is bounded
 * instead of retrying forever.
 *
 * @param maxWaitMs - Upper bound on the total time spent waiting.
 * @param intervalMs - Delay between two consecutive polls.
 * @returns The parsed JSON body of the first non-202 successful response.
 * @throws Error if the server returns a non-OK status or the wait times out.
 */
const pollResult = async (maxWaitMs: number = 30 * 60 * 1000, intervalMs: number = 3000) => {
  const resultUrl = new URL(
    `/result/${encodeURIComponent(jobID)}`,
    process.env.FASTAPI_SERVER_URL,
  )
  const deadline = Date.now() + maxWaitMs

  // Iterative rather than recursive so a long-running job does not build an
  // ever-growing chain of pending promises.
  for (;;) {
    const pollRes = await fetch(resultUrl)
    if (pollRes.status !== 202) {
      if (!pollRes.ok) {
        throw new Error('Failed to retrieve processed PDF result')
      }
      return pollRes.json()
    }

    // Not ready yet: give up if another wait would exceed the deadline.
    if (Date.now() + intervalMs > deadline) {
      throw new Error('Timed out waiting for processed PDF result')
    }
    await new Promise<void>((resolve) => {
      setTimeout(resolve, intervalMs)
    })
  }
}
}); + + // contentSequence.push({ type: 'text', content: data.toString('utf-8') }); } else if (mimetype.includes('markdown') || ext === '.md') { fileType = 'md' - // contentSequence.push({ type: "text", content: data.toString("utf-8") }); + // contentSequence.push({ type: 'text', content: data.toString('utf-8') }); } else { throw new Error('Unsupported file type') } diff --git a/frontend/src/lib/types/course-info-types.ts b/frontend/src/lib/types/course-info-types.ts index 6d82e96..e430876 100644 --- a/frontend/src/lib/types/course-info-types.ts +++ b/frontend/src/lib/types/course-info-types.ts @@ -4,6 +4,7 @@ export interface CourseInfo { courseCode?: string courseName?: string + courseDescription?: string semester?: string academicYear?: string deadline?: string