In [32]:
import pdfplumber
import re
import json
def clean_text(text):
    """Normalize whitespace in *text*.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed to a
    single space and leading/trailing whitespace is removed. Falsy input
    such as ``None`` or ``""`` yields an empty string.
    """
    return re.sub(r'\s+', ' ', text).strip() if text else ""

def extract_section_between_keywords(full_text, start_keyword, end_keyword):
    """Return the cleaned text found between two keywords in *full_text*.

    Both keywords are matched literally and case-insensitively; the captured
    span is the shortest one between the first *start_keyword* and the next
    *end_keyword*, and may cross line breaks. The capture is passed through
    ``clean_text`` before being returned.

    Returns the string ``"Not Found"`` when there is no match. Any exception
    raised while searching is printed and also results in ``"Not Found"``.
    """
    try:
        found = re.search(
            f"{re.escape(start_keyword)}(.*?){re.escape(end_keyword)}",
            full_text,
            flags=re.IGNORECASE | re.DOTALL,
        )
        if found is not None:
            return clean_text(found.group(1).strip())
    except Exception as e:
        # Best-effort extraction: report the problem and fall through to the
        # sentinel rather than aborting the whole parse.
        print(f"Error extracting between '{start_keyword}' and '{end_keyword}': {e}")
    return "Not Found"

def parse_semester_course_table(table):
    """Parse a semester course table into a list of row dictionaries.

    The first row of *table* is treated as the header row (newlines inside a
    header are replaced by spaces). Each following row becomes one dictionary
    mapping header text to the cell in the same column.

    Args:
        table: A list of rows, each a list of cell values, such as one entry
            of ``page.extract_tables()``. Cells may be ``None``.

    Returns:
        A list of ``{header: cell}`` dictionaries, one per data row that
        contributes at least one cell. ``[]`` is returned when *table* is
        falsy or has fewer than two rows.
    """
    if not table or len(table) < 2:
        return []

    # Header cells can be None (empty or merged cells). Calling .replace() on
    # None would raise AttributeError, so such columns get a positional name.
    headers = [
        f"column_{index}" if cell is None else str(cell).replace('\n', ' ')
        for index, cell in enumerate(table[0] or [])
    ]

    courses = []
    for row in table[1:]:
        # zip() stops at the shorter sequence: short rows omit the trailing
        # headers, and cells beyond the header row are ignored.
        course_data = dict(zip(headers, row or []))
        if course_data:
            courses.append(course_data)

    return courses

def find_tables_for_section(pages, section_title):
    """Collect course rows from tables on and after the page holding a title.

    Pages are scanned in order. Starting with the first page whose extracted
    text contains *section_title*, every table on that page and on all later
    pages whose header row mentions "Course Number" is parsed with
    ``parse_semester_course_table`` and its rows are appended to the result.

    Args:
        pages: Iterable of page objects exposing ``extract_text()`` and
            ``extract_tables()`` (e.g. ``pdf.pages`` from pdfplumber).
        section_title: Text to look for (plain, case-sensitive substring).

    Returns:
        A flat list of course dictionaries; ``[]`` if the title never appears
        or no qualifying table follows it.

    NOTE(review): the title is a substring test (so "SEMESTER I" is also
    satisfied by "SEMESTER II") and collection never stops at the next
    section, so results for different titles can overlap. Left unchanged
    here because the section end cannot be determined from this function.
    """
    tables_found = []
    found_title = False

    for page in pages:
        if not found_title:
            # extract_text() can yield None for a page without text; treat it
            # as empty so the containment test cannot raise TypeError. Once
            # the title has been seen the page text is no longer needed.
            page_text = page.extract_text() or ""
            found_title = section_title in page_text

        if not found_title:
            continue

        for table in page.extract_tables() or []:
            # Skip empty tables / empty header rows instead of indexing them.
            if not table or not table[0]:
                continue
            if any("Course Number" in str(h) for h in table[0]):
                tables_found.extend(parse_semester_course_table(table))

    return tables_found

def _parse_course_info_table(preceding_text):
    """Parse the course-info table that sits just before a COURSE OVERVIEW marker.

    *preceding_text* is the text that ends where "COURSE OVERVIEW:" begins.
    The table is taken to start at the last "Course Code" in that text and
    must also contain "Year of Introduction". Its first non-empty line is the
    header row and the second the value row; columns are split on "|" or,
    when that yields fewer than two headers, on runs of two or more spaces.

    Returns a ``{header: value}`` dict, or a dict with a single "Error" key
    when the table is missing or has fewer than two non-empty lines.
    """
    start = preceding_text.rfind("Course Code")
    if start == -1 or "Year of Introduction" not in preceding_text[start:]:
        return {"Error": "Table not found"}

    lines = [line.strip() for line in preceding_text[start:].split("\n") if line.strip()]
    if len(lines) < 2:
        return {"Error": "Table lines not sufficient"}

    headers = [h.strip() for h in lines[0].split("|")]
    values = [v.strip() for v in lines[1].split("|")]
    if len(headers) < 2:
        # No pipe-delimited columns: fall back to wide-gap separated columns.
        headers = re.split(r'\s{2,}', lines[0])
        values = re.split(r'\s{2,}', lines[1])
    return dict(zip(headers, values))


def parse_detailed_course_descriptions(full_text):
    """Parse the detailed per-course descriptions, grouped by semester.

    *full_text* is split on "SEMESTER <roman numeral>" headings (matched
    case-insensitively) and each semester's text is split again on
    "COURSE OVERVIEW:". For every course the following keys are produced:
    ``course_info``, ``overview``, ``outcomes``, ``syllabus``, ``textbooks``,
    ``references`` and ``course_plan``. Sections that cannot be located hold
    the string "Not Found".

    Args:
        full_text: The text of the whole document.

    Returns:
        ``{semester_heading: {"courses": [course_dict, ...]}}``, or
        ``{"error": "Could not find semester sections."}`` when no semester
        heading is present. Courses found under a heading that occurs more
        than once are accumulated under the same key.
    """
    semester_chunks = re.split(r'(SEMESTER\s+[IVX]+)', full_text, flags=re.IGNORECASE)

    if len(semester_chunks) < 2:
        return {"error": "Could not find semester sections."}

    semesters_data = {}
    plan_pattern = re.compile(r"COURSE PLAN(.*)", re.IGNORECASE | re.DOTALL)

    # With one capture group re.split() alternates text and headings, so the
    # odd indices are headings and each is followed by its content.
    for i in range(1, len(semester_chunks), 2):
        semester_name = semester_chunks[i].strip()
        semester_content = semester_chunks[i + 1]

        # setdefault keeps courses already collected if the same heading
        # appears again later in the document instead of discarding them.
        courses = semesters_data.setdefault(semester_name, {"courses": []})["courses"]

        course_chunks = re.split(r'COURSE OVERVIEW:', semester_content, flags=re.IGNORECASE)

        # Pair every course block with the chunk before it: the course-info
        # table precedes the "COURSE OVERVIEW:" marker, so it lives at the
        # end of the previous chunk and can never be found inside the block
        # that starts at the marker.
        # NOTE(review): for the same reason each course_plan still ends with
        # the next course's info table text; left as-is.
        for preceding_text, course_block in zip(course_chunks, course_chunks[1:]):
            course_block_full = "COURSE OVERVIEW:" + course_block

            course_details = {'course_info': _parse_course_info_table(preceding_text)}

            course_details['overview'] = extract_section_between_keywords(course_block_full, "COURSE OVERVIEW:", "COURSE OUTCOMES")
            course_details['outcomes'] = extract_section_between_keywords(course_block_full, "COURSE OUTCOMES", "SYLLABUS")
            course_details['syllabus'] = extract_section_between_keywords(course_block_full, "SYLLABUS", "TEXT BOOKS")
            course_details['textbooks'] = extract_section_between_keywords(course_block_full, "TEXT BOOKS", "REFERENCES")
            course_details['references'] = extract_section_between_keywords(course_block_full, "REFERENCES", "COURSE PLAN")

            # The course plan is the last section, so it runs to the end of
            # the block rather than up to a closing keyword.
            plan_match = plan_pattern.search(course_block_full)
            if plan_match:
                course_details['course_plan'] = clean_text(plan_match.group(1))
            else:
                course_details['course_plan'] = "Not Found"

            courses.append(course_details)

    return semesters_data


def process_syllabus_pdf(pdf_path):
    """Process the syllabus PDF and return its content as a structured dict.

    The PDF is opened with pdfplumber and parsed in four passes: header
    sections (vision/mission, PEOs, POs, PSOs), the course tables for
    semesters I-VIII, the elective/honours/minor tables, and the detailed
    course descriptions. Progress messages are printed along the way.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A dict with the keys ``header_info``, ``credit_distribution``,
        ``semester_courses``, ``electives_honors_minors`` and
        ``detailed_course_info``; or ``None`` if any exception occurs (the
        error is printed).
    """
    semester_numerals = ('I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII')

    try:
        with pdfplumber.open(pdf_path) as pdf:
            all_pages = pdf.pages
            # A page without extractable text can yield None; substitute ""
            # so that one such page does not make str.join() raise and abort
            # the processing of the whole document.
            full_text = "\n".join(page.extract_text() or "" for page in all_pages)

            syllabus_data = {
                "header_info": {},
                "credit_distribution": "Not Found",
                "semester_courses": {},
                "electives_honors_minors": {},
                "detailed_course_info": {}
            }

            print("Parsing header information...")
            header_info = syllabus_data["header_info"]
            header_info["institution_vision_mission"] = extract_section_between_keywords(full_text, "Vision and Mission of the Institution", "Vision and Mission of the Department")
            header_info["department_vision_mission"] = extract_section_between_keywords(full_text, "Vision and Mission of the Department", "PROGRAMME EDUCATIONAL OBJECTIVES")
            header_info["peos"] = extract_section_between_keywords(full_text, "PROGRAMME EDUCATIONAL OBJECTIVES (PEOs)", "PROGRAMME OUTCOMES (POs)")
            header_info["pos"] = extract_section_between_keywords(full_text, "PROGRAMME OUTCOMES (POs)", "PROGRAMME SPECIFIC OUTCOMES (PSOs)")
            header_info["psos"] = extract_section_between_keywords(full_text, "PROGRAMME SPECIFIC OUTCOMES (PSOs)", "Scheduling of Courses")

            print("Parsing semester tables...")
            semester_courses = syllabus_data["semester_courses"]
            for semester_roman in semester_numerals:
                semester_title = f"SEMESTER {semester_roman}"
                semester_courses[semester_title] = find_tables_for_section(all_pages, semester_title)

            print("Parsing elective/honors/minors tables...")
            baskets = syllabus_data["electives_honors_minors"]
            baskets["programme_elective_I"] = find_tables_for_section(all_pages, "PROGRAMME ELECTIVE I")
            baskets["honors_basket"] = find_tables_for_section(all_pages, "B. Tech (HONOURS)")
            baskets["minor_basket"] = find_tables_for_section(all_pages, "B. Tech (MINOR)")

            print("Parsing detailed course descriptions...")
            syllabus_data["detailed_course_info"] = parse_detailed_course_descriptions(full_text)

            print("Processing complete.")
            return syllabus_data

    except Exception as e:
        # Top-level boundary: report the failure and signal it with None so
        # the caller can skip writing any output.
        print(f"An error occurred while processing the PDF: {e}")
        return None

# Script entry point: parse one syllabus PDF and save the result as JSON.
if __name__ == "__main__":
    # Hard-coded location of the input PDF on the author's machine.
    pdf_file_path = '/home/the-stick-insect/College Syllabus Chatbot/College-Chatbot/Curriculum-2022_CSEAI_B-Tech1st-n-2nd-year-and-3-yearsyllabus_030425 (1).pdf' 
    
    # process_syllabus_pdf() returns None when processing fails.
    structured_syllabus = process_syllabus_pdf(pdf_file_path)
    
    # Only write the output file when parsing produced a (non-empty) result.
    if structured_syllabus:
        # Relative path: the file is created in the current working directory.
        output_json_path = 'structured_syllabus.json'
        with open(output_json_path, 'w', encoding='utf-8') as f:
            # ensure_ascii=False writes non-ASCII characters as-is rather
            # than as \uXXXX escapes.
            json.dump(structured_syllabus, f, ensure_ascii=False, indent=4)
        print(f"Successfully processed and saved structured data to {output_json_path}")


Parsing header information...
Parsing semester tables...
Parsing elective/honors/minors tables...
Parsing detailed course descriptions...
Processing complete.
Successfully processed and saved structured data to structured_syllabus.json
