In [None]:
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from typing import List, Dict
import os
import streamlit as st
import re
import pandas as pd
import ast
import os


## Processing Raw Data (Scraped from the Course Catalogue)

### Generate Current Course Catalogue (Used for Scraping Course Indexes)

In [None]:
directory = 'data'
file_name = 'Course_Catalogue.txt'
file_path = os.path.join(directory, file_name)

# Build one parenthesised regex alternation, e.g. "(A|B|C)", from the non-blank
# lines of the catalogue file. The value stays '' when the file is missing.
combined_lines = ''
try:
    with open(file_path, 'r') as file:
        entries = [text for text in map(str.strip, file) if text]
        combined_lines = '(' + '|'.join(entries) + ')'
    print("Combined Lines:\n", combined_lines)
except FileNotFoundError:
    print(f"The file '{file_name}' was not found in the directory '{directory}'.")

## Opening and Specifying Directory

In [None]:
# Define the source and target directories:
# raw CSV files are read from source_directory and the processed CSV files
# are written to target_directory under the same file name.
source_directory = 'data/raw'
target_directory = 'data/processed'

# Helper functions that parse individual fields out of a catalogue entry
def is_graduate_course(title):
    """
    Tell whether the first number found in a course title is below 200.

    NOTE: despite the function name, the result is True for course numbers
    *below* 200 and False for 200 and above. The extraction loop uses the
    result as a "keep this row" filter.

    Args:
        title (str): Course title text containing the course number.

    Returns:
        bool: True when a number is found and it is less than 200; False when
        the number is 200 or more, or when the title contains no number.
    """
    # Only the first run of up to three digits that starts at a word boundary is used
    found = re.search(r'\b\d{1,3}', title)
    if found is None:
        return False
    return int(found.group()) < 200

# Split the title and extract the first two elements
def get_index(title):
    """
    Return the course index: the first two whitespace-separated tokens of the title.

    Leading and trailing periods are stripped from the joined result, so
    "CSE 100. Advanced Data Structures (4)" becomes "CSE 100".
    """
    leading_tokens = title.split()[:2]
    return " ".join(leading_tokens).strip(".")

def get_name(title):
    """
    Return the course name portion of a title.

    The title is split on whitespace; the first two tokens (course code and
    number) and the last token (e.g. '(4)') are dropped and the remaining
    tokens are joined with single spaces.
    """
    tokens = title.split()
    return ' '.join(tokens[2:-1])

def get_unit(description):
    """
    Return the first integer that appears alone inside parentheses, e.g. 4 for '(4)'.

    Returns:
        int or None: The parsed number, or None when no '(digits)' group exists.
    """
    found = re.search(r'\((\d+)\)', description)
    return int(found.group(1)) if found else None

def get_tags(description):
    """
    Return the comma-separated tags that follow the marker "Tags:".

    Returns:
        list: Whitespace-stripped tags, or an empty list when the marker is absent.
    """
    if "Tags:" not in description:
        return []
    # Text between the first "Tags:" marker and the next one (or the end of the string)
    after_marker = description.split("Tags:")[1]
    return [piece.strip() for piece in after_marker.split(',')]

def split_and_extract_courses(data):
    """
    Split the input by " and " and extract the course codes of each group.

    Every piece between " and " separators becomes one group; all course codes
    found inside a piece are returned as the members of that group.

    Relies on the module-level ``combined_lines`` string, which must be a
    parenthesised regex alternation of department codes such as ``(MATH|CSE)``.

    Args:
        data (list or str): A prerequisite sentence, or a list/tuple of such
            sentences (non-string items are ignored).

    Returns:
        list: A list where each element is the list of "DEPT NUMBER" strings of
        one "and" group. An empty list is returned for unsupported input types.

    Raises:
        ValueError: If ``combined_lines`` contains no department codes, because
            course codes cannot be recognised without it.
    """
    if not combined_lines.strip('()|'):
        raise ValueError(
            "combined_lines is empty; load the course catalogue before extracting prerequisites."
        )

    # The leading \b stops a department code from matching the tail of an ordinary
    # word (e.g. "SE" inside "case 12"), which the case-insensitive flag would allow.
    course_pattern = re.compile(rf'(?i)\b{combined_lines} (\d+[A-Z]?[A-Z]?)')

    def extract_groups(text):
        """Return one list of course codes per " and "-separated piece of ``text``."""
        return [
            [
                # group(1) is the department; the last group is always the course number
                f"{found.group(1)} {found.group(course_pattern.groups)}"
                for found in course_pattern.finditer(piece)
            ]
            for piece in text.split(' and ')
        ]

    if isinstance(data, str):
        return extract_groups(data)

    if isinstance(data, (list, tuple)):
        groups = []
        for item in data:
            if isinstance(item, str):
                groups.extend(extract_groups(item))
        return groups

    return []





## Extracting Key Information (Prerequisites, Units, Major Restrictions)

In [None]:
# Make sure the output directory exists before any processed file is written
os.makedirs(target_directory, exist_ok=True)

for file_name in os.listdir(source_directory):

    if not file_name.endswith('.csv'):  # Only process CSV files
        continue

    source_file_path = os.path.join(source_directory, file_name)
    target_file_path = os.path.join(target_directory, file_name)

    # Read the CSV file; rows without a title cannot be parsed, so drop them
    raw_data = pd.read_csv(source_file_path).dropna(subset=['Title'])
    # Keep only the rows accepted by is_graduate_course (course number below 200)
    data = raw_data[raw_data['Title'].apply(is_graduate_course).astype(bool)]

    # Fields parsed from the title column
    course_indexes = data['Title'].apply(get_index).tolist()
    course_titles = data['Title'].apply(get_name).tolist()
    course_units = data['Title'].apply(get_unit).tolist()
    course_tags = data['Title'].apply(get_tags).tolist()

    # Missing descriptions are read as NaN by pandas; treat them as empty text
    # so the string operations below do not fail.
    course_descriptions = data['Description'].fillna('').astype(str).tolist()

    prerequisites = []
    major_restrictions = []

    # Loop through the course descriptions to extract key info
    for description in course_descriptions:
        # Major restrictions: the text after "restricted to" up to the first period
        match = re.search(r"restricted to (.+?)(?:\.|$)", description, re.IGNORECASE)
        if match and "all other students will be allowed" not in description.lower():
            # Only restricted to the listed major codes (e.g., CS25, EC26)
            major_restrictions.append(re.findall(r"[A-Z]{2}\d{2}", match.group(1)))
        else:
            # No restriction, or unrestricted as space permits
            major_restrictions.append([])

        # Prerequisites: the text after "Prerequisite(s):" up to the first ';' or '.'
        match = re.search(r"Prerequisite[s]*: (.+?)(?:;|\.|$)", description)
        prerequisites.append(split_and_extract_courses(match.group(1)) if match else [])

    # Assemble the extracted columns and write them next to the raw file name
    structured_data = {
        "Course_Index": course_indexes,
        "Course_Title": course_titles,
        "Course_Units" : course_units,
        "Course_Description": course_descriptions,
        "Course_Prerequisites": prerequisites,
        "Major_Restriction" : major_restrictions,
        "Course_Tags": course_tags
    }

    df_structured = pd.DataFrame(structured_data)
    df_structured.to_csv(target_file_path, index=False)

    print("Sent to", target_file_path)


In [None]:
from neo4j import GraphDatabase
from typing import List, Dict

class CourseDatabase:
    """
    Small wrapper around a Neo4j driver for courses, prerequisites and degree categories.

    Graph model used by the methods of this class:
        (:Course {code, name})
        (:Course)-[:REQUIRES]->(:ORGroup {name})<-[:MEMBER_OF]-(:Course)
            One ORGroup per "and" group of a course; its members are alternatives.
        (:Category)-[:INCLUDES]->(:OrGroup | :SequenceRequirement | :Course)
            Degree requirement tree. Note that these nodes use the label ``OrGroup``,
            which is distinct from the ``ORGroup`` label of prerequisite groups.
    """

    # Requirement trees consumed by _create_requirement_tree. Each node has a label and a
    # name, an optional list of (code, name) courses and an optional list of child nodes.
    _LOWER_DIVISION_PLAN = {
        "label": "Category",
        "name": "Lower Division",
        "courses": [
            ("CSE 12", "Basic Data Structures and Object-Oriented Design"),
        ],
        "children": [
            {
                "label": "OrGroup",
                "name": "Mathematics Sequence",
                "courses": [
                    ("MATH 18", "Linear Algebra"),
                    ("MATH 20A", "Calculus I"),
                    ("MATH 20B", "Calculus II"),
                    ("MATH 20C", "Calculus III"),
                    ("MATH 20D", "Differential Equations"),
                    ("MATH 31AH", "Honors Linear Algebra"),
                    ("MATH 31BH", "Honors Calculus I"),
                    ("MATH 31CH", "Honors Calculus II"),
                ],
            },
            {
                "label": "OrGroup",
                "name": "Introduction to Computer Science",
                "courses": [
                    ("CSE 8A-B", "Introduction to Computer Science: Java"),
                    ("CSE 11", "Accelerated Introduction to Computer Science: Java"),
                ],
            },
            {
                "label": "OrGroup",
                "name": "Software Tools and Laboratory",
                "courses": [
                    ("CSE 15L", "Software Tools and Techniques Laboratory"),
                    ("CSE 29", "Software Tools Laboratory"),
                ],
            },
        ],
    }

    _UPPER_DIVISION_PLAN = {
        "label": "Category",
        "name": "Upper Division",
        "courses": [
            ("MATH 109", "Mathematical Reasoning"),
            ("CSE 101", "Design and Analysis of Algorithms"),
            ("CSE 105", "Theory of Computability"),
        ],
        "children": [
            {
                "label": "SequenceRequirement",
                "name": "Modern/Applied Algebra Sequence Requirement",
                "children": [
                    {
                        "label": "OrGroup",
                        "name": "Modern Algebra Sequence",
                        "courses": [
                            ("MATH 103A", "Modern Algebra A"),
                            ("MATH 103B", "Modern Algebra B"),
                        ],
                    },
                    {
                        "label": "OrGroup",
                        "name": "Applied Algebra Sequence",
                        "courses": [
                            ("MATH 100A", "Applied Algebra A"),
                            ("MATH 100B", "Applied Algebra B"),
                        ],
                    },
                ],
            },
            {
                "label": "OrGroup",
                "name": "Probability",
                "courses": [
                    ("MATH 180A", "Intro to Probability A"),
                    ("MATH 183", "Applied Probability"),
                ],
            },
            {
                "label": "OrGroup",
                "name": "Discrete Math",
                "courses": [
                    ("MATH 154", "Discrete Structures"),
                    ("MATH 158", "Discrete Mathematics"),
                    ("MATH 184", "Combinatorics A"),
                    ("MATH 188", "Combinatorics B"),
                ],
            },
        ],
    }

    def __init__(self, uri: str, username: str, password: str):
        """Initialize connection to Neo4j database."""
        self.driver = GraphDatabase.driver(uri, auth=(username, password))

    def close(self):
        """Close the database connection."""
        self.driver.close()

    def add_course(self, code: str, name: str) -> None:
        """Add a new course to the database, or update the name of an existing one."""
        with self.driver.session() as session:
            session.execute_write(self._create_course, code, name)

    @staticmethod
    def _create_course(tx, code: str, name: str):
        """Merge the Course node identified by ``code`` and set its name."""
        query = """
        MERGE (c:Course {code: $code})
        SET c.name = $name
        RETURN c
        """
        result = tx.run(query, code=code, name=name)
        return result.single()

    def get_prerequisites(self, course_code: str, recursive: bool = True) -> List[Dict]:
        """
        Get prerequisites for a course, including ORGroup members. If recursive=True, fetch indirect prerequisites as well.

        Args:
            course_code (str): The course code to query.
            recursive (bool): Whether to fetch indirect prerequisites (default: True).

        Returns:
            List[Dict]: One dictionary per prerequisite group with the keys
            "ORGroup" (group name), "PrerequisiteCodes" (codes of the alternative
            courses in the group) and "PathNodes" (course codes from the queried
            course down to the course that owns the group).
        """
        with self.driver.session() as session:
            return session.execute_read(self._get_prerequisites, course_code, recursive)

    @staticmethod
    def _get_prerequisites(tx, course_code: str, recursive: bool) -> List[Dict]:
        """
        Helper method to retrieve prerequisites, including ORGroup members.

        REQUIRES edges only lead from a Course to an ORGroup, so a variable-length
        REQUIRES pattern can never reach indirect prerequisites. The chain is
        therefore walked breadth-first here, one course at a time.

        Args:
            tx: The Neo4j transaction object.
            course_code (str): The course code to query.
            recursive (bool): Whether to fetch indirect prerequisites.

        Returns:
            List[Dict]: A list of prerequisite groups as dictionaries (see get_prerequisites).
        """
        query = """
        MATCH (c:Course {code: $course_code})-[:REQUIRES]->(group:ORGroup)
        OPTIONAL MATCH (group)<-[:MEMBER_OF]-(prereq:Course)
        RETURN group.name AS ORGroup, collect(DISTINCT prereq.code) AS PrerequisiteCodes
        ORDER BY ORGroup
        """
        prerequisites = []
        visited = {course_code}  # guards against cycles in the prerequisite data
        pending = [(course_code, [course_code])]
        while pending:
            current_code, path = pending.pop(0)
            # Materialise the records before the next query is issued on this transaction
            for record in list(tx.run(query, course_code=current_code)):
                member_codes = list(record["PrerequisiteCodes"])
                prerequisites.append({
                    "ORGroup": record["ORGroup"],
                    "PrerequisiteCodes": member_codes,
                    "PathNodes": list(path),
                })
                if not recursive:
                    continue
                for member_code in member_codes:
                    if member_code not in visited:
                        visited.add(member_code)
                        pending.append((member_code, path + [member_code]))
        return prerequisites

    def get_available_courses(self, completed_courses: List[str]) -> List[Dict]:
        """Get courses where all prerequisites have been completed."""
        with self.driver.session() as session:
            return session.execute_read(self._get_available_courses, completed_courses)

    @staticmethod
    def _get_available_courses(tx, completed_courses: List[str]):
        """
        Return the not-yet-completed courses whose prerequisite groups are all satisfied.

        A group is satisfied when at least one of its member courses is in
        ``completed_courses``. Groups without any member course in the database
        cannot be checked and are ignored.

        Returns:
            list: Dictionaries with the keys "code" and "name".
        """
        query = """
        MATCH (c:Course)
        // Exclude courses that are already completed
        WHERE NOT c.code IN $completed_courses
        OPTIONAL MATCH (c)-[:REQUIRES]->(grp:ORGroup)
        OPTIONAL MATCH (grp)<-[:MEMBER_OF]-(member:Course)
        WITH c, grp, collect(member.code) AS member_codes
        WITH c, collect(
            grp IS NULL
            OR size(member_codes) = 0
            OR any(code IN member_codes WHERE code IN $completed_courses)
        ) AS group_checks
        // Every prerequisite group of the course must be satisfied
        WHERE all(ok IN group_checks WHERE ok)
        RETURN c.code as code, c.name as name
        """
        result = tx.run(query, completed_courses=completed_courses)
        return [dict(record) for record in result]

    def add_prerequisites(self, course_code: str, prereq_groups: List[List[str]]) -> None:
        """
        Add prerequisite relationships between courses.

        Args:
            course_code (str): Code of the course that has the prerequisites.
            prereq_groups (List[List[str]]): One inner list per "and" group; the
                codes inside an inner list are alternatives.
        """
        with self.driver.session() as session:
            session.execute_write(self._create_prerequisites, course_code, prereq_groups)

    @staticmethod
    def _create_prerequisites(tx, course_code: str, prereq_groups: List[List[str]]):
        """
        Creates prerequisite relationships with explicit ORGroup nodes.

        Prerequisite codes that do not match an existing Course node are skipped,
        so the prerequisite courses should be added before this is called.
        """
        for group_index, prereq_group in enumerate(prereq_groups):
            if not prereq_group:  # Skip empty prerequisite groups
                continue

            # Create an ORGroup node and attach it to the course
            or_group_name = f"{course_code}_ORGroup_{group_index}"
            create_or_group_query = """
            MERGE (group:ORGroup {name: $or_group_name})
            MERGE (course:Course {code: $course_code})
            MERGE (course)-[:REQUIRES]->(group)
            """
            tx.run(create_or_group_query, or_group_name=or_group_name, course_code=course_code)

            # Link each prerequisite to the ORGroup
            link_prereq_query = """
            MATCH (group:ORGroup {name: $or_group_name})
            MATCH (prereq:Course {code: $prereq_code})
            MERGE (prereq)-[:MEMBER_OF]->(group)
            """
            for prereq_code in prereq_group:
                tx.run(link_prereq_query, or_group_name=or_group_name, prereq_code=prereq_code)

    @staticmethod
    def _create_requirement_tree(tx, node, parent=None):
        """
        Merge one requirement node, its courses and (recursively) its children.

        Each call to ``tx.run`` sends exactly one Cypher statement, and only MERGE
        is used, so running the same plan again does not duplicate nodes or
        relationships. Existing course names are kept; a name is only filled in
        when the course has none.

        Args:
            tx: The Neo4j transaction object.
            node (dict): Plan node with "label" and "name" and optional "courses"
                and "children" entries.
            parent (tuple or None): (label, name) of the node that INCLUDES this one.
        """
        label, name = node["label"], node["name"]

        # Labels cannot be query parameters in Cypher; they are interpolated here and
        # come only from the plan constants of this class, never from user input.
        if parent is None:
            tx.run(f"MERGE (:{label} {{name: $name}})", name=name)
        else:
            parent_label, parent_name = parent
            tx.run(
                f"""
                MATCH (parent:{parent_label} {{name: $parent_name}})
                MERGE (child:{label} {{name: $name}})
                MERGE (parent)-[:INCLUDES]->(child)
                """,
                parent_name=parent_name,
                name=name,
            )

        for code, course_name in node.get("courses", ()):
            tx.run(
                f"""
                MATCH (parent:{label} {{name: $name}})
                MERGE (course:Course {{code: $code}})
                SET course.name = coalesce(course.name, $course_name)
                MERGE (parent)-[:INCLUDES]->(course)
                """,
                name=name,
                code=code,
                course_name=course_name,
            )

        for child in node.get("children", ()):
            CourseDatabase._create_requirement_tree(tx, child, parent=(label, name))

    def create_lower_division_category(self):
        """Create the 'Lower Division' category with its OR groups and courses."""
        with self.driver.session() as session:
            session.execute_write(self._create_requirement_tree, self._LOWER_DIVISION_PLAN)

        print("Lower Division courses created successfully!")

    def create_upper_division_courses(self):
        """Create the 'Upper Division' category with its requirements and courses."""
        with self.driver.session() as session:
            session.execute_write(self._create_requirement_tree, self._UPPER_DIVISION_PLAN)

    def create_upper_division_category(self):
        """Alias of create_upper_division_courses, named like create_lower_division_category."""
        self.create_upper_division_courses()
       




# Connection settings are read from Streamlit's secrets store rather than hard-coded
uri = st.secrets["NEO4J_URI"]
username = st.secrets["NEO4J_USERNAME"]
password = st.secrets["NEO4J_PASSWORD"]

# Shared database handle used by the cells below
db = CourseDatabase(uri, username, password)

#all_prerequisites = db.get_prerequisites_recursive("MATH 109")
#print(all_prerequisites)
#st.write(f"All prerequisites for {course_id}: {all_prerequisites}")           




[]

## Testing the `get_prerequisites` method

## Getting Course Prerequisites

In [None]:
course_code = "MATH 154"
prerequisites = db.get_prerequisites(course_code, recursive=True)

# Show the full result first, then only the alternative course codes of each group
print(f"Prerequisites for {course_code}:")
print(prerequisites)
for prereq in prerequisites:
    print(prereq['PrerequisiteCodes'])

## Creating Categories for MATH CS major

In [None]:
# Writes the 'Lower Division' category, its OR groups and their courses to Neo4j
db.create_lower_division_category()


In [None]:
# Writes the 'Upper Division' category and its requirements to Neo4j.
# The method defined on CourseDatabase is create_upper_division_courses.
db.create_upper_division_courses()

## Populating the Neo4j Database

In [None]:

                
source_directory = "data/processed/"

# Load every processed catalogue once so both passes below work on the same data
processed_files = sorted(
    os.path.join(source_directory, file_name)
    for file_name in os.listdir(source_directory)
    if file_name.endswith('.csv')
)
catalogues = {file_path: pd.read_csv(file_path) for file_path in processed_files}

# Pass 1: add the courses of ALL files first. Prerequisites are linked with a MATCH
# on existing Course nodes, so a prerequisite from a file that has not been loaded
# yet would otherwise be skipped silently.
for file_path, data in catalogues.items():
    print(f"Processing: {file_path}")
    for code, name in zip(data['Course_Index'], data['Course_Title']):
        # Empty titles come back from the CSV as NaN; store them as a missing name
        db.add_course(code, name if pd.notna(name) else None)

# Pass 2: add prerequisites as groups
for file_path, data in catalogues.items():
    for code, prereq in zip(data['Course_Index'], data['Course_Prerequisites']):
        if pd.isna(prereq):  # Ensure the prerequisites field is not NaN
            continue
        try:
            prereq_groups = ast.literal_eval(prereq)  # Parse the prerequisite string
        except (ValueError, SyntaxError) as e:
            print(f"Error parsing prerequisites for {code}: {prereq} - {e}")
            continue
        # Kept outside the try block so database errors are not reported as parse errors
        db.add_prerequisites(code, prereq_groups)

    print(f"Completed processing: {file_path}")


In [None]:
# print("Available courses (completed: CS101, MATH101):")
# available = db.get_available_courses(["CS101", "MATH101"])
# for course in available:
#     print(f"- {course['code']}: {course['name']}")

In [None]:
# Close the Neo4j driver held by the database wrapper
db.close()