In [1]:
import pandas as pd
import random
import numpy as np
from datetime import datetime

In [2]:
# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

In [3]:
# Define course data based on the provided curriculum
cs_ai_courses = {
    1: {
        2: ["ECE10002", "ECE10005", "ECE10020"]
    },
    2: {
        1: ["ECE20010", "ECE20016", "ECE20025", "ECE20057", "ECE20064", "ECE20065"],
        2: ["ECE20006", "ECE20009", "ECE20021", "ECE20022", "ECE20026", "ECE20027", "ECE20042", "ECE20063"]
    },
    3: {
        1: ["ECE30002", "ECE30008", "ECE30011", "ECE30012", "ECE30021", "ECE30030", "ECE30039", "ECE30051", "ECE30070"],
        2: ["ECE30003", "ECE30006", "ECE30018", "ECE30040", "ECE30078", "ECE30086", "ECE30087"]
    },
    4: {
        1: ["ECE40010", "ECE40012", "ECE40027", "ECE40035", "ECE40042", "ECE40066", "ECE40097"],
        2: ["ECE40013", "ECE40014", "ECE40027", "ECE40044", "ECE40049", "ECE40052", "ECE40087", "ECE40097"]
    }
}

ee_courses = {
     1: {
        2: ["ECE10002", "ECE10005", "ECE10020"]
    },
    2: {
        1: ["ECE20010", "ECE20016", "ECE20057", "ECE20064", "ECE20065"],
        2: ["ECE20006", "ECE20010", "ECE20016", "ECE20021", "ECE20022", "ECE20053", "ECE20061", "ECE20063", "ECE20066"]
    },
    3: {
        1: ["ECE30021", "ECE30039", "ECE30051", "ECE30054", "ECE30063", "ECE30070", "ECE30083"],
        2: ["ECE30003", "ECE30018", "ECE30039", "ECE30040", "ECE30052", "ECE30055", "ECE30078", "ECE30086", "ECE30087"]
    },
    4: {
        1: ["ECE40008", "ECE40027", "ECE40066", "ECE40081", "ECE40082", "ECE40097"],
        2: ["ECE40013", "ECE40027", "ECE40052", "ECE40065", "ECE40087", "ECE40097"]
    }
}

# Course names mapping (simplified for dataset)
course_names = {
    "ECE10002": "C-Programming",
    "ECE10005": "Coding Studio",
    "ECE10020": "Introduction to Engineering Design",
    "ECE20010": "Data Structure",
    "ECE20016": "Introduction to JAVA Programming",
    "ECE20025": "Programming Studio",
    "ECE20057": "Logic Design",
    "ECE20064": "Circuit Theory",
    "ECE20065": "Basic Circuit and Logic Laboratory",
    "ECE20006": "Signal and System",
    "ECE20009": "Web Service Development",
    "ECE20021": "Computer Architecture and Organization",
    "ECE20022": "Computer Vision",
    "ECE20026": "Open Source Studio",
    "ECE20027": "System Programming",
    "ECE20042": "Discrete Mathematics",
    "ECE20063": "Digital System Design",
    "ECE30002": "Mobile App Development",
    "ECE30008": "Introduction to AI",
    "ECE30011": "Algorithm Analysis",
    "ECE30012": "Object-Oriented Design Pattern",
    "ECE30021": "Operating System",
    "ECE30030": "Database",
    "ECE30039": "Vocation and Career Planning",
    "ECE30051": "Electronic Circuits 1",
    "ECE30070": "Microprocessor Application",
    "ECE30003": "IoT System Design",
    "ECE30006": "Programming Language Theory",
    "ECE30018": "Problem Solving Studio",
    "ECE30040": "IT Startup Practicum",
    "ECE30078": "Intelligent Robot Control",
    "ECE30086": "Computer Networks",
    "ECE30087": "Probability and Random Processes",
    "ECE40010": "Software Engineering",
    "ECE40012": "Compiler Theory",
    "ECE40027": "Post-capstone Research",
    "ECE40035": "Introduction to Deep Learning",
    "ECE40042": "Computer Graphics",
    "ECE40066": "IoT Laboratories",
    "ECE40097": "Special Topic 1",
    "ECE40013": "Intelligent Signal Processing",
    "ECE40014": "Applied Project Studio",
    "ECE40044": "Computer Security",
    "ECE40049": "Deep Learning for Image Processing",
    "ECE40052": "Integrated Circuit Design",
    "ECE40087": "Machine Learning",
    "ECE20053": "Basic Circuit Theory 2",
    "ECE20061": "Electromagnetic",
    "ECE20066": "Basic Signal Processing Laboratory",
    "ECE30054": "Practicing the Electronic Circuits and Communication System",
    "ECE30063": "Semiconductor Physics",
    "ECE30083": "Principles of Communication",
    "ECE30052": "Electronic Circuits 2",
    "ECE30055": "Circuit Design and Fabrication",
    "ECE40008": "RF Circuit Design",
    "ECE40081": "Digital Communication",
    "ECE40082": "Digital Signal Processing",
    "ECE40065": "Semiconductor Processing and Nano Technology"
}

# Required major courses for CS/AI track
cs_ai_required = ["ECE10020", "ECE20010", "ECE20021", "ECE20026", "ECE30008", "ECE30021", ]

# Required major courses for EE track
ee_required = ["ECE10020", "ECE20065", "ECE20066"]

# Lab interest keywords and research areas
lab_fields = [
    "Artificial Intelligence", "Deep Learning", "Signal Processing", "Human-Computer Interaction",
    "Computer Vision", "Software Engineering", "Security", "Computer Networks", "Mobile Computing",
    "Multimedia Systems", "Embedded Systems", "Integrated Circuits", "Wireless Communication",
    "Sensor Technology", "Robotics", "Mobile Development", "Game Development", "Web Development",
    "Autonomous Systems", "Image Processing", "Medical Imaging"
]

majors = [
    "AI - Computer Science and Engineering",
    "Computer Science and Electrical Engineering", 
    "Computer Science",
    "Electrical Engineering"
]

In [4]:
def generate_student_id(enrollment_year, order):
    return f"2{enrollment_year:02d}{order:05d}"

In [5]:



def generate_courses_for_student(major, semester):
    """Generate realistic course progression for a student"""
    courses_taken = []
    courses_gpa = []
    current_courses = []
    required_completed = []
    credits = 0

    # Determine which course set and required list to use
    if "AI" in major or "Computer Science" in major:
        course_set = cs_ai_courses
        required_set = cs_ai_required
    else:
        course_set = ee_courses
        required_set = ee_required

    # Generate courses for completed semesters
    for year in range(1, (semester // 2) + 2):
        if year > 4:
            break
        for sem in [1, 2]:
            current_sem = (year - 1) * 2 + sem
            if current_sem >= semester:
                continue
            if year in course_set and sem in course_set[year]:
                available = course_set[year][sem]
                num_courses = random.randint(2, min(4, len(available)))
                selected = random.sample(available, num_courses)
                for course in selected:
                    courses_taken.append(course)
                    base_gpa = random.uniform(2.5, 4.5)
                    courses_gpa.append(round(base_gpa, 2))
                    credits += 3
                    if course in required_set:
                        required_completed.append(course)

    # Generate current semester courses
    current_year = ((semester - 1) // 2) + 1
    current_sem_in_year = ((semester - 1) % 2) + 1

    if current_year in course_set and current_sem_in_year in course_set[current_year]:
        available = [c for c in course_set[current_year][current_sem_in_year] if c not in courses_taken]
        if available:
            num_current = random.randint(2, min(4, len(available)))
            current_courses = random.sample(available, num_current)

    # Fallback if no current_courses available
    if not current_courses:
        for future_year in range(current_year, 5):
            for future_sem in [1, 2]:
                if future_year in course_set and future_sem in course_set[future_year]:
                    pool = [c for c in course_set[future_year][future_sem] if c not in courses_taken]
                    if pool:
                        current_courses = random.sample(pool, min(2, len(pool)))
                        break
            if current_courses:
                break

    return courses_taken, courses_gpa, current_courses, required_completed, credits


In [6]:

def generate_student_data():
    students = []
    
    for i in range(1000):
        # Generate enrollment year (2019-2024)
        enrollment_year = random.randint(19, 24)
        student_id = generate_student_id(enrollment_year, i + 1)
        
        # Generate name (using common Korean surnames + English first names)
        surnames = [
    "Kim", "Lee", "Park", "Choi", "Jung", "Kang", "Cho", "Yoon", "Jang", "Lim",
    "Shin", "Han", "Oh", "Seo", "Hwang", "Ahn", "Song", "Jeon", "Hong", "Yang",
    "Kwon", "Baek", "Nam", "Joo", "Ha", "Ryu", "No", "Gu", "Byun", "Seok"
]
        first_names = [
    "Minho", "Jisoo", "Hyunwoo", "Soyeon", "Jihoon", "Yuna", "Seungmin", "Chaeyoung", 
    "Taehyun", "Nayeon", "Jaehyun", "Dahyun", "Sunghoon", "Minji", "Wonwoo",
    "Eunji", "Jiwon", "Gyuri", "Youngjae", "Sungmin", "Haeun", "Jinwoo", "Harin", 
    "Myungsoo", "Seojin", "Heejin", "Kyungsoo", "Nari", "Siwoo", "Areum", "Yejin", 
    "Hoseok", "Yeonjun", "Soobin", "Hana", "Mingyu", "Suhyun", "Jinyoung", "Haejin"
]
        name = f"{random.choice(surnames)} {random.choice(first_names)}"
        
        # Generate major
        major = random.choice(majors)
        
        # Generate semester (1-6 only, no capstone students)
        semester = random.randint(2, 6)
        
        # Generate courses
        courses_taken, courses_gpa, current_courses, required_completed, credits = generate_courses_for_student(major, semester)
        
        # Generate overall GPA (weighted average of course GPAs)
        if courses_gpa:
            overall_gpa = round(sum(courses_gpa) / len(courses_gpa) + random.uniform(-0.3, 0.3), 2)
            overall_gpa = max(2.0, min(4.5, overall_gpa))
        else:
            overall_gpa = round(random.uniform(2.5, 4.0), 2)
        
        # Generate lab interests (2-4 keywords)
        num_interests = random.randint(2, 4)
        lab_interests = random.sample(lab_fields, num_interests)
        
        # Generate preferred research area (single field)
        preferred_research = random.choice(lab_fields)
        
        # Generate other characteristics
        prior_experience = random.choice(["yes", "no"])
        study_hours = random.randint(5, 40)
        attendance = random.randint(75, 100)
        assignment_completion = random.randint(75, 100)
        participation = random.randint(75, 100)
        sleep_hours = random.randint(3, 9)
        stress_level = random.randint(1, 10)
        
        # Create student record
        student = {
            "student_id": student_id,
            "name": name,
            "major": major,
            "semester": semester,
            "GPA": overall_gpa,
            "courses_taken": ";".join(courses_taken),
            "courses_gpa": ";".join(map(str, courses_gpa)),
            "required_major_courses_completed": ";".join(required_completed),
            "credits_completed": credits,
            "current_courses": ";".join(current_courses),
            "lab_interest_keywords": ";".join(lab_interests),
            "preferred_research_area": preferred_research,
            "prior_project_experience": prior_experience,
            "weekly_study_hours": study_hours,
            "attendance_rate": f"{attendance}%",
            "sleeps_hours_per_day": sleep_hours,
            "stress_level": stress_level,
            "assignment_completion_rate": f"{assignment_completion}%",
            "class_participation": f"{participation}%"
        }
        
        students.append(student)
    
    return students

In [9]:

# Generate the dataset
# print("Generating 1,000 student profiles...")
student_data = generate_student_data()

# Create DataFrame
df = pd.DataFrame(student_data)

# Save to CSV
filename = "../../data/processed/handong_smart_academic_advisor_students.csv"
df.to_csv(filename, index=False, encoding='utf-8')

In [12]:
df.tail()

Unnamed: 0,student_id,name,major,semester,GPA,courses_taken,courses_gpa,required_major_courses_completed,credits_completed,current_courses,lab_interest_keywords,preferred_research_area,prior_project_experience,weekly_study_hours,attendance_rate,sleeps_hours_per_day,stress_level,assignment_completion_rate,class_participation
995,21900996,Choi Sunghoon,Computer Science,4,4.18,ECE10020;ECE10002;ECE20065;ECE20016,4.28;4.41;3.71;3.73,ECE10020,12,ECE20006;ECE20009;ECE20063;ECE20042,Mobile Development;Computer Vision;Artificial ...,Sensor Technology,yes,27,100%,6,6,75%,86%
996,22400997,Jung Chaeyoung,Computer Science,6,3.71,ECE10005;ECE10002;ECE10020;ECE20057;ECE20065;E...,2.59;3.83;3.35;3.38;3.29;4.34;3.37;3.73;2.77;3...,ECE10020;ECE30008,39,ECE30086;ECE30006,Human-Computer Interaction;Wireless Communicat...,Autonomous Systems,no,40,95%,8,5,84%,90%
997,21900998,Choi Mingyu,AI - Computer Science and Engineering,5,3.18,ECE10020;ECE10002;ECE10005;ECE20016;ECE20025;E...,2.54;4.0;4.42;3.28;2.67;2.88;3.3;2.74;4.11,ECE10020;ECE20026,27,ECE30051;ECE30070;ECE30039,Security;Signal Processing;Computer Networks,Medical Imaging,yes,22,97%,6,1,78%,78%
998,22000999,No Areum,Computer Science and Electrical Engineering,6,3.5,ECE10005;ECE10020;ECE20064;ECE20057;ECE20065;E...,3.64;2.99;3.45;3.5;4.02;2.87;2.73;2.57;3.3;4.3...,ECE10020;ECE20010;ECE20026,33,ECE30006;ECE30078;ECE30087;ECE30040,Integrated Circuits;Software Engineering;Compu...,Security,no,29,86%,8,6,86%,77%
999,21901000,Hong Nayeon,Electrical Engineering,4,3.61,ECE10002;ECE10020;ECE10005;ECE20016;ECE20057;E...,3.47;3.53;3.54;4.4;3.89;2.6;2.73,ECE10020;ECE20065,21,ECE20006;ECE20066,Web Development;Artificial Intelligence;Softwa...,Human-Computer Interaction,yes,26,93%,4,6,86%,83%
