# Data Collection and Preparation

## For the purpose of this assessment, I am creating a synthetic database

In [33]:
import random
import pandas as pd

# Function to build a random student ID made of five decimal digits
def generate_student_id():
    """Return a random 5-character string of decimal digits.

    Each position is drawn independently, so leading zeros can occur and
    two calls may return the same ID.
    """
    digit_pool = '0123456789'
    picked_digits = random.choices(digit_pool, k=5)
    return ''.join(picked_digits)

# Function to build a random 5-character password from letters and digits
def generate_password():
    """Return a random 5-character alphanumeric string.

    Characters are drawn with replacement from a-z, A-Z and 0-9 using the
    ``random`` module, which is not cryptographically secure; the result is
    only suitable as placeholder data.
    """
    lowercase = 'abcdefghijklmnopqrstuvwxyz'
    uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'
    alphabet = lowercase + uppercase + digits
    picked = random.choices(alphabet, k=5)
    return ''.join(picked)

# Function to pick a random gender plus a matching first name and a last name
def generate_name_and_gender():
    """Return a random ``(first_name, last_name, gender)`` tuple.

    The gender is chosen first; the first name is then drawn from the list
    for that gender, and the last name from a shared list. The returned
    gender is capitalized, i.e. ``'Male'`` or ``'Female'``.
    """
    first_names_by_gender = {
        'male': [
            "Juan", "Jose", "Antonio", "Ricardo", "Miguel",
            "Fernando", "Carlos", "Ismael", "Roberto", "Alberto",
        ],
        'female': [
            "Maria", "Carmen", "Rosa", "Sofia", "Luisa",
            "Isabela", "Angelica", "Jessica", "Alicia", "Veronica",
        ],
    }
    surnames = [
        "Garcia", "Reyes", "Cruz", "Santos", "Rivera", "Gonzalez",
        "Martinez", "Ramos", "Fernandez", "Torres", "Flores", "Morales",
    ]

    # Draw order (gender, first name, last name) is kept so that a seeded
    # run produces the same people.
    gender = random.choice(['male', 'female'])
    first_name = random.choice(first_names_by_gender[gender])
    last_name = random.choice(surnames)

    return first_name, last_name, gender.capitalize()

# Function to pick a random age from 18 to 35, both ends included
def generate_age():
    """Return a random integer age in the inclusive range 18..35."""
    youngest, oldest = 18, 35
    return random.randint(youngest, oldest)

# Function to pick a random Southeast Asian country as the student's nationality
def generate_nationality():
    """Return one of ten Southeast Asian country names, chosen uniformly.

    The value is a country name (e.g. ``"Philippines"``), not a demonym.
    """
    southeast_asian_countries = (
        "Philippines", "Malaysia", "Indonesia", "Thailand", "Vietnam",
        "Singapore", "Myanmar", "Cambodia", "Laos", "Brunei",
    )
    return random.choice(southeast_asian_countries)

# Function to generate a list of completed units with corresponding semesters and credits
def generate_completed_units(course_structure, gender):
    """Build a random list of completed units for one synthetic student.

    Between one and three distinct semesters (from 1-4) are sampled. With a
    20% chance the student first receives both ``BCO7000`` and ``BCO7006``.
    Then, for each sampled semester, one unit offered in that semester is
    drawn and added unless it is already present. Finally, every female
    student who does not yet have ``WOM1000`` receives it.

    Args:
        course_structure: DataFrame with at least the ``SEMESTER`` and
            ``UNIT_CODE`` columns.
        gender: Gender label of the student. Only the exact value
            ``'Female'`` makes the student eligible for ``WOM1000``.

    Returns:
        A list of dicts with the keys ``unit_code``, ``semester_completed``
        and ``credits_earned`` (always 12). Unit codes are unique within the
        list; the list may be empty for a non-female student.
    """
    female_only_unit = 'WOM1000'
    credits_per_unit = 12
    is_female = gender == 'Female'

    completed_units = []
    seen_codes = set()  # O(1) duplicate check instead of rebuilding a list

    def record(unit_code, semester):
        seen_codes.add(unit_code)
        completed_units.append({
            'unit_code': unit_code,
            'semester_completed': semester,
            'credits_earned': credits_per_unit
        })

    semesters = random.sample(range(1, 5), random.randint(1, 3))

    # Ensure some students have completed BCO7000 and BCO7006
    if random.random() < 0.2:  # 20% chance to have both units completed
        for core_code in ('BCO7000', 'BCO7006'):
            record(core_code, random.choice(semesters))

    for semester in semesters:
        in_semester = course_structure['SEMESTER'] == semester
        candidates = course_structure.loc[in_semester, 'UNIT_CODE'].tolist()
        if not is_female:
            # Without this filter the random pick could give WOM1000 to a
            # non-female student whenever the unit is listed for a semester.
            candidates = [code for code in candidates if code != female_only_unit]
        if not candidates:
            continue
        unit_code = random.choice(candidates)
        # Prevent adding duplicate units
        if unit_code not in seen_codes:
            record(unit_code, semester)

    # Every female student has completed WOM1000
    if is_female and female_only_unit not in seen_codes:
        record(female_only_unit, random.choice(semesters))

    return completed_units

# Load the course structure from the provided CSV file
course_structure = pd.read_csv('https://raw.githubusercontent.com/ifiecas/courseplan/main/bmbu-2024_ver7.csv')

# Map each unit code to its title once, instead of filtering the DataFrame for
# every completed unit. drop_duplicates keeps the first row per code, which is
# the same row a per-unit `.values[0]` lookup would select.
unit_titles = (
    course_structure.drop_duplicates('UNIT_CODE')
    .set_index('UNIT_CODE')['UNIT_TITLE']
    .to_dict()
)

# Generate a synthetic dataset of 500 students
students = []
used_student_ids = set()
for _ in range(500):
    first_name, last_name, gender = generate_name_and_gender()
    completed_units = generate_completed_units(course_structure, gender)

    # st_id identifies a student, but only 100,000 five-digit IDs exist, so
    # collisions among 500 draws are likely; redraw until the ID is unused.
    student_id = generate_student_id()
    while student_id in used_student_ids:
        student_id = generate_student_id()
    used_student_ids.add(student_id)

    # Prepare student data with completed units in separate columns
    student = {
        'st_id': student_id,
        'password': generate_password(),  # Generate a password for each student
        'first_name': first_name,
        'last_name': last_name,
        'age': generate_age(),
        'gender': gender,
        'nationality': generate_nationality()
    }

    # Add completed units to the student dictionary
    for i, unit in enumerate(completed_units, start=1):
        student[f'CompletedUnit{i}_Code'] = unit['unit_code']
        # A unit code missing from the course structure yields a None title
        # rather than aborting the whole generation with an IndexError.
        student[f'CompletedUnit{i}_Title'] = unit_titles.get(unit['unit_code'])
        student[f'CompletedUnit{i}_Semester'] = unit['semester_completed']
        student[f'CompletedUnit{i}_Credits'] = unit['credits_earned']

    students.append(student)

# Convert the generated students into a DataFrame for easy visualization and export
students_df = pd.DataFrame(students)

# Convert every semester column to the nullable integer dtype so that students
# with fewer units get <NA> instead of turning the column into floats. The
# columns are discovered by name, so the first column and any column beyond
# the fifth are covered as well.
semester_cols = [col for col in students_df.columns if col.endswith('_Semester')]
for semester_col in semester_cols:
    students_df[semester_col] = students_df[semester_col].astype('Int64')




File has been saved to student_dataset16.csv
