# Data Simulation – Educational Attendance Monitoring System

## Objective

This notebook simulates a structured relational database for an educational institution 
with multiple branches.

The dataset was created to support a Power BI dashboard focused on:

- Weekly absenteeism monitoring
- Dropout risk classification
- Operational follow-up by branch managers

All data is fully simulated using Python.
No real student data was used.

---

## Business Context

Students attend classes twice per week.
Attendance is monitored to identify:

- Weekly absenteeism
- Accumulated absences in the last 30 days
- Risk of dropout

Two behavioral profiles were simulated:
- Regular students (about 65% of enrollments; 92% probability of attending each class)
- At-risk students (about 35% of enrollments; 35% probability of attending each class)


In [61]:
# 1. IMPORTS & CONFIGURATION 

import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import os

# Reproducibility: every random source used below is seeded with the same value
SEED = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)
fake = Faker()
Faker.seed(SEED)

# Project paths: raw CSVs are written to <parent of the working directory>/data/raw
BASE_PATH = os.path.split(os.getcwd())[0]
RAW_PATH = os.path.join(BASE_PATH, "data", "raw")
os.makedirs(RAW_PATH, exist_ok=True)



In [62]:
# 2. GENERATE CLASSES TABLE

branches = ['Coral Gables', 'Downtown', 'Kendall', 'Miami Beach', 'Doral']
levels = ['Beginner', 'Intermediate', 'Advanced']
shifts = ['Morning', 'Evening']

# Pool of eight generated teacher names; each class picks one at random below.
teachers = [fake.name() for _ in range(8)]

# One manager per branch, copied onto every class row of that branch.
branch_managers = {
    'Coral Gables': 'Maxwell Wall',
    'Downtown': 'Mateo Decker',
    'Kendall': 'Ayan Mays',
    'Miami Beach': 'David Cordova',
    'Doral': 'Celeste Phillips'
}

# Every (branch, level) pair gets two classes: slot 0 -> Morning, slot 1 -> Evening.
class_slots = [
    (branch_name, level_name, slot)
    for branch_name in branches
    for level_name in levels
    for slot in range(2)
]

# Class codes are sequential across all branches: T01, T02, ...
classes = [
    {
        'Class Code': f'T{class_number:02}',
        'Branch': branch_name,
        'Teacher Full Name': random.choice(teachers),
        'Class Shift': shifts[slot % len(shifts)],
        'Course Level': level_name,
        'Branch Manager': branch_managers[branch_name],
    }
    for class_number, (branch_name, level_name, slot) in enumerate(class_slots, start=1)
]

classes_df = pd.DataFrame(classes)

print("Classes:", len(classes_df))
classes_df.head()


Classes: 30


Unnamed: 0,Class Code,Branch,Teacher Full Name,Class Shift,Course Level,Branch Manager
0,T01,Coral Gables,Noah Rhodes,Morning,Beginner,Maxwell Wall
1,T02,Coral Gables,Allison Hill,Evening,Beginner,Maxwell Wall
2,T03,Coral Gables,Cristian Santos,Morning,Intermediate,Maxwell Wall
3,T04,Coral Gables,Daniel Wagner,Evening,Intermediate,Maxwell Wall
4,T05,Coral Gables,Daniel Wagner,Morning,Advanced,Maxwell Wall


In [63]:
# 3. GENERATE STUDENTS TABLE

# Fixed anchor for enrollment dates. Using datetime.today() here would make the
# 'Enrollment Date' column depend on the day the notebook is run, so the seeded
# random generators alone would not reproduce the table. 2026-02-01 is the same
# anchor date the attendance simulation uses for its class calendar.
ENROLLMENT_REFERENCE_DATE = datetime(2026, 2, 1)

students = []
student_id = 1

for class_item in classes:

    # Class size: between 10 and 15 students (inclusive)
    num_students = random.randint(10, 15)

    for _ in range(num_students):

        # Enrolled somewhere in the 365 days up to the reference date
        enrollment_date = ENROLLMENT_REFERENCE_DATE - timedelta(days=random.randint(0, 365))

        # 95% of students are active, 5% inactive
        student_status = random.choices(
            ['Active', 'Inactive'],
            weights=[0.95, 0.05]
        )[0]

        # Branch, level and class code are inherited from the class the
        # student is placed in.
        students.append({
            'Student ID': student_id,
            'Student Full Name': fake.name(),
            'Branch': class_item['Branch'],
            'Course Level': class_item['Course Level'],
            'Enrollment Date': enrollment_date.date(),
            'Student Status': student_status,
            'Class Code': class_item['Class Code']
        })

        student_id += 1

students_df = pd.DataFrame(students)

print("Students:", len(students_df))
students_df.head()


Students: 372


Unnamed: 0,Student ID,Student Full Name,Branch,Course Level,Enrollment Date,Student Status,Class Code
0,1,Gabrielle Davis,Coral Gables,Beginner,2025-12-25,Active,T01
1,2,Ryan Munoz,Coral Gables,Beginner,2025-08-20,Active,T01
2,3,Monica Herrera,Coral Gables,Beginner,2026-01-21,Active,T01
3,4,Jamie Arnold,Coral Gables,Beginner,2025-05-14,Active,T01
4,5,Lisa Hensley,Coral Gables,Beginner,2025-08-03,Active,T01


In [64]:
# 4. GENERATE ENROLLMENTS TABLE

def generate_enrollment_status(student_status):
    """Draw a random enrollment status for a student.

    Parameters
    ----------
    student_status : str
        The student's status; only the value 'Inactive' is treated specially.

    Returns
    -------
    str
        For 'Inactive' students, 'Dropped' or 'Completed' with equal
        probability. For any other status, 'Active', 'Dropped' or 'Completed'
        drawn with weights 0.8 / 0.1 / 0.1.
    """
    if student_status != 'Inactive':
        drawn = random.choices(
            ['Active', 'Dropped', 'Completed'],
            weights=[0.8, 0.1, 0.1],
        )
        return drawn[0]

    # An inactive student can no longer hold an active enrollment.
    return random.choice(['Dropped', 'Completed'])

# Sequential enrollment identifiers: E0001, E0002, ... (one per student row)
enrollment_ids = [f'E{seq:04d}' for seq in range(1, len(students_df) + 1)]

# One status draw per student, in row order, conditioned on the student's status
enrollment_statuses = [
    generate_enrollment_status(status)
    for status in students_df['Student Status']
]

# Each student has exactly one enrollment, in the class they were placed in
enrollments_df = pd.DataFrame(
    {
        'Enrollment ID': enrollment_ids,
        'Student ID': students_df['Student ID'],
        'Class Code': students_df['Class Code'],
        'Enrollment Status': enrollment_statuses,
    }
)

print("Enrollments:", len(enrollments_df))
enrollments_df.head()



Enrollments: 372


Unnamed: 0,Enrollment ID,Student ID,Class Code,Enrollment Status
0,E0001,1,T01,Active
1,E0002,2,T01,Active
2,E0003,3,T01,Active
3,E0004,4,T01,Active
4,E0005,5,T01,Active


In [65]:
# 5. GENERATE ATTENDANCE TABLE

NUM_WEEKS = 12
today = datetime(2026, 2, 1)

# Class calendar: the Monday and Wednesday of each of the NUM_WEEKS weeks,
# starting with the week that contains `today` and walking backwards.
class_dates = []
for weeks_back in range(NUM_WEEKS):
    anchor_day = today - timedelta(weeks=weeks_back)
    week_monday = anchor_day - timedelta(days=anchor_day.weekday())
    week_wednesday = week_monday + timedelta(days=2)
    class_dates.extend([week_monday.date(), week_wednesday.date()])

# NOTE(review): a row is generated for every enrollment on every class date,
# regardless of 'Enrollment Status' or the student's 'Enrollment Date' --
# confirm the dashboard filters these out if that matters.
attendance = []
for enrollment_id in enrollments_df['Enrollment ID']:
    # Each enrollment gets one behavioral profile for the whole period
    risk_profile = np.random.choice(['Regular', 'At Risk'], p=[0.65, 0.35])

    # Per-class attendance probability for that profile
    attendance_prob = 0.92 if risk_profile == 'Regular' else 0.35
    flag_probs = [attendance_prob, 1 - attendance_prob]

    # One independent present (1) / absent (0) draw per class date
    attendance.extend(
        {
            'Enrollment ID': enrollment_id,
            'Class Date': class_date,
            'Attendance Flag': np.random.choice([1, 0], p=flag_probs),
        }
        for class_date in class_dates
    )

attendance_df = pd.DataFrame(attendance)


In [66]:
# 6. EXPORT RAW DATA

# File name -> table, written in this order into RAW_PATH.
raw_tables = {
    "students_raw.csv": students_df,
    "classes_raw.csv": classes_df,
    "enrollments_raw.csv": enrollments_df,
    "attendance_raw.csv": attendance_df,
}

for file_name, table in raw_tables.items():
    # utf-8-sig writes a BOM at the start of each file; the index is not exported
    table.to_csv(os.path.join(RAW_PATH, file_name), index=False, encoding="utf-8-sig")

print("Raw data exported successfully.")


Raw data exported successfully.
