In [4]:
!pip install faker pandas




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [36]:
import csv
import pandas as pd
from faker import Faker
import random
import numpy as np
#import uuid

In [40]:
# Seed both random sources. Faker.seed() only seeds Faker's own generator; the
# random.choice()/random.random()/random.uniform() calls in this notebook draw
# from Python's global `random` module, which needs its own seed for the
# output to be repeatable.
# Note: dates are generated relative to today, so they still shift from day to day.
SEED = 42
fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

# Parameters
NUM_STUDENTS = 100
NUM_COURSES = 10
NUM_ASSIGNMENTS = 50
NUM_SUBMISSIONS = 500
NUM_ENROLLMENTS = 300  # To simulate many-to-many student-course relationships

# 1. Students
# Sequential IDs (S1000, S1001, ...) with fake contact details and an
# enrollment date drawn from the last two years.
students = []
for student_index in range(NUM_STUDENTS):
    full_name = fake.name()
    email_address = fake.email()
    enrolled_on = fake.date_between(start_date='-2y', end_date='today')
    students.append({
        "StudentID": f"S{1000 + student_index}",
        "Name": full_name,
        "Email": email_address,
        "EnrollmentDate": enrolled_on.isoformat(),
    })

# 2. Courses
# Sequential IDs (C100, C101, ...); each course is assigned one of three
# semesters at random.
semester_options = ["Fall 2024", "Spring 2025", "Summer 2025"]
courses = []
for course_index in range(NUM_COURSES):
    course_title = fake.catch_phrase()
    instructor_name = fake.name()
    courses.append({
        "CourseID": f"C{100 + course_index}",
        "CourseName": course_title,
        "Instructor": instructor_name,
        "Semester": random.choice(semester_options),
    })

# 3. Assignments
# Each assignment belongs to one randomly chosen course.
# Faker's relative-date strings use lowercase 'm' for minutes (months are
# uppercase 'M'), so '-3m'/'+3m' collapsed the due-date window to about a day.
# Day offsets are unambiguous: roughly three months either side of today.
assignments = [{
    "AssignmentID": f"A{100 + i}",
    "CourseID": random.choice(courses)["CourseID"],
    "Title": fake.sentence(nb_words=4),
    "DueDate": fake.date_between(start_date='-90d', end_date='+90d').isoformat()
} for i in range(NUM_ASSIGNMENTS)]

# 4. Enrollments
# Each (student, course) pair may appear at most once, so the number of
# distinct enrollments is capped at len(students) * len(courses). Without this
# check the rejection-sampling loop below would never terminate when the
# requested count exceeds that cap.
max_enrollments = len(students) * len(courses)
if NUM_ENROLLMENTS > max_enrollments:
    raise ValueError(
        f"NUM_ENROLLMENTS={NUM_ENROLLMENTS} exceeds the {max_enrollments} "
        "distinct student/course pairs available"
    )

enrollments = []
enrollment_pairs = set()  # (StudentID, CourseID) pairs already emitted
while len(enrollments) < NUM_ENROLLMENTS:
    student = random.choice(students)
    course = random.choice(courses)
    key = (student["StudentID"], course["CourseID"])
    if key not in enrollment_pairs:
        enrollment_pairs.add(key)
        enrollments.append({
            # IDs are assigned in insertion order: E1000, E1001, ...
            "EnrollmentID": f"E{1000 + len(enrollments)}",
            "StudentID": student["StudentID"],
            "CourseID": course["CourseID"]
        })

# 5. Submissions
# Faker's relative-date strings use lowercase 'm' for minutes (months are
# uppercase 'M'), so '-2m' limited submission dates to about the last day.
# '-60d' gives the intended window of roughly two months.
# NOTE(review): students are picked independently of `enrollments`, so a
# submission may reference a student who is not enrolled in the assignment's
# course -- confirm this is intended.
submissions = [{
    "SubmissionID": f"SUB{1000 + i}",
    "AssignmentID": random.choice(assignments)["AssignmentID"],
    "StudentID": random.choice(students)["StudentID"],
    "SubmissionDate": fake.date_between(start_date='-60d', end_date='today').isoformat(),
    # One submission in six is ungraded; the rest get a grade in [60, 100].
    # Same odds as picking from [None] plus five candidates, without drawing
    # five throwaway floats per row.
    "Grade": None if random.random() < 1 / 6 else round(random.uniform(60, 100), 2)
} for i in range(NUM_SUBMISSIONS)]


In [41]:
# Dirty students data
# Three independent rolls per student decide which defects to inject.
for student in students:
    drop_email = random.random() < 0.05
    pad_name = random.random() < 0.05
    blank_date = random.random() < 0.03

    if drop_email:
        # Missing email
        student["Email"] = None
    if pad_name:
        # Title-cased name with a trailing space
        student["Name"] = student["Name"].strip().title() + " "
    if blank_date:
        # Empty date string
        student["EnrollmentDate"] = ""

# Dirty courses data
for course in courses:
    if random.random() < 0.05:
        course["Instructor"] = fake.name().lower()  # lowercase name
    if random.random() < 0.03:
        # Corrupt the format of the course's own semester instead of picking
        # from a fixed list: a fixed list can swap in a different term, and a
        # "Spring 2025" entry would already be in the clean format.
        semester = course["Semester"]
        course["Semester"] = random.choice([
            semester.replace(" ", ""),  # missing space, e.g. "Fall2024"
            semester.upper(),           # all caps, e.g. "SUMMER 2025"
        ])

# Dirty assignments data
# Two independent rolls per assignment decide which defects to inject.
for assignment in assignments:
    strip_periods = random.random() < 0.05
    clear_due_date = random.random() < 0.03

    if strip_periods:
        # Remove every "." from the title
        assignment["Title"] = assignment["Title"].replace(".", "")
    if clear_due_date:
        # Missing due date
        assignment["DueDate"] = None

# Dirty enrollments data
# Roughly 3% of enrollments get a lowercased course ID (e.g. "c101").
for enrollment in enrollments:
    if random.random() >= 0.03:
        continue
    enrollment["CourseID"] = enrollment["CourseID"].lower()

# Dirty submissions data
# Two independent rolls per submission decide which defects to inject.
for submission in submissions:
    lose_grade = random.random() < 0.05
    pad_date = random.random() < 0.03

    if lose_grade:
        # Missing grade
        submission["Grade"] = None
    if pad_date:
        # Trailing space after the date
        submission["SubmissionDate"] += " "

# Add a few duplicates
def add_duplicates(data, percent=0.05):
    """Return a new list: ``data`` followed by randomly re-drawn entries.

    Args:
        data: List of records to draw duplicates from; it is not modified.
        percent: Fraction of ``len(data)`` to append as duplicates. The count
            is truncated by ``int()``, so small inputs may get none.

    Returns:
        A new list. The appended entries are sampled with replacement and are
        the same objects as the originals, not copies.
    """
    extra_count = int(len(data) * percent)
    return data + random.choices(data, k=extra_count)

# Add duplicates (approx 5% each)
# Tables are processed in a fixed order: students, courses, assignments,
# enrollments, submissions. The right-hand side reads the current tables
# before any name is rebound.
students, courses, assignments, enrollments, submissions = (
    add_duplicates(table, percent=0.05)
    for table in (students, courses, assignments, enrollments, submissions)
)


In [42]:
# Save all as CSVs
# Quoting is disabled, so a delimiter inside a field is backslash-escaped
# instead of the field being wrapped in quotes.
csv_outputs = [
    ("students.csv", students),
    ("courses.csv", courses),
    ("assignments.csv", assignments),
    ("submissions.csv", submissions),
    ("enrollments.csv", enrollments),
]
for csv_path, rows in csv_outputs:
    pd.DataFrame(rows).to_csv(csv_path, index=False, quoting=csv.QUOTE_NONE, escapechar='\\')
