In [1]:
import pandas as pd
import random
from faker import Faker
import numpy as np

# Initialize Faker
fake = Faker()

# Define constants
num_students = 6  # Adjust as needed
num_classes = 4  # Adjust as needed
streams = ["Yellow", "Blue"]
terms = [1, 2, 3]
grades = ["A", "B", "C", "D", "E"]  # Ordered best to worst; weight lists follow this order

# Generate student names.
# The names double as student identifiers further down: they are put into
# sets to split students into performance tiers and used as dictionary keys
# for strengths/weaknesses. Plain `fake.first_name()` may return the same name
# more than once, which would silently merge two students, so draw through
# Faker's `unique` proxy to guarantee `num_students` distinct names.
# NOTE(review): `fake.unique` raises once Faker cannot find an unused name;
# confirm this is acceptable if `num_students` is made very large.
students = [fake.unique.first_name() for _ in range(num_students)]

# Define subject categories.
# The curriculum is written as category -> subject -> strands and flattened
# into (category, subject, strand) tuples, one per strand, in declaration order.
subject_tuples = [
    (category, subject, strand)
    for category, subjects in {
        "Languages": {
            "English": ["English Reading", "English Writing"],
            "Spanish": ["Vocabulary", "Spanish Grammar"],
        },
        "Sciences": {
            "Math": ["Algebra", "Geometry"],
            "Physics": ["Mechanics", "Thermodynamics"],
        },
        "Humanities": {
            "Geography": ["Physical Geography", "Human Geography"],
            "History": ["Ancient History", "Medieval History"],
        },
    }.items()
    for subject, strands in subjects.items()
    for strand in strands
]

# Define class performance categories: tier name -> class numbers in that tier.
# NOTE: rows are generated for classes 1..num_classes only, so a tier that
# lists class numbers above num_classes is never selected.
class_performance = dict(Good=[1, 2], Mid=[3, 4], Bad=[5, 6])

# Assign students into performance groups: two randomly drawn "Good" students,
# two more drawn from whoever is left as "Mid", and everyone else as "Bad".
student_performance = {"Good": random.sample(students, k=2)}
_not_good = set(students) - set(student_performance["Good"])
student_performance["Mid"] = random.sample(list(_not_good), k=2)
student_performance["Bad"] = list(_not_good - set(student_performance["Mid"]))

# Assign subject strengths and weaknesses.
# Every student gets one randomly chosen strong subject category ...
student_subject_strengths = dict(
    (student, random.choice(["Languages", "Sciences", "Humanities"]))
    for student in students
)
# ... and one weak category picked from the two remaining ones, so a student's
# strength and weakness are never the same category.
student_subject_weaknesses = dict(
    (
        student,
        random.choice(
            list(
                {"Languages", "Sciences", "Humanities"}
                - {student_subject_strengths[student]}
            )
        ),
    )
    for student in students
)

# Define weighted distributions.
# Each list holds one sampling weight per letter grade, in the order A, B, C,
# D, E, keyed by performance tier.
grade_distributions = dict(
    Good=[0.6, 0.3, 0.08, 0.02, 0.0],  # Mostly A and B
    Mid=[0.2, 0.3, 0.3, 0.1, 0.1],  # Spread across all grades
    Bad=[0.05, 0.1, 0.3, 0.25, 0.3],  # More D and E
)

# Define subject-based adjustments.
# Same layout as above, keyed by whether the subject category is the
# student's strong or weak one.
subject_adjustment = dict(
    Strong=[0.7, 0.2, 0.08, 0.02, 0.0],  # More A's and B's
    Weak=[0.05, 0.1, 0.3, 0.25, 0.3],  # More D's and E's
)


def get_student_category(student):
    """Return the performance tier whose member list contains *student*.

    Tiers are checked in the order they appear in ``student_performance``;
    a student that is listed in none of them is treated as ``"Mid"``.
    """
    for tier, members in student_performance.items():
        if student in members:
            return tier
    return "Mid"


def get_class_category(class_num):
    """Return the performance tier whose class list contains *class_num*.

    Tiers are checked in the order they appear in ``class_performance``;
    a class number that is listed in none of them is treated as ``"Mid"``.
    """
    for tier, class_numbers in class_performance.items():
        if class_num in class_numbers:
            return tier
    return "Mid"


def generate_grade(student, class_num, subject_category):
    """Draw one random letter grade for *student* in *class_num* and a subject.

    The sampling weights are chosen as follows:

    * if *subject_category* is the student's strong category, the "Strong"
      subject adjustment is used on its own;
    * else if it is the student's weak category, the "Weak" adjustment is
      used on its own;
    * otherwise the student's tier distribution and the class's tier
      distribution are averaged.

    NOTE: in the first two cases the student/class tiers have no influence on
    the result; the adjustment replaces them rather than blending with them.

    Returns one element of ``grades``.
    """
    if subject_category == student_subject_strengths[student]:
        weights = np.array(subject_adjustment["Strong"])
    elif subject_category == student_subject_weaknesses[student]:
        weights = np.array(subject_adjustment["Weak"])
    else:
        # Neutral subject: student and class performance count equally.
        student_part = np.array(grade_distributions[get_student_category(student)])
        class_part = np.array(grade_distributions[get_class_category(class_num)])
        weights = (student_part + class_part) / 2

    # Normalize so the weights sum to 1 before sampling.
    weights = weights / weights.sum()

    (picked,) = random.choices(grades, weights=weights, k=1)
    return picked


def _grade_rows():
    """Yield one row per class / stream / student / subject strand / term.

    Every student is emitted for every class number and every stream. The
    grade is drawn from the student, the class number and the subject
    category only, so strands and terms sharing those get independent draws
    from the same distribution.
    """
    for class_num in range(1, num_classes + 1):
        for stream in streams:
            for student in students:
                for subject_category, subject, subject_strand in subject_tuples:
                    for term in terms:
                        grade = generate_grade(student, class_num, subject_category)
                        yield [
                            class_num,
                            stream,
                            student,
                            subject_category,
                            subject,
                            subject_strand,
                            term,
                            grade,
                        ]


# Materialize all rows; each row is a list in the column order used below.
data = list(_grade_rows())

# Convert to DataFrame
df = pd.DataFrame(
    data=data,
    columns=[
        "Class", "Stream", "Student Name",
        "Subject Category", "Subject", "Subject Strand",
        "Term", "Grade",
    ],
)

# Preview the first rows (displayed as the notebook cell's output).
df.head()


Unnamed: 0,Class,Stream,Student Name,Subject Category,Subject,Subject Strand,Term,Grade
0,1,Yellow,Edwin,Languages,English,English Reading,1,A
1,1,Yellow,Edwin,Languages,English,English Reading,2,D
2,1,Yellow,Edwin,Languages,English,English Reading,3,A
3,1,Yellow,Edwin,Languages,English,English Writing,1,D
4,1,Yellow,Edwin,Languages,English,English Writing,2,A


In [2]:
# Count how many generated rows received each letter grade.
df.loc[:, "Grade"].value_counts()

Grade
A    641
C    352
B    311
E    224
D    200
Name: count, dtype: int64

In [3]:
# Write the generated table to data.csv without the DataFrame index column.
df.to_csv(path_or_buf="data.csv", index=False)