In [175]:
import pandas as pd
import random
from faker import Faker
import numpy as np

# Generate Dataset

In [176]:
# Define possible values for each column
num_students = 3
number_of_years = 2  # not referenced by any of the visible cells

# Seed every random source used below (stdlib `random`, numpy and Faker) so
# that Restart Kernel -> Run All regenerates exactly the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Generate unique student names
# `fake.unique` remembers the values it has already returned and never repeats
# one, so two students cannot end up sharing the same "Student Name". A plain
# `fake.first_name()` gives no such guarantee.
fake = Faker()
names = [fake.unique.first_name() for _ in range(num_students)]

# Grade labels, best to worst; weight lists elsewhere follow this order.
grades = ["A", "B", "C", "D", "E"]

# Curriculum hierarchy: subject category -> subject -> ordered list of strands.
_curriculum = {
    "Languages": {
        "English": [
            "English Reading",
            "English Writing",
            "English Grammar",
            "Literature",
            "Speaking",
        ],
        "Spanish": [
            "Vocabulary",
            "Spanish Grammar",
            "Spanish Reading",
            "Spanish Writing",
            "Conversation",
        ],
    },
    "Sciences": {
        "Math": [
            "Algebra",
            "Geometry",
            "Calculus",
            "Statistics",
            "Number Theory",
        ],
        "Physics": [
            "Mechanics",
            "Thermodynamics",
            "Electromagnetism",
            "Optics",
            "Quantum Physics",
        ],
    },
    "Humanities": {
        "Geography": [
            "Physical Geography",
            "Human Geography",
            "Cartography",
            "Climatology",
            "Geopolitics",
        ],
        "History": [
            "Ancient History",
            "Medieval History",
            "Modern History",
            "World History",
            "Cultural History",
        ],
    },
}

# Flatten the hierarchy into (category, subject, strand) tuples. Dicts keep
# insertion order, so the tuples come out in the order written above.
subject_tuples = [
    (category, subject_name, strand)
    for category, subjects in _curriculum.items()
    for subject_name, strands in subjects.items()
    for strand in strands
]


# Remaining dimensions of the dataset.
classes = [*range(1, 9)]  # class levels 1 through 8
streams = ["Yellow", "Blue"]
# years = list(range(2020, 2025))
terms = [*range(1, 4)]  # terms 1 through 3


In [None]:
# Function to generate random weights for grades
def generate_weights():
    """Return a random probability vector with one weight per grade.

    One value per entry of the module-level ``grades`` list is drawn from a
    normal distribution (mean 0.5, standard deviation 0.2). Absolute values
    are taken so that no weight is negative, and the result is rescaled so
    the weights sum to 1.

    Returns:
        list[float]: the normalized weights, one per grade.
    """
    magnitudes = np.abs(np.random.normal(loc=0.5, scale=0.2, size=len(grades)))
    return (magnitudes / np.sum(magnitudes)).tolist()


# Function to generate custom weights based on class or student
def generate_skewed_weights(school_class, student_name, stream):
    """Return the five grade weights to use for one student.

    Only ``stream`` affects the result at present: the "Yellow" stream is
    skewed towards the first two grades, every other stream gets a uniform
    distribution. ``school_class`` and ``student_name`` are accepted but not
    used (earlier class- and name-based skews are disabled).

    Args:
        school_class: class level of the student (currently ignored).
        student_name: name of the student (currently ignored).
        stream: stream label, e.g. "Yellow" or "Blue".

    Returns:
        list[float]: a fresh list of five weights summing to 1.
    """
    # Default case: even distribution
    if stream != "Yellow":
        return [0.2, 0.2, 0.2, 0.2, 0.2]

    # Yellow stream: 80% of the weight on the first two grades
    return [0.4, 0.4, 0.1, 0.05, 0.05]


# Function to generate a random grade based on weights
def generate_grade(weights, population=None):
    """Draw a single grade at random according to ``weights``.

    Args:
        weights: relative weights, one per entry of ``population``.
        population: optional sequence of grade labels to draw from. When
            omitted, the module-level ``grades`` list is used, so existing
            ``generate_grade(weights)`` calls keep working unchanged.

    Returns:
        One element of ``population``.

    Raises:
        ValueError: propagated from ``random.choices`` when the number of
            weights does not match the size of the population.
    """
    if population is None:
        population = grades

    # One weighted draw is enough. Drawing len(grades) samples and then
    # picking one of them uniformly yields the same distribution, but costs
    # several extra random numbers per generated row.
    return random.choices(population=population, weights=weights, k=1)[0]


# Create dataset using list comprehensions
# One row is produced for every (class, stream, student, subject strand, term)
# combination; the loops nest in that order, so rows are class-major and
# term-minor. Note that the same `names` are reused for every class and stream.
# A grade is drawn per row from the weights for that class/student/stream.
data = [
    [
        class_level,
        stream_name,
        student,
        category,
        subject_name,
        strand,
        # year,
        term_number,
        generate_grade(generate_skewed_weights(class_level, student, stream_name)),
    ]
    for class_level in classes
    for stream_name in streams
    for student in names
    for category, subject_name, strand in subject_tuples
    # for year in years
    for term_number in terms
]

# Convert to DataFrame
# Column labels, listed in the same order as the values inside each row of `data`.
column_names = [
    "Class",
    "Stream",
    "Student Name",
    "Subject Category",
    "Subject",
    "Subject Strand",
    # "Year",
    "Term",
    "Grade",
]
df = pd.DataFrame(data, columns=column_names)

# Preview the first few rows
df.head()

Unnamed: 0,Class,Stream,Student Name,Subject Category,Subject,Subject Strand,Term,Grade
0,1,Yellow,John,Languages,English,English Reading,1,A
1,1,Yellow,John,Languages,English,English Reading,2,C
2,1,Yellow,John,Languages,English,English Reading,3,A
3,1,Yellow,John,Languages,English,English Writing,1,A
4,1,Yellow,John,Languages,English,English Writing,2,B


In [178]:
# Tally how many rows received each grade (most frequent first)
grade_counts = df["Grade"].value_counts()
grade_counts

Grade
A    1325
B    1307
C     607
E     543
D     538
Name: count, dtype: int64

In [179]:
# Persist the generated dataset; index=False keeps the row index out of the file
df.to_csv(path_or_buf="data.csv", index=False)