"""
Synthetic Student Dataset Generator
-----------------------------------
Generates a synthetic dataset for analyzing correlations between
cognitive skills (comprehension, attention, focus, retention)
and performance metrics (assessment_score, engagement_time).

Outputs (both written to the data/ directory on every run):
- student_performance.csv
- student_performance.json

Author: Jayanth Raj G (for Igebra.ai assignment)
"""

In [11]:
pip install faker



In [12]:
import numpy as np
import pandas as pd
import random
import json
from faker import Faker
from pathlib import Path

# Configuration

In [13]:
SEED = 42  # Seed handed to the random number generators for repeatable runs
NUM_STUDENTS = 1000  # How many student rows to synthesize
CLASSES = [*range(1, 11)]  # Class levels 1 through 10, inclusive
OUTPUT_DIR = Path("data")  # Folder that receives the exported files

In [14]:
# One shared Faker instance supplies the student names.
faker = Faker()
# Seed every source of randomness used by the generator. Without Faker.seed
# the generated names differ between runs even though the numeric columns
# (numpy) and class assignments (random) are reproducible.
Faker.seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Helper Functions


In [15]:
def generate_student_id(idx: int) -> str:
    """Return the ID for student number ``idx``: "S" plus the index zero-padded to 4 digits."""
    padded_index = format(idx, "04d")
    return "S" + padded_index

def generate_cognitive_features():
    """
    Draw one student's four cognitive skill scores.

    A single latent ability value is sampled from a normal distribution
    (mean 70, std 10). Each skill is then sampled from a normal distribution
    centred on that value (std 8) and clipped to the range 30–100, so the
    four scores share a common component.

    Returns:
        Tuple ``(comprehension, attention, focus, retention)``.
    """
    ability = np.random.normal(loc=70, scale=10)
    # One independent draw per skill, consumed in the order
    # comprehension, attention, focus, retention.
    return tuple(
        np.clip(np.random.normal(ability, 8), 30, 100) for _ in range(4)
    )

def generate_performance_metrics(comp, att, foc, ret):
    """
    Derive the two performance metrics from the four cognitive scores.

    Args:
        comp: Comprehension score.
        att: Attention score.
        foc: Focus score.
        ret: Retention score.

    Returns:
        Tuple ``(assessment_score, engagement_time)``. The assessment score
        is a weighted blend of all four skills plus normal noise (std 5),
        clipped to 40–100. The engagement time is the mean of attention and
        focus plus normal noise (std 10), clipped to 20–180 and truncated to
        whole minutes.
    """
    weighted_skills = 0.3*comp + 0.25*att + 0.25*foc + 0.2*ret

    # Draw both noise terms up front: score noise first, then time noise.
    score_noise = np.random.normal(0, 5)
    time_noise = np.random.normal(0, 10)

    assessment_score = np.clip(weighted_skills + score_noise, 40, 100)

    raw_minutes = 0.5*att + 0.5*foc + time_noise
    engagement_time = int(np.clip(raw_minutes, 20, 180))

    return assessment_score, engagement_time

def generate_dataset(num_students=NUM_STUDENTS):
    """Generate the synthetic student dataset as a pandas DataFrame.

    Args:
        num_students: Number of student rows to create. Zero or a negative
            value yields an empty frame that still carries every column.

    Returns:
        DataFrame with one row per student and the columns student_id, name,
        class, comprehension, attention, focus, retention, assessment_score
        and engagement_time.
    """
    # Declared explicitly so the schema survives even when no rows are built
    # (a DataFrame made from an empty list would otherwise have no columns).
    columns = [
        "student_id",
        "name",
        "class",
        "comprehension",
        "attention",
        "focus",
        "retention",
        "assessment_score",
        "engagement_time",
    ]

    records = []
    for i in range(1, num_students + 1):
        student_id = generate_student_id(i)
        name = faker.name()
        student_class = random.choice(CLASSES)

        # Cognitive features
        comp, att, foc, ret = generate_cognitive_features()

        # Performance metrics derived from the cognitive features
        assessment_score, engagement_time = generate_performance_metrics(
            comp, att, foc, ret
        )

        records.append({
            "student_id": student_id,
            "name": name,
            "class": student_class,
            "comprehension": round(comp, 2),
            "attention": round(att, 2),
            "focus": round(foc, 2),
            "retention": round(ret, 2),
            "assessment_score": round(assessment_score, 2),
            "engagement_time": engagement_time,
        })

    return pd.DataFrame(records, columns=columns)

def save_dataset(df: pd.DataFrame, output_dir: Path):
    """Write ``df`` into ``output_dir`` as both a CSV file and a JSON file.

    The directory (including missing parents) is created when needed. The
    CSV omits the index; the JSON is a list of row records indented by two
    spaces. Both target paths are printed once the files are written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    csv_target = output_dir.joinpath("student_performance.csv")
    json_target = output_dir.joinpath("student_performance.json")

    df.to_csv(csv_target, index=False)
    df.to_json(json_target, orient="records", indent=2)

    print(f"✅ Dataset saved to:\n  - {csv_target}\n  - {json_target}")


# Main Execution


In [16]:
if __name__ == "__main__":
    # Build the dataset, persist it, then show the first rows as a sanity check.
    df_students = generate_dataset()
    save_dataset(df_students, OUTPUT_DIR)
    print("\nPreview of dataset:", df_students.head(), sep="\n")

✅ Dataset saved to:
  - data/student_performance.csv
  - data/student_performance.json

Preview of dataset:
  student_id             name  class  comprehension  attention  focus  \
0      S0001      Mason Adams      2          73.86      80.15  87.15   
1      S0002      Thomas Hood      1          73.92      82.01  73.97   
2      S0003     Matthew Diaz      5          48.25      44.65  55.26   
3      S0004  Laura Carpenter      4          68.28      56.34  63.39   
4      S0005     Justin Mccoy      4          61.66      59.18  78.81   

   retention  assessment_score  engagement_time  
0      73.09             77.43               99  
1      73.95             77.17               58  
2      45.49             41.49               64  
3      68.63             58.39               63  
4      63.89             60.48               77  
