In [1]:
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set random seed for reproducibility
np.random.seed(42)

# Number of students
n_students = 1000

print("Generating realistic student performance dataset for Elevatr...")
print("=" * 60)

# Generate student IDs
student_ids = [f"S{str(i).zfill(3)}" for i in range(1, n_students + 1)]

# Generate base features with realistic distributions
age = np.random.randint(18, 26, size=n_students)
gender = np.random.choice(['M', 'F'], size=n_students, p=[0.5, 0.5])

# Study hours: normal distribution around 20, clipped to 5-40
study_hours_weekly = np.clip(np.random.normal(20, 7, n_students), 5, 40)

# Attendance: most students 70-95%, some lower
attendance_percent = np.clip(np.random.beta(8, 2, n_students) * 100, 40, 100)

# Previous GPA: normal around 7.5
previous_gpa = np.clip(np.random.normal(7.5, 1.2, n_students), 5.0, 10.0)

# Assignments completed: correlated with study habits
assignments_completed = np.clip(
    study_hours_weekly * 2 + np.random.normal(20, 15, n_students), 
    0, 100
)

# Participation score: somewhat correlated with attendance
participation_score = np.clip(
    attendance_percent * 0.6 + np.random.normal(20, 15, n_students),
    0, 100
)

# Midterm score: based on previous GPA and current effort
midterm_score = np.clip(
    previous_gpa * 6 + study_hours_weekly * 0.8 + np.random.normal(0, 12, n_students),
    0, 100
)

# Hours on platform: varies widely
hours_on_platform = np.clip(np.random.gamma(4, 15, n_students), 10, 200)

# Calculate final grade with realistic correlations and noise
def calculate_final_grade(idx):
    # Base score from multiple factors
    base_score = (
        attendance_percent[idx] * 0.20 +
        study_hours_weekly[idx] * 1.2 +
        assignments_completed[idx] * 0.25 +
        participation_score[idx] * 0.15 +
        midterm_score[idx] * 0.20 +
        previous_gpa[idx] * 3
    )
    
    # Add noise
    base_score += np.random.normal(0, 8)
    
    return base_score

base_scores = np.array([calculate_final_grade(i) for i in range(n_students)])

# Convert scores to letter grades
def score_to_grade(score):
    if score >= 85:
        return 'S'
    elif score >= 70:
        return 'A'
    elif score >= 55:
        return 'B'
    elif score >= 40:
        return 'C'
    elif score >= 25:
        return 'D'
    else:
        return 'F'

final_grade = np.array([score_to_grade(s) for s in base_scores])

print("✓ Base dataset generated with realistic correlations")
print(f"  - Initial grade distribution: {dict(zip(*np.unique(final_grade, return_counts=True)))}")

# Create DataFrame
df = pd.DataFrame({
    'student_id': student_ids,
    'age': age.astype(int),
    'gender': gender,
    'study_hours_weekly': np.round(study_hours_weekly, 1),
    'attendance_percent': np.round(attendance_percent, 1),
    'previous_gpa': np.round(previous_gpa, 2),
    'assignments_completed': np.round(assignments_completed, 1).astype(int),
    'participation_score': np.round(participation_score, 1).astype(int),
    'midterm_score': np.round(midterm_score, 1).astype(int),
    'hours_on_platform': np.round(hours_on_platform, 1).astype(int),
    'final_grade': final_grade
})

# Clip percentage values
df['assignments_completed'] = df['assignments_completed'].clip(0, 100)
df['participation_score'] = df['participation_score'].clip(0, 100)
df['midterm_score'] = df['midterm_score'].clip(0, 100)

print("\n✓ DataFrame created with clean data types")

Generating realistic student performance dataset for Elevatr...
✓ Base dataset generated with realistic correlations
  - Initial grade distribution: {np.str_('A'): np.int64(149), np.str_('B'): np.int64(31), np.str_('C'): np.int64(5), np.str_('S'): np.int64(815)}

✓ DataFrame created with clean data types


In [2]:

# Generate edge cases as specified

print("\nGenerating edge cases for realistic variability...")
print("=" * 60)

# Edge Case 1: Perfect/high attendance but failing (50 students)
# These need intervention - high attendance but poor outcomes
high_attendance_failing_indices = np.random.choice(
    df.index[df['attendance_percent'] > 80].tolist(), 
    size=min(50, len(df[df['attendance_percent'] > 80])), 
    replace=False
)

for idx in high_attendance_failing_indices:
    df.loc[idx, 'attendance_percent'] = np.random.uniform(85, 100)
    df.loc[idx, 'study_hours_weekly'] = np.random.uniform(15, 30)  # They try but struggle
    df.loc[idx, 'midterm_score'] = np.random.randint(30, 50)
    df.loc[idx, 'assignments_completed'] = np.random.randint(60, 85)
    df.loc[idx, 'final_grade'] = np.random.choice(['D', 'F'], p=[0.6, 0.4])

print(f"✓ Created {len(high_attendance_failing_indices)} high-attendance struggling students (need intervention)")

# Edge Case 2: Low hours but acing (30 students) - efficient/gifted learners
low_effort_high_performance_indices = np.random.choice(
    df.index, 
    size=30, 
    replace=False
)

for idx in low_effort_high_performance_indices:
    df.loc[idx, 'study_hours_weekly'] = np.random.uniform(5, 12)
    df.loc[idx, 'attendance_percent'] = np.random.uniform(60, 85)
    df.loc[idx, 'hours_on_platform'] = np.random.randint(15, 50)
    df.loc[idx, 'midterm_score'] = np.random.randint(80, 95)
    df.loc[idx, 'assignments_completed'] = np.random.randint(70, 90)
    df.loc[idx, 'final_grade'] = np.random.choice(['S', 'A'], p=[0.7, 0.3])

print(f"✓ Created {len(low_effort_high_performance_indices)} efficient/naturally gifted learners")

# Edge Case 3: Improving trend - low midterm, high final (20 students)
improving_trend_indices = np.random.choice(
    df.index, 
    size=20, 
    replace=False
)

for idx in improving_trend_indices:
    df.loc[idx, 'midterm_score'] = np.random.randint(35, 55)
    df.loc[idx, 'study_hours_weekly'] = np.random.uniform(25, 40)  # Increased effort
    df.loc[idx, 'attendance_percent'] = np.random.uniform(80, 95)
    df.loc[idx, 'assignments_completed'] = np.random.randint(75, 95)
    df.loc[idx, 'final_grade'] = np.random.choice(['A', 'B'], p=[0.6, 0.4])

print(f"✓ Created {len(improving_trend_indices)} students with improving trends")

# Add some additional variety: high effort but mediocre results (struggling but trying)
high_effort_mediocre_indices = np.random.choice(
    df.index, 
    size=40, 
    replace=False
)

for idx in high_effort_mediocre_indices:
    df.loc[idx, 'study_hours_weekly'] = np.random.uniform(30, 40)
    df.loc[idx, 'attendance_percent'] = np.random.uniform(85, 100)
    df.loc[idx, 'assignments_completed'] = np.random.randint(85, 100)
    df.loc[idx, 'participation_score'] = np.random.randint(70, 90)
    df.loc[idx, 'midterm_score'] = np.random.randint(50, 70)
    df.loc[idx, 'final_grade'] = np.random.choice(['C', 'B'], p=[0.7, 0.3])

print(f"✓ Created {len(high_effort_mediocre_indices)} high-effort students with mediocre results")

# Round all numeric columns
df['attendance_percent'] = df['attendance_percent'].round(1)
df['study_hours_weekly'] = df['study_hours_weekly'].round(1)
df['previous_gpa'] = df['previous_gpa'].round(2)

print("\n" + "=" * 60)
print("FINAL DATASET STATISTICS")
print("=" * 60)
print(f"\nTotal students: {len(df)}")
print(f"\nGrade Distribution:")
grade_counts = df['final_grade'].value_counts().sort_index()
for grade, count in grade_counts.items():
    print(f"  {grade}: {count} ({count/len(df)*100:.1f}%)")

print(f"\nAge range: {df['age'].min()}-{df['age'].max()} years")
print(f"Gender balance: {df['gender'].value_counts().to_dict()}")
print(f"\nStudy hours: {df['study_hours_weekly'].min():.1f}-{df['study_hours_weekly'].max():.1f} hrs/week")
print(f"Attendance: {df['attendance_percent'].min():.1f}%-{df['attendance_percent'].max():.1f}%")
print(f"Previous GPA: {df['previous_gpa'].min():.2f}-{df['previous_gpa'].max():.2f}")

# Display first few rows
print("\nFirst 5 rows of dataset:")
print(df.head())


Generating edge cases for realistic variability...
✓ Created 50 high-attendance struggling students (need intervention)
✓ Created 30 efficient/naturally gifted learners
✓ Created 20 students with improving trends
✓ Created 40 high-effort students with mediocre results

FINAL DATASET STATISTICS

Total students: 1000

Grade Distribution:
  A: 149 (14.9%)
  B: 55 (5.5%)
  C: 28 (2.8%)
  D: 34 (3.4%)
  F: 12 (1.2%)
  S: 722 (72.2%)

Age range: 18-25 years
Gender balance: {'F': 500, 'M': 500}

Study hours: 5.0-40.0 hrs/week
Attendance: 40.0%-99.8%
Previous GPA: 5.00-10.00

First 5 rows of dataset:
  student_id  age gender  study_hours_weekly  attendance_percent  \
0       S001   24      F                 5.0                71.3   
1       S002   21      F                36.0                95.1   
2       S003   22      M                10.3                75.0   
3       S004   24      F                 8.5                78.5   
4       S005   20      F                27.2               

In [3]:

# Save main dataset
df.to_csv('../data/student_data.csv', index=False)
print("✓ Saved main dataset to: data/student_data.csv")

# Create diverse test sample (5 students with different profiles)
test_students = pd.DataFrame({
    'student_id': ['T001', 'T002', 'T003', 'T004', 'T005'],
    'age': [19, 22, 20, 24, 21],
    'gender': ['M', 'F', 'M', 'F', 'M'],
    'study_hours_weekly': [35.0, 8.0, 22.0, 12.0, 28.0],
    'attendance_percent': [92.0, 68.0, 88.0, 95.0, 75.0],
    'previous_gpa': [7.8, 9.2, 6.5, 7.0, 8.5],
    'assignments_completed': [95, 75, 80, 85, 70],
    'participation_score': [85, 60, 75, 90, 65],
    'midterm_score': [78, 88, 65, 45, 82],
    'hours_on_platform': [120, 35, 95, 150, 60],
    'final_grade': ['A', 'S', 'B', 'D', 'A']
})

# Add descriptions for test samples
test_descriptions = [
    "T001: High-effort student with good results (typical A student)",
    "T002: Efficient learner - low hours but excellent performance",
    "T003: Average student with consistent effort",
    "T004: High-attendance struggling student (needs intervention)",
    "T005: Above-average student with moderate engagement"
]

test_students.to_csv('../data/test_sample.csv', index=False)
print("✓ Saved test sample to: data/test_sample.csv")
print("\nTest sample profiles:")
for desc in test_descriptions:
    print(f"  {desc}")

print("\n" + "=" * 60)
print("Dataset generation complete!")
print("=" * 60)

✓ Saved main dataset to: data/student_data.csv
✓ Saved test sample to: data/test_sample.csv

Test sample profiles:
  T001: High-effort student with good results (typical A student)
  T002: Efficient learner - low hours but excellent performance
  T003: Average student with consistent effort
  T004: High-attendance struggling student (needs intervention)
  T005: Above-average student with moderate engagement

Dataset generation complete!


In [4]:
import plotly.express as px

# Data from the provided JSON
data = {"grades": ["S", "A", "B", "C", "D", "F"], "counts": [722, 149, 55, 28, 34, 12], "percentages": [72.2, 14.9, 5.5, 2.8, 3.4, 1.2]}

# Create the pie chart
fig = px.pie(values=data['counts'], names=data['grades'], title="Final Grade Distribution (N=1000)")

# Update traces to show grade, count and percentage in the labels
fig.update_traces(
    texttemplate='%{label}<br>%{value} (%{percent})'
)

# Apply pie chart specific formatting
fig.update_layout(uniformtext_minsize=14, uniformtext_mode='hide')

# Save as PNG and SVG
fig.write_image("chart.png")
fig.write_image("chart.svg", format="svg")

In [5]:
import pandas as pd
import json

try:
    df = pd.read_csv('../data/student_data.csv')
except FileNotFoundError:
    df = pd.read_csv('data/student_data.csv')
# Calculate correlation matrix for the heatmap
# Select only numeric columns relevant for correlation
numeric_cols = ['age', 'study_hours_weekly', 'attendance_percent', 'previous_gpa', 
                'assignments_completed', 'participation_score', 'midterm_score', 'hours_on_platform']

correlation_matrix = df[numeric_cols].corr()

print("Correlation Matrix:")
print(correlation_matrix)
print("\nTop correlations with midterm_score:")
midterm_corr = correlation_matrix['midterm_score'].sort_values(ascending=False)
print(midterm_corr)

# Convert to format for chart tool
corr_data = []
for i, row_name in enumerate(correlation_matrix.index):
    for j, col_name in enumerate(correlation_matrix.columns):
        corr_data.append({
            'row': row_name,
            'column': col_name,
            'value': round(correlation_matrix.iloc[i, j], 3)
        })

import json
corr_json = json.dumps(corr_data)
print(f"\nPrepared correlation data with {len(corr_data)} cells")


Correlation Matrix:
                            age  study_hours_weekly  attendance_percent  \
age                    1.000000            0.006098            0.026068   
study_hours_weekly     0.006098            1.000000            0.118733   
attendance_percent     0.026068            0.118733            1.000000   
previous_gpa           0.052134           -0.015379            0.052368   
assignments_completed -0.009576            0.627990            0.088906   
participation_score    0.009369            0.020558            0.400901   
midterm_score         -0.000922            0.166146           -0.106453   
hours_on_platform      0.033547            0.073356            0.000786   

                       previous_gpa  assignments_completed  \
age                        0.052134              -0.009576   
study_hours_weekly        -0.015379               0.627990   
attendance_percent         0.052368               0.088906   
previous_gpa               1.000000              -0.0150