In [29]:
# ECLS-K Homework Dataset Generator - Complete K-5 Version
# This script creates a realistic dataset for the homework effectiveness activity

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so every run regenerates exactly the same dataset
np.random.seed(42)

# Size of the simulated cohort (18174 students keeps the file manageable)
n_students = 18174
print("Creating simulated ECLS-K homework dataset for grades K-5...")

# --- Student demographics ---
# Sequential identifiers 1..n_students
student_id = list(range(1, n_students + 1))

# Grade codes 0-5 stand for Kindergarten through 5th grade; the weights are
# close to uniform and sum to 1. Grade is drawn before SES so the random
# stream is consumed in a fixed order.
grade_codes = [0, 1, 2, 3, 4, 5]
grade_weights = [0.16, 0.17, 0.17, 0.17, 0.16, 0.17]
grade = np.random.choice(grade_codes, n_students, p=grade_weights)

# Socioeconomic status codes: 1=low, 2=middle, 3=high
ses_levels = [1, 2, 3]
ses_weights = [0.25, 0.50, 0.25]
ses = np.random.choice(ses_levels, n_students, p=ses_weights)

print("Generated basic demographics for grades K-5...")
print("Grade distribution:")
# Count the students drawn into each grade and report them with short names
short_grade_names = ['K', '1st', '2nd', '3rd', '4th', '5th']
unique, counts = np.unique(grade, return_counts=True)
for grade_code, n_in_grade in zip(unique, counts):
    grade_name = short_grade_names[grade_code]
    print(f"  Grade {grade_name}: {n_in_grade} students")

# Probability of each homework category (codes 1-5, in that order), keyed by
# grade code. Every row sums to 1 and the weight shifts toward the heavier
# categories as the grade goes up.
homework_probs = {
    0: [0.6, 0.25, 0.10, 0.04, 0.01],   # Kindergarten: mostly no homework
    1: [0.3, 0.4, 0.2, 0.08, 0.02],     # 1st grade: some homework
    2: [0.2, 0.35, 0.3, 0.12, 0.03],    # 2nd grade: more homework
    3: [0.15, 0.25, 0.35, 0.2, 0.05],   # 3rd grade: even more homework
    4: [0.1, 0.2, 0.4, 0.25, 0.05],     # 4th grade: substantial homework
    5: [0.05, 0.15, 0.35, 0.35, 0.1]    # 5th grade: most homework
}

# One draw per student, in student order, using that student's grade-specific
# probabilities. The draws are made one at a time so the seeded random stream
# is consumed in the same sequence on every run.
category_codes = [1, 2, 3, 4, 5]
homework_categories = [
    np.random.choice(category_codes, p=homework_probs[g]) for g in grade
]

# Translate each category code into a single minutes value
homework_mapping = {1: 0, 2: 15, 3: 45, 4: 90, 5: 150}
homework_minutes = [homework_mapping[category] for category in homework_categories]

# --- Achievement scores ---
# Scores are built from three parts: a grade-level baseline, an SES shift and
# random noise. Homework adds only a tiny per-minute bump plus its own noise.
# The four np.random.normal draws below happen in a fixed order
# (math base, math homework, reading base, reading homework).

# Baseline score for each grade code
grade_effects = {0: 40, 1: 45, 2: 50, 3: 55, 4: 60, 5: 65}  # Base scores by grade
grade_base = np.array([grade_effects[g] for g in grade])

# SES centred on the middle level: -1 (low), 0 (middle), +1 (high)
ses_offset = np.array(ses) - 2

# Homework minutes as an array so it can be scaled element-wise
minutes = np.array(homework_minutes)

# Math: 8 points per SES step, noise SD 10, 0.005 points per homework minute
base_math = grade_base + ses_offset * 8 + np.random.normal(0, 10, n_students)
homework_effect_math = minutes * 0.005 + np.random.normal(0, 5, n_students)
math_scores = np.clip(base_math + homework_effect_math, 15, 85)  # keep within 15-85

# Reading: baseline is 3 points higher, 7 points per SES step, noise SD 9,
# and an even smaller 0.002 points per homework minute
base_reading = (grade_base + 3) + ses_offset * 7 + np.random.normal(0, 9, n_students)
homework_effect_reading = minutes * 0.002 + np.random.normal(0, 5, n_students)
reading_scores = np.clip(base_reading + homework_effect_reading, 15, 85)  # keep within 15-85

# Blank out scores for a random 5% of students (sampled without replacement):
# the first half of the sample loses its math score, the second half its
# reading score, so no student loses both.
n_missing = int(0.05 * n_students)
missing_indices = np.random.choice(n_students, size=n_missing, replace=False)
half = len(missing_indices) // 2
math_scores[missing_indices[:half]] = np.nan
reading_scores[missing_indices[half:]] = np.nan

# Human-readable names for the numeric grade codes (all six grades)
grade_labels = {0: 'Kindergarten', 1: '1st Grade', 2: '2nd Grade',
                3: '3rd Grade', 4: '4th Grade', 5: '5th Grade'}

# Human-readable names for the homework category codes
homework_labels = {
    1: 'No homework', 2: 'Less than 30 min', 3: '30-60 minutes',
    4: '1-2 hours', 5: 'More than 2 hours'
}

# Assemble one row per student, then append the two label columns derived
# from the coded columns
df = pd.DataFrame({
    'student_id': student_id,
    'grade': grade,
    'ses_level': ses,
    'homework_category': homework_categories,
    'homework_minutes': homework_minutes,
    'math_score': math_scores,
    'reading_score': reading_scores
}).assign(
    grade_label=lambda frame: frame['grade'].map(grade_labels),
    homework_label=lambda frame: frame['homework_category'].map(homework_labels),
)

# Report the dataset size and show the first ten rows
print(f"\nDataset created with {n_students} students across grades K-5")
print("\nDataset preview:")
print(df.head(10))

# Frequency table for each label column, homework first and then grade
for heading, label_column in (
    ("\nHomework distribution:", 'homework_label'),
    ("\nGrade distribution:", 'grade_label'),
):
    print(heading)
    print(df[label_column].value_counts())

# Validate correlations: the simulation is meant to show only a very small
# homework/achievement relationship. Rows with a missing score are dropped
# first so both correlations are computed on the same students.
clean_df = df.dropna(subset=['homework_minutes', 'math_score', 'reading_score'])
math_corr = clean_df['homework_minutes'].corr(clean_df['math_score'])
reading_corr = clean_df['homework_minutes'].corr(clean_df['reading_score'])

print("\nCorrelations in simulated data:")
print(f"Homework vs Math: r = {math_corr:.3f}")
print(f"Homework vs Reading: r = {reading_corr:.3f}")

# Only claim agreement with the research when the numbers support it; the
# message used to be printed unconditionally. These are pooled correlations
# across all grades, and in this simulation both homework time and base
# scores rise with grade, so grade alone can inflate them.
# 0.10 is used as the cutoff for a "very small" correlation -- adjust it if a
# different criterion is intended. A NaN correlation fails the check as well.
small_r_cutoff = 0.10
pooled_corrs = (math_corr, reading_corr)
if all(abs(r) < small_r_cutoff for r in pooled_corrs):
    print("These match Cooper's research findings!")
else:
    print(f"WARNING: pooled |r| is not below {small_r_cutoff:.2f}, so these values do NOT "
          "show the very small homework effect the simulation is aiming for.")
    print("Homework time and base scores both rise with grade level; "
          "compare the within-grade correlations instead.")

# Homework-vs-math correlation within each grade, Kindergarten through 5th.
# Grades with 50 or fewer complete rows are skipped.
print(f"\nCorrelations by grade:")
for grade_num, grade_name in grade_labels.items():
    grade_data = clean_df.loc[clean_df['grade'] == grade_num]
    n_rows = len(grade_data)
    if n_rows <= 50:
        continue
    g_math_corr = grade_data['homework_minutes'].corr(grade_data['math_score'])
    print(f"  {grade_name}: r = {g_math_corr:.3f} (n={n_rows})")

# Save dataset
# The output filename lives in one place so the file written and the message
# printed can never disagree.
output_csv = 'ecls_homework_dataset.csv'
# index=False: student_id already identifies each row, so the pandas row
# index is not written as an extra column.
df.to_csv(output_csv, index=False)
print(f"\nDataset saved as '{output_csv}'")
print("Ready for teacher analysis activity!")

Creating simulated ECLS-K homework dataset for grades K-5...
Generated basic demographics for grades K-5...
Grade distribution:
  Grade K: 2903 students
  Grade 1st: 3095 students
  Grade 2nd: 3125 students
  Grade 3rd: 3117 students
  Grade 4th: 2867 students
  Grade 5th: 3067 students

Dataset created with 18174 students across grades K-5

Dataset preview:
   student_id  grade  ses_level  homework_category  homework_minutes  \
0           1      2          3                  1                 0   
1           2      5          3                  4                90   
2           3      4          2                  2                15   
3           4      3          2                  2                15   
4           5      0          2                  2                15   
5           6      0          2                  2                15   
6           7      0          3                  2                15   
7           8      5          1                  3             

In [10]:
# Show the first five rows of df using the notebook's rich table display.
# NOTE(review): this cell's execution count (10) is lower than the generator
# cell's (29), and its recorded output has an extra 'Unnamed: 0' column and
# different values from the preview printed by the generator cell.
# Presumably df was reloaded from a CSV written with its index at some point.
# Confirm, then Restart Kernel & Run All so the saved output matches the code.
df.head()

Unnamed: 0,student_id,grade,ses_level,homework_category,homework_minutes,math_score,reading_score,grade_label,homework_label
0,1,1,3,1,0,63.211104,55.291778,1st Grade,No homework
1,2,2,3,3,45,60.945092,58.829931,2nd Grade,30-60 minutes
2,3,2,2,2,15,53.353814,47.632473,2nd Grade,Less than 30 min
3,4,1,2,1,0,41.261951,46.902892,1st Grade,No homework
4,5,0,2,2,15,56.544643,67.906383,Kindergarten,Less than 30 min
