In [77]:
# 1. SETUP

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

plt.style.use("default")

# Project root. Defaults to the original author's local checkout; set the
# PROJETO_CULTURA_INGLESA_DIR environment variable to run the notebook from
# any other machine or folder without editing this cell.
base_path = os.environ.get(
    "PROJETO_CULTURA_INGLESA_DIR",
    r"C:\Users\jujuf\Documents\projeto_cultura_inglesa",
)
processed_path = os.path.join(base_path, "data", "processed")

# Fail early with an actionable message instead of a bare read_csv error.
if not os.path.isdir(processed_path):
    raise FileNotFoundError(
        f"Processed data folder not found: {processed_path!r}. "
        "Set PROJETO_CULTURA_INGLESA_DIR to the project root."
    )


def load_processed(filename):
    """Read one CSV file from the processed-data folder into a DataFrame."""
    return pd.read_csv(os.path.join(processed_path, filename))


# Load datasets
students_df = load_processed("students.csv")
classes_df = load_processed("classes.csv")
attendance_df = load_processed("attendance.csv")
weekly_absences_df = load_processed("weekly_absences.csv")
absences_30d_df = load_processed("absences_30d.csv")
interventions_df = load_processed("recent_interventions.csv")
enrollments_df = load_processed("enrollments.csv")



In [91]:
# 2. DATA INTEGRITY CHECK
# Sanity checks on the loaded tables: row/column counts, uniqueness of each
# table's key, missing values in key columns, and the active/inactive split.

# Composite keys, defined once so the duplicate check and the null check
# always inspect the same columns.
attendance_key = ['enrollment_id', 'date']
weekly_key = ['enrollment_id', 'year', 'week']

print("=== TABLE SHAPES ===")
print("Students:", students_df.shape)
print("Classes:", classes_df.shape)
print("Enrollments:", enrollments_df.shape)
print("Attendance:", attendance_df.shape)
print("Weekly:", weekly_absences_df.shape)
print("30-day risk:", absences_30d_df.shape)
print("Interventions:", interventions_df.shape)
print("")

# Check primary keys / duplicates
print("=== DUPLICATES ===")
print("Students:", students_df.duplicated(subset='student_id').sum())
print("Classes:", classes_df.duplicated(subset='class_id').sum())
print("Enrollments:", enrollments_df.duplicated(subset='enrollment_id').sum())
print("Attendance:", attendance_df.duplicated(subset=attendance_key).sum())
print("Weekly absences:", weekly_absences_df.duplicated(subset=weekly_key).sum())
print("30d risk:", absences_30d_df.duplicated(subset='enrollment_id').sum())
print("")

# Check nulls in key columns
print("=== NULLS IN KEY COLUMNS ===")
print("Students PK:", students_df['student_id'].isnull().sum())
print("Classes PK:", classes_df['class_id'].isnull().sum())
print("Enrollments PK:", enrollments_df['enrollment_id'].isnull().sum())
# For composite keys, count rows where ANY key column is missing; checking
# only enrollment_id would let a null date / year / week slip through.
print("Attendance PK:", attendance_df[attendance_key].isnull().any(axis=1).sum())
print("Weekly PK:", weekly_absences_df[weekly_key].isnull().any(axis=1).sum())
# Prints a per-column Series (one null count for each of the two columns).
print("30d PK / risk:", absences_30d_df[['enrollment_id','risk']].isnull().sum())
print("")

# Check active students
print("=== ACTIVE STUDENTS ===")
print(students_df['active'].value_counts())
print("")

=== TABLE SHAPES ===
Students: (372, 8)
Classes: (30, 6)
Enrollments: (372, 5)
Attendance: (8928, 3)
Weekly: (4248, 8)
30-day risk: (354, 3)
Interventions: (31, 5)

=== DUPLICATES ===
Students: 0
Classes: 0
Enrollments: 0
Attendance: 0
Weekly absences: 0
30d risk: 0

=== NULLS IN KEY COLUMNS ===
Students PK: 0
Classes PK: 0
Enrollments PK: 0
Attendance PK: 0
Weekly PK: 0
30d PK / risk: enrollment_id    0
risk             0
dtype: int64

=== ACTIVE STUDENTS ===
active
True     354
False     18
Name: count, dtype: int64



In [92]:
# 3. RISK CHECK
# Frequency of each risk label in the 30-day and weekly tables, followed by
# the number of distinct enrollment_ids flagged in either table.

print("=== 30-DAY RISK DISTRIBUTION ===")
print(absences_30d_df['risk'].value_counts(), end="\n\n")

print("=== WEEKLY RISK DISTRIBUTION ===")
print(weekly_absences_df['risk_status'].value_counts(), end="\n\n")

# Enrollments with at least one weekly row labelled Attention or Priority
weekly_flagged = weekly_absences_df['risk_status'].isin(['Attention','Priority'])
weekly_risk_ids = weekly_absences_df.loc[weekly_flagged, 'enrollment_id'].unique()

# Enrollments labelled Medium Risk or High Risk in the 30-day table
flagged_30d = absences_30d_df['risk'].isin(['Medium Risk','High Risk'])
risk_30d_ids = absences_30d_df.loc[flagged_30d, 'enrollment_id'].unique()

# Union (not intersection) of both sources: flagged by either table.
# NOTE(review): these are enrollment_ids; the printed "students" figure is a
# student count only if every student has exactly one enrollment - confirm.
at_risk_ids = set(weekly_risk_ids).union(risk_30d_ids)
print("Total students ever flagged as at risk:", len(at_risk_ids))
print("")


=== 30-DAY RISK DISTRIBUTION ===
risk
Normal         264
Medium Risk     78
High Risk       12
Name: count, dtype: int64

=== WEEKLY RISK DISTRIBUTION ===
risk_status
Normal       2587
Attention    1070
Priority      591
Name: count, dtype: int64

Total students ever flagged as at risk: 321



In [97]:
# 4. INTERVENTION CHECK
# ---------------------------
# Cross-references the intervention cases with the at-risk enrollment ids
# (at_risk_ids) built in the risk-check cell.

# Map interventions to enrollment.
# Left join: every intervention row is kept; a student with no matching
# enrollment gets a null enrollment_id, and a student with several enrollment
# rows would appear once per enrollment.
interventions_enrollment = interventions_df.merge(
    enrollments_df[['enrollment_id','student_id']],
    left_on='Student_ID',
    right_on='student_id',
    how='left'
)

# Check intervention status distribution.
# Counted on the interventions table itself, not on the merged frame, so a
# case is never counted more than once if the join above fans out.
print("=== INTERVENTION STATUS ===")
print(interventions_df['Case_Status'].value_counts())
print("")

# Optional: at-risk students with interventions
risk_interventions = interventions_enrollment[
    interventions_enrollment['enrollment_id'].isin(at_risk_ids)
]
# NOTE(review): counts distinct enrollment_ids; this is a student count only
# if each student has a single enrollment - confirm.
print("At-risk students with interventions:", risk_interventions['enrollment_id'].nunique())
print("")

# Students with pending intervention
pending_cases = risk_interventions[
    risk_interventions['Case_Status'].isin(['Open','In Progress'])
]
print("At-risk students with pending cases (Open or In Progress):", pending_cases['enrollment_id'].nunique())

=== INTERVENTION STATUS ===
Case_Status
Resolved       24
In Progress     5
Open            2
Name: count, dtype: int64

At-risk students with interventions: 31

At-risk students with pending cases (Open or In Progress): 7
