# New Section

In [2]:
import os
import pandas as pd

# Confirm the uploaded workbook is present in the Colab working directory
print("Files in /content/:")
content_listing = os.listdir('/content/')
print(content_listing)

# Read the attendance workbook into a DataFrame
file_path = '/content/QARAWChronic_Absent_Students_with_DOB_2025-05-29.xlsx'
df = pd.read_excel(file_path)

# Report dimensions and column names, then preview the first rows.
# NOTE: the preview shows student names and contact details; clear the
# output before sharing the notebook.
print("\nShape:", df.shape)
print("Columns:", df.columns.tolist())
df.head()

Files in /content/:
['.config', 'sample_data', 'drive', 'QARAWChronic_Absent_Students_with_DOB_2025-05-29.xlsx']

Shape: (18638, 28)
Columns: ['ID', 'LastName', 'FirstName', 'Birthdate', 'DT', 'DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate', 'AttGrp', 'SiteName', 'Gr', 'Gen', 'Eth', 'Fluency', 'Home Language', 'Special Ed Status', 'Cumulative Weighted Total GPA (TP)', 'Current Weighted Total GPA (GT)', 'AddressResidence', 'CityResidence', 'ZipResidence', 'ParentName', 'Telephone', 'PG_Email_1', 'SED Status', 'NumSusp', 'NumDaysSusp']


Unnamed: 0,ID,LastName,FirstName,Birthdate,DT,DaysEnr,DaysAbs,DaysPresent,AttRate,AttGrp,...,Current Weighted Total GPA (GT),AddressResidence,CityResidence,ZipResidence,ParentName,Telephone,PG_Email_1,SED Status,NumSusp,NumDaysSusp
0,443282,Aarif,Aslam,2010-07-02,2025-05-29,180,87,93,0.5167,Severe Chronic Absent,...,,1931 Myrtle St,Oakland,94607,Danyelle Aarif,5103055000.0,kkisa@yahoo.com,Not SED,,
1,436859,Abarca,Josiah,2018-05-31,2025-05-29,180,31,149,0.8278,Chronic Absent,...,,1001 105TH AVE,Oakland,94603,Roxana Aguilar,5106958000.0,Roxanaaguilar1011@yahoo.com,SED,,
2,435234,Abarca Carranza,Maura,2004-08-26,2025-05-29,180,121,59,0.3278,Severe Chronic Absent,...,0.0,6108 HARMON AVE,Oakland,94621,Jose Mauricio Polanco,5105411000.0,dayana.abarca1023@gmail.com,SED,,
3,408468,Abarca Climaco,Valeria,2016-03-09,2025-05-29,167,17,150,0.8982,Chronic Absent,...,,1001 105TH AVE,Oakland,94603,Edith Climaco / Jose Abarca,5104200000.0,edithclimaco87i@gmail.com,SED,,
4,440496,Abarca Escobar,Genesis,2015-11-24,2025-05-29,180,20,160,0.8889,Chronic Absent,...,,1058 75TH AVE,Oakland,94621,Angela Escobar,2098087000.0,alegriaesc0@gmail.com,SED,,


In [3]:
# Data-quality overview of the raw frame: missing values, dtypes,
# duplicates, numeric ranges and category frequencies.
banner = "=" * 50
print(banner)
print("DATA QUALITY ASSESSMENT")
print(banner)

# 1. Missing values by column (only columns with at least one gap are shown)
print("\nMissing Values:")
missing = df.isna().sum()
missing_pct = missing / len(df) * 100
missing_df = pd.DataFrame(
    {'Missing_Count': missing, 'Missing_Percent': missing_pct}
).sort_values('Missing_Percent', ascending=False)
print(missing_df.loc[missing_df['Missing_Count'] > 0])

# 2. Data types
print("\n" + banner)
print("Data Types:")
print(df.dtypes)

# 3. Duplicates: fully identical rows vs. repeated student IDs
print("\n" + banner)
n_duplicate_rows = df.duplicated().sum()
n_duplicate_ids = df['ID'].duplicated().sum()
print(f"Duplicate rows: {n_duplicate_rows}")
print(f"Duplicate IDs: {n_duplicate_ids}")

# 4. Min/max and count of negative entries for each numeric column present
print("\n" + banner)
print("Numeric Column Ranges:")
numeric_cols = ['DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate', 'NumSusp', 'NumDaysSusp']
for col in numeric_cols:
    if col not in df.columns:
        continue
    column_values = df[col]
    print(f"\n{col}:")
    print(f"  Min: {column_values.min()}, Max: {column_values.max()}")
    print(f"  Negative values: {column_values.lt(0).sum()}")

# 5. Distinct-value counts and frequencies for each categorical column present
print("\n" + banner)
print("Categorical Column Unique Values:")
cat_cols = ['AttGrp', 'Gr', 'Gen', 'Eth', 'Fluency', 'Special Ed Status', 'SED Status']
for col in cat_cols:
    if col not in df.columns:
        continue
    column_values = df[col]
    print(f"\n{col}: {column_values.nunique()} unique values")
    print(column_values.value_counts())

DATA QUALITY ASSESSMENT

Missing Values:
                                    Missing_Count  Missing_Percent
NumSusp                                     17683        94.876060
NumDaysSusp                                 17683        94.876060
Cumulative Weighted Total GPA (TP)          11852        63.590514
Current Weighted Total GPA (GT)             11852        63.590514
PG_Email_1                                    123         0.659942
Telephone                                      43         0.230711
ParentName                                      9         0.048288
AddressResidence                                1         0.005365

Data Types:
ID                                             int64
LastName                                      object
FirstName                                     object
Birthdate                             datetime64[ns]
DT                                    datetime64[ns]
DaysEnr                                        int64
DaysAbs                  

In [4]:
# Work on a copy so the raw frame `df` stays untouched
df_clean = df.copy()

print("CLEANING STEPS")
print("="*50)

# STEP 1: collect every row whose ID occurs more than once, ordered by ID
print("\nSTEP 1: Investigating Duplicate IDs")
is_repeated_id = df_clean['ID'].duplicated(keep=False)
duplicate_ids = df_clean.loc[is_repeated_id].sort_values('ID')
print(f"Total rows with duplicate IDs: {len(duplicate_ids)}")
print("\nSample of duplicates:")
preview_cols = ['ID', 'LastName', 'FirstName', 'SiteName', 'Gr', 'DT']
print(duplicate_ids[preview_cols].head(10))

# Per-ID count of distinct schools, grades and snapshot dates, to tell
# repeated snapshots apart from exact duplicates
print("\nAre duplicates at different schools or grades?")
distinct_counts = dict.fromkeys(['SiteName', 'Gr', 'DT'], 'nunique')
dup_check = duplicate_ids.groupby('ID').agg(distinct_counts)
print(dup_check.head())

CLEANING STEPS

STEP 1: Investigating Duplicate IDs
Total rows with duplicate IDs: 1964

Sample of duplicates:
           ID     LastName  FirstName                       SiteName  Gr  \
16957  270598       Watson  Brooklynn            Young Adult Program  15   
17956  270598       Watson  Brooklynn            Young Adult Program  15   
18561  273093  Zavala-Cruz   Kimberly      Madison Park Academy 6-12  12   
17528  273093  Zavala-Cruz   Kimberly      Madison Park Academy 6-12  12   
17388  274130          Xie    Allison  Oakland Technical High School  12   
18406  274130          Xie    Allison  Oakland Technical High School  12   
17569  274140         Zhen   Michelle  Oakland Technical High School  12   
18604  274140         Zhen   Michelle  Oakland Technical High School  12   
18603  274141         Zhen     Elaine  Oakland Technical High School  12   
17568  274141         Zhen     Elaine  Oakland Technical High School  12   

              DT  
16957 2025-05-29  
17956 2025-05-

We have some duplicates of the same child in the same year and the same grade, so we will investigate this further to determine next steps.


In [5]:
# Show the snapshot dates recorded for each duplicated ID
print("Date patterns in duplicates:")
print(duplicate_ids.groupby('ID')['DT'].apply(list).head(10))

# Compare the attendance metrics across snapshot dates for one student
print("\n" + "="*50)
print("Do attendance metrics differ between dates?")
sample_id = 270598
sample_student = df_clean[df_clean['ID'] == sample_id][['ID', 'LastName', 'FirstName',
                                                          'DT', 'DaysEnr', 'DaysAbs',
                                                          'DaysPresent', 'AttRate', 'AttGrp']]
print(sample_student)

# Check whether the pattern holds for the first few duplicated IDs
print("\n" + "="*50)
print("Summary of differences in duplicate records:")
for id_val in duplicate_ids['ID'].unique()[:5]:
    # Rows arrive in file order, which puts the newer snapshot first for some
    # IDs and second for others. Sorting by date makes the difference always
    # "latest minus earliest", so its sign is comparable across students.
    temp = df_clean[df_clean['ID'] == id_val].sort_values('DT')
    if len(temp) == 2:
        diff = temp['DaysAbs'].iloc[-1] - temp['DaysAbs'].iloc[0]
        print(f"ID {id_val}: Absence difference = {diff} days")

Date patterns in duplicates:
ID
270598    [2025-05-29 00:00:00, 2025-05-28 00:00:00]
273093    [2025-05-28 00:00:00, 2025-05-29 00:00:00]
274130    [2025-05-29 00:00:00, 2025-05-28 00:00:00]
274140    [2025-05-29 00:00:00, 2025-05-28 00:00:00]
274141    [2025-05-28 00:00:00, 2025-05-29 00:00:00]
274224    [2025-05-29 00:00:00, 2025-05-28 00:00:00]
274227    [2025-05-29 00:00:00, 2025-05-28 00:00:00]
274435    [2025-05-28 00:00:00, 2025-05-29 00:00:00]
274543    [2025-05-29 00:00:00, 2025-05-28 00:00:00]
274894    [2025-05-28 00:00:00, 2025-05-29 00:00:00]
Name: DT, dtype: object

Do attendance metrics differ between dates?
           ID LastName  FirstName         DT  DaysEnr  DaysAbs  DaysPresent  \
16957  270598   Watson  Brooklynn 2025-05-29      179       10          169   
17956  270598   Watson  Brooklynn 2025-05-28      178       10          168   

       AttRate   AttGrp  
16957   0.9441  At Risk  
17956   0.9438  At Risk  

Summary of differences in duplicate records:
ID 2705

**Duplicates**
This shows the duplicates are of the same child at different times; we will keep the most current record and drop the rest. Keeping all of the dates gives us no additional information; it would only inflate any averaging we do in the future. If we kept the record with the most absences, we could potentially miss something the admin corrected, such as a child who was actually present on a day previously marked absent. Out of an abundance of caution and to prevent any bias, we have kept the most recent recording.

**Suspension Missing Values**
While inputting the data, Suspension was left blank when a child was not suspended. Since this is classified as MNAR, we have replaced NaN with 0 for future calculations.


**GPA**
Only secondary students have GPA: elementary students were kept NaN, and any GPA calculations should be filtered for secondary only.

**Missing Parental Data**
We have flagged missing parental data and created true/false columns so we don't lose the data. This provides a to-do list for updating contact information and flags children whose parents may be more difficult to contact.

**Standardized Categorical Data**
-2=Pre-K
-1=T-K
Provides clarity for reporting and visualizations

**New Columns**
1. Chronic Absenteeism Flag
Federal definition of Chronic Absenteeism
90% or 18 out of 180 enrolled days.
2. Absence Severity Categories

 1.00 (100%) ← Perfect attendance
     ↓
0.90 (90%)  ← FEDERAL CHRONIC THRESHOLD (everything below is chronic)
     ║
0.80 (80%)  ← HIGH CHRONIC THRESHOLD  
     ║
0.70 (70%)  ← SEVERE THRESHOLD
     ↓
0.00 (0%)   ← Never present

3. School Level

Creates easy comparison variables

**Parental Choice**
Early Childhood (before kinder)
Elementary (k-5)
Middle (6-8)

**Influenced by Child**
High-School (9-12)

**Heavily Influenced by Child/Young Adults**
Post Secondary (Beyond 12)



Checked for validation and verified total days.


In [9]:
# CHRONIC ABSENTEEISM DATA CLEANING PIPELINE
# This script cleans and prepares chronic absenteeism data for analysis
# Author: Quiana
# Date: 2025-02-07
import pandas as pd
import numpy as np

# Read the raw workbook from the Colab upload location
print("LOADING DATA")
print("="*70)
file_path = '/content/QARAWChronic_Absent_Students_with_DOB_2025-05-29.xlsx'
df = pd.read_excel(file_path)
n_rows, n_cols = df.shape
print(f"‚úì Loaded {n_rows:,} rows and {n_cols} columns")

# Every cleaning step below operates on this copy; `df` keeps the raw data
df_clean = df.copy()

# STEP 1: REMOVE DUPLICATES
print("\nSTEP 1: REMOVING DUPLICATE IDs")
print("="*70)
rows_before = len(df_clean)
print(f"Before: {rows_before:,} rows")

# Order snapshots newest-first, then retain the first row seen for each ID.
# Kept as one chained expression so no intermediate frame stays referenced.
df_clean = (
    df_clean
    .sort_values('DT', ascending=False)
    .drop_duplicates(subset='ID', keep='first')
)

rows_after = len(df_clean)
print(f"After: {rows_after:,} rows")
print(f"Removed: {len(df) - rows_after:,} duplicate records")
print("‚úì Kept most recent date (2025-05-29) for each student")

# STEP 2: HANDLE MISSING VALUES
print("\nSTEP 2: HANDLING MISSING VALUES")
print("="*70)

# Suspension counts: a blank is treated as "no suspension", so it becomes 0
for susp_col in ('NumSusp', 'NumDaysSusp'):
    df_clean[susp_col] = df_clean[susp_col].fillna(0)
print("‚úì Filled suspension NaNs with 0 (no suspension recorded)")

# GPA columns are deliberately left untouched (NaN is kept)
print("‚úì Keeping GPA as NaN (not applicable for elementary students)")

# Parent contact: add boolean flags marking rows with no email / no phone
df_clean['Missing_Email'] = df_clean['PG_Email_1'].isna()
df_clean['Missing_Phone'] = df_clean['Telephone'].isna()
n_missing_email = df_clean['Missing_Email'].sum()
n_missing_phone = df_clean['Missing_Phone'].sum()
print(f"‚úì Flagged {n_missing_email:,} students with missing email")
print(f"‚úì Flagged {n_missing_phone:,} students with missing phone")

# STEP 3: STANDARDIZE CATEGORICAL VARIABLES
print("\nSTEP 3: STANDARDIZING CATEGORIES")
print("="*70)


def _grade_label(grade):
    """Turn a numeric grade code into a display label.

    -2 maps to 'Pre-K', -1 to 'TK', any other value up to 12 to
    'Grade <value>', and everything else to 'Post-Secondary'.
    """
    if grade == -2:
        return 'Pre-K'
    if grade == -1:
        return 'TK'
    if grade <= 12:
        return f'Grade {grade}'
    return 'Post-Secondary'


# Create readable grade labels
df_clean['Grade_Label'] = df_clean['Gr'].apply(_grade_label)
print("‚úì Created readable grade labels (Pre-K, TK, Grade 0-12, Post-Secondary)")

# STEP 4: CREATE DERIVED COLUMNS
print("\nSTEP 4: CREATING DERIVED COLUMNS")
print("="*70)

# Chronic flag: attendance strictly below 90%.
# NOTE(review): some reporting definitions count "absent 10% or more"
# (AttRate <= 0.90) as chronic; confirm which boundary is intended.
df_clean['Is_Chronic'] = df_clean['AttRate'] < 0.90
print("‚úì Created chronic absenteeism flag (<90% attendance = chronic)")

# 4-tier absence severity (better intervention targeting).
# Bins are left-closed (right=False) so each tier matches the ranges printed
# below and the Is_Chronic flag:
#   [0, 0.70) Severe | [0.70, 0.80) High Chronic |
#   [0.80, 0.90) Moderate Chronic | [0.90, 1.0] Low Risk
# With pandas' default right-closed bins, a rate of exactly 70%, 80% or 90%
# fell one tier too severe, so a student at exactly 90% was labelled
# 'Moderate Chronic' while Is_Chronic was False.
df_clean['Absence_Severity'] = pd.cut(
    df_clean['AttRate'],
    bins=[0, 0.70, 0.80, 0.90, 1.0],
    labels=['Severe', 'High Chronic', 'Moderate Chronic', 'Low Risk'],
    right=False
)

# right=False leaves the top edge open, so perfect attendance (exactly 1.0)
# has to be assigned to the last tier explicitly. A rate of exactly 0.0 is
# already inside the first left-closed bin and needs no special handling.
df_clean.loc[df_clean['AttRate'] == 1.0, 'Absence_Severity'] = 'Low Risk'

print("‚úì Created 4-tier absence severity categories:")
print("  ‚Ä¢ Severe (<70%): Crisis intervention needed")
print("  ‚Ä¢ High Chronic (70-79%): Intensive support needed")
print("  ‚Ä¢ Moderate Chronic (80-89%): Active monitoring needed")
print("  ‚Ä¢ Low Risk (90-100%): Minimal intervention needed")

def _school_level(grade):
    """Map a numeric grade code to its school-level band."""
    if grade < 0:
        return 'Early Childhood'
    if 0 <= grade <= 5:
        return 'Elementary'
    if 6 <= grade <= 8:
        return 'Middle'
    if 9 <= grade <= 12:
        return 'High School'
    # Anything not matched above falls through to the last band
    return 'Post-Secondary'


# School level grouping
df_clean['School_Level'] = df_clean['Gr'].apply(_school_level)
print("‚úì Created school level categories (Early Childhood, Elementary, Middle, High School, Post-Secondary)")

# STEP 5: DATA VALIDATION
print("\nSTEP 5: DATA VALIDATION CHECKS")
print("="*70)

# Recompute the rate from the day counts and compare it with the reported
# AttRate, allowing a tolerance of 0.01
df_clean['Calc_AttRate'] = df_clean['DaysPresent'].div(df_clean['DaysEnr'])
rate_gap = (df_clean['AttRate'] - df_clean['Calc_AttRate']).abs()
df_clean['AttRate_Match'] = rate_gap.lt(0.01)
matches = df_clean['AttRate_Match'].sum()
print(f"‚úì Attendance rate calculation verified: {matches:,}/{len(df_clean):,} records match")

# Present + absent days should add up to enrolled days
days_accounted = df_clean['DaysPresent'].add(df_clean['DaysAbs'])
df_clean['Days_Check'] = days_accounted.eq(df_clean['DaysEnr'])
matches = df_clean['Days_Check'].sum()
print(f"‚úì Days enrolled = Present + Absent: {matches:,}/{len(df_clean):,} records match")

# Count rows that did not receive a severity tier
missing_severity = df_clean['Absence_Severity'].isna().sum()
print(f"‚úì Missing Absence_Severity: {missing_severity} records")

# FINAL SUMMARY REPORT
# Prints headline counts and percentages for the cleaned roster.
print("\nCLEANED DATASET SUMMARY")
print("="*70)

total_students = len(df_clean)  # denominator for every percentage below

print("\nüìä OVERVIEW")
first_date = df_clean['DT'].min().date()
last_date = df_clean['DT'].max().date()
n_schools = df_clean['SiteName'].nunique()
print(f"{'Total Students:':<40} {total_students:>10,}")
print(f"{'Date Range:':<40} {first_date} to {last_date}")
print(f"{'Number of Schools:':<40} {n_schools:>10,}")

# Students per severity tier, listed from most to least severe
print("\nüìà ATTENDANCE BREAKDOWN")
severity_order = ['Severe', 'High Chronic', 'Moderate Chronic', 'Low Risk']
for category in severity_order:
    count = df_clean['Absence_Severity'].eq(category).sum()
    pct = count / total_students * 100
    print(f"  {category:<25} {count:>6,} students ({pct:>5.1f}%)")

chronic_count = df_clean['Is_Chronic'].sum()
chronic_pct = chronic_count / total_students * 100
print(f"\n{'Chronic Absenteeism Rate:':<40} {chronic_pct:>5.1f}% ({chronic_count:,} students)")

print("\nüë• DEMOGRAPHICS")
print("\nSchool Level:")
for level in ['Early Childhood', 'Elementary', 'Middle', 'High School', 'Post-Secondary']:
    count = df_clean['School_Level'].eq(level).sum()
    pct = count / total_students * 100
    print(f"  {level:<25} {count:>6,} ({pct:>5.1f}%)")

# Five most frequent ethnicity values
print("\nEthnicity (Top 5):")
eth_counts = df_clean['Eth'].value_counts().head(5)
for eth, count in eth_counts.items():
    pct = count / total_students * 100
    print(f"  {eth:<25} {count:>6,} ({pct:>5.1f}%)")

# Three most frequent fluency values
print("\nEnglish Fluency:")
fluency_counts = df_clean['Fluency'].value_counts().head(3)
for fluency, count in fluency_counts.items():
    pct = count / total_students * 100
    print(f"  {fluency:<25} {count:>6,} ({pct:>5.1f}%)")

sped_count = df_clean['Special Ed Status'].eq('Special Ed').sum()
sped_pct = sped_count / total_students * 100
print(f"\n{'Special Education:':<40} {sped_count:>6,} ({sped_pct:>5.1f}%)")

sed_count = df_clean['SED Status'].eq('SED').sum()
sed_pct = sed_count / total_students * 100
print(f"{'Socioeconomically Disadvantaged:':<40} {sed_count:>6,} ({sed_pct:>5.1f}%)")

print("\n‚ö†Ô∏è  DISCIPLINE")
susp_students = df_clean['NumSusp'].gt(0).sum()
total_susp = df_clean['NumSusp'].sum()
total_days = df_clean['NumDaysSusp'].sum()
print(f"{'Students with Suspensions:':<40} {susp_students:>10,}")
print(f"{'Total Suspensions:':<40} {total_susp:>10,.0f}")
print(f"{'Total Suspension Days:':<40} {total_days:>10,.0f}")

print("\nüìû CONTACT INFORMATION")
print(f"{'Missing Parent Email:':<40} {df_clean['Missing_Email'].sum():>10,}")
print(f"{'Missing Phone:':<40} {df_clean['Missing_Phone'].sum():>10,}")

# Schools ranked by the number of students flagged Is_Chronic
print("\nüè´ TOP 5 SCHOOLS BY CHRONIC ABSENCE COUNT")
chronic_rows = df_clean[df_clean['Is_Chronic']]
school_chronic = (
    chronic_rows
    .groupby('SiteName')
    .size()
    .sort_values(ascending=False)
    .head(5)
)
for rank, (school, count) in enumerate(school_chronic.items(), start=1):
    print(f"  {rank}. {school:<45} {count:>4,} students")

# CREATE ANALYSIS-READY DATASET
print("\nCREATING ANALYSIS-READY DATASET")
print("="*70)

# Core columns for analysis, assembled group by group; the concatenation
# order is the column order of the exported frame.
identity_cols = ['ID', 'LastName', 'FirstName', 'Birthdate', 'DT']
placement_cols = ['SiteName', 'Grade_Label', 'School_Level']
demographic_cols = ['Gen', 'Eth', 'Fluency', 'Home Language']
program_cols = ['Special Ed Status', 'SED Status']
attendance_cols = ['DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate']
derived_attendance_cols = ['Is_Chronic', 'Absence_Severity']
discipline_cols = ['NumSusp', 'NumDaysSusp']
gpa_cols = ['Cumulative Weighted Total GPA (TP)', 'Current Weighted Total GPA (GT)']
contact_cols = ['ParentName', 'Telephone', 'PG_Email_1']
contact_flag_cols = ['Missing_Email', 'Missing_Phone']
address_cols = ['AddressResidence', 'CityResidence', 'ZipResidence']

core_cols = (
    identity_cols
    + placement_cols
    + demographic_cols
    + program_cols
    + attendance_cols
    + derived_attendance_cols
    + discipline_cols
    + gpa_cols
    + contact_cols
    + contact_flag_cols
    + address_cols
)

# Independent copy, so later edits do not write back into df_clean
df_analysis = df_clean.loc[:, core_cols].copy()

n_analysis_rows = len(df_analysis)
n_analysis_cols = len(core_cols)
print(f"‚úì Created streamlined dataset: {n_analysis_rows:,} rows √ó {n_analysis_cols} columns")
print("‚úì Ready for analysis and visualization")

# COMPLETION
# Closing banner: names the two resulting frames and lists the added columns.
print("\n‚úÖ DATA CLEANING COMPLETE")
print("="*70)
print("Cleaned dataset available in: df_clean")
print("Analysis-ready dataset in: df_analysis")
print("\nNew columns created:")
for new_column_note in (
    "  ‚Ä¢ Missing_Email, Missing_Phone (contact flags)",
    "  ‚Ä¢ Grade_Label (human-readable grades)",
    "  ‚Ä¢ Is_Chronic (boolean chronic flag)",
    "  ‚Ä¢ Absence_Severity (4-tier severity)",
    "  ‚Ä¢ School_Level (grade groupings)",
):
    print(new_column_note)

LOADING DATA
‚úì Loaded 18,638 rows and 28 columns

STEP 1: REMOVING DUPLICATE IDs
Before: 18,638 rows
After: 17,656 rows
Removed: 982 duplicate records
‚úì Kept most recent date (2025-05-29) for each student

STEP 2: HANDLING MISSING VALUES
‚úì Filled suspension NaNs with 0 (no suspension recorded)
‚úì Keeping GPA as NaN (not applicable for elementary students)
‚úì Flagged 119 students with missing email
‚úì Flagged 40 students with missing phone

STEP 3: STANDARDIZING CATEGORIES
‚úì Created readable grade labels (Pre-K, TK, Grade 0-12, Post-Secondary)

STEP 4: CREATING DERIVED COLUMNS
‚úì Created chronic absenteeism flag (<90% attendance = chronic)
‚úì Created 4-tier absence severity categories:
  ‚Ä¢ Severe (<70%): Crisis intervention needed
  ‚Ä¢ High Chronic (70-79%): Intensive support needed
  ‚Ä¢ Moderate Chronic (80-89%): Active monitoring needed
  ‚Ä¢ Low Risk (90-100%): Minimal intervention needed
‚úì Created school level categories (Early Childhood, Elementary, Middle, High 

In [10]:
# Write the analysis-ready frame to disk in both formats, without the row index.
# NOTE: the exported columns include student names and parent contact details.
csv_path = 'cleaned_chronic_absence_data.csv'
xlsx_path = 'cleaned_chronic_absence_data.xlsx'

# Save as CSV
df_analysis.to_csv(csv_path, index=False)

# Save as Excel
df_analysis.to_excel(xlsx_path, index=False)
