In [1]:
import pandas as pd

# Read the Excel file
df = pd.read_excel('ONGB_EvalData_Complete_Anonymized.xlsx')

# Display basic info about the DataFrame
print(df.shape)
print(df.head())

(79460, 122)
   ANON_ID  Birthdate Gen          Eth_1718 Fluency_1718       SpEd_1718  \
0        1 1997-08-21   F             Asian         RFEP  Not Special Ed   
1        2 1999-10-10   F             Asian           EL  Not Special Ed   
2        3 2019-05-09   F               NaN          NaN             NaN   
3        4 2007-07-05   F  African American           EO  Not Special Ed   
4        5 2016-01-26   M               NaN          NaN             NaN   

              SiteName_1718 School Address_1718 City_1718  Zip_1718  ...  \
0  Oakland International HS     4521 Webster St   Oakland   94609.0  ...   
1  Oakland International HS     4521 Webster St   Oakland   94609.0  ...   
2                       NaN                 NaN       NaN       NaN  ...   
3         EnCompass Academy    1025 81st Avenue   Oakland   94621.0  ...   
4                       NaN                 NaN       NaN       NaN  ...   

   Grade_2324  AttRate_2324  DaysEnr_2324  DaysAbs_2324  Susp_2324  \
0  

In [2]:
import numpy as np

# Basic exploration
print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())

# Check missing data
print("\nMissing data summary:")
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': df.isnull().sum(),
    'Missing_Percent': (df.isnull().sum() / len(df) * 100).round(2)
})
missing_summary = missing_summary[missing_summary['Missing_Count'] > 0].sort_values('Missing_Percent', ascending=False)
print(missing_summary)

# Check data types
print("\nData types:")
print(df.dtypes.value_counts())

# Basic statistics for key columns
print("\nGrade distribution:")
grade_cols = [col for col in df.columns if 'Grade_' in col]
for col in grade_cols:
    print(f"\n{col}:")
    print(df[col].value_counts().sort_index())

# Attendance rate summary
att_cols = [col for col in df.columns if 'AttRate_' in col]
print("\nAttendance rate summary:")
print(df[att_cols].describe())

Dataset shape: (79460, 122)

Column names:
['ANON_ID', 'Birthdate', 'Gen', 'Eth_1718', 'Fluency_1718', 'SpEd_1718', 'SiteName_1718', 'School Address_1718', 'City_1718', 'Zip_1718', 'Grade_1718', 'AttRate_1718', 'DaysEnr_1718', 'DaysAbs_1718', 'Susp_1718', 'Address_1718', 'City_1718.1', 'Zip_1718.1', 'CurrWeightedTotGPA_1718', 'SED_1718', 'Eth_1819', 'Fluency_1819', 'SpEd_1819', 'SiteName_1819', 'School Address_1819', 'City_1819', 'Zip_1819', 'Grade_1819', 'AttRate_1819', 'DaysEnr_1819', 'DaysAbs_1819', 'Susp_1819', 'Address_1819', 'City_1819.1', 'Zip_1819.1', 'CurrWeightedTotGPA_1819', 'SED_1819', 'Eth_1920', 'Fluency_1920', 'SpEd_1920', 'SiteName_1920', 'School Address_1920', 'City_1920', 'Zip_1920', 'Grade_1920', 'AttRate_1920', 'DaysEnr_1920', 'DaysAbs_1920', 'Susp_1920', 'Address_1920', 'City_1920.1', 'Zip_1920.1', 'CurrWeightedTotGPA_1920', 'SED_1920', 'Eth_2021', 'Fluency_2021', 'SpEd_2021', 'SiteName_2021', 'School Address_2021', 'City_2021', 'Zip_2021', 'Grade_2021', 'AttRate_2

Checking Data Quailty: Birthdays and Gender  

In [3]:
# Step 1: Examine the actual data quality issues
print("Checking key data quality issues:\n")

# Check for duplicate students
print(f"Total students: {df['ANON_ID'].nunique()}")
print(f"Total rows: {len(df)}")
print(f"Duplicates: {len(df) - df['ANON_ID'].nunique()}\n")

# Check birthdate validity
print("Birthdate issues:")
df['Birthdate'] = pd.to_datetime(df['Birthdate'], errors='coerce')
print(f"Invalid birthdates: {df['Birthdate'].isnull().sum()}")
print(f"Birthdate range: {df['Birthdate'].min()} to {df['Birthdate'].max()}\n")

# Check gender values
print("Gender distribution:")
print(df['Gen'].value_counts(dropna=False))

Checking key data quality issues:

Total students: 79460
Total rows: 79460
Duplicates: 0

Birthdate issues:
Invalid birthdates: 0
Birthdate range: 1996-03-26 00:00:00 to 2019-08-16 00:00:00

Gender distribution:
Gen
M    41291
F    38029
N      138
m        2
Name: count, dtype: int64


Checking quality of attendance/enrollment 

In [4]:
# Step 2: Check attendance and enrollment data quality

# Check attendance rates across all years
att_cols = [col for col in df.columns if 'AttRate_' in col]
print("Attendance rate ranges by year:")
for col in att_cols:
    valid_data = df[col].dropna()
    print(f"{col}: min={valid_data.min():.3f}, max={valid_data.max():.3f}, n={len(valid_data)}")

print("\n" + "="*50 + "\n")

# Check for impossible values (AttRate > 1 or < 0)
print("Checking for impossible attendance rates:")
for col in att_cols:
    invalid = ((df[col] > 1) | (df[col] < 0)).sum()
    if invalid > 0:
        print(f"{col}: {invalid} impossible values")

print("\n" + "="*50 + "\n")

# Check days enrolled vs days absent
days_enr_cols = [col for col in df.columns if 'DaysEnr_' in col]
days_abs_cols = [col for col in df.columns if 'DaysAbs_' in col]

print("Checking DaysAbs > DaysEnr (impossible):")
for enr_col, abs_col in zip(days_enr_cols, days_abs_cols):
    year = enr_col.split('_')[1]
    impossible = (df[abs_col] > df[enr_col]).sum()
    if impossible > 0:
        print(f"Year {year}: {impossible} cases where DaysAbs > DaysEnr")

Attendance rate ranges by year:
AttRate_1718: min=0.000, max=1.000, n=39929
AttRate_1819: min=0.000, max=1.000, n=39579
AttRate_1920: min=0.000, max=1.000, n=38839
AttRate_2021: min=0.000, max=1.000, n=37558
AttRate_2122: min=0.000, max=1.000, n=36153
AttRate_2223: min=0.000, max=1.000, n=36552
AttRate_2324: min=0.000, max=1.000, n=36695


Checking for impossible attendance rates:


Checking DaysAbs > DaysEnr (impossible):


Checking for categorical inconsistency with grades and enthicity.

In [5]:
# Step 3: Check categorical variables for consistency

print("Grade distribution across all years:")
grade_cols = [col for col in df.columns if 'Grade_' in col]
for col in grade_cols:
    print(f"\n{col}:")
    print(df[col].value_counts(dropna=False).sort_index())

print("\n" + "="*50 + "\n")

# Check ethnicity categories
print("Ethnicity categories by year:")
eth_cols = [col for col in df.columns if 'Eth_' in col]
for col in eth_cols:
    print(f"\n{col}: {df[col].nunique()} unique values")
    print(df[col].value_counts(dropna=False).head(10))

Grade distribution across all years:

Grade_1718:
Grade_1718
-1.0       835
 0.0      3645
 1.0      3611
 2.0      3494
 3.0      3548
 4.0      3482
 5.0      3343
 6.0      2717
 7.0      2577
 8.0      2547
 9.0      2781
 10.0     2786
 11.0     2666
 12.0     2593
 NaN     38835
Name: count, dtype: int64

Grade_1819:
Grade_1819
-1.0       690
 0.0      3557
 1.0      3610
 2.0      3482
 3.0      3426
 4.0      3414
 5.0      3419
 6.0      2541
 7.0      2657
 8.0      2581
 9.0      2725
 10.0     2706
 11.0     2634
 12.0     2771
 NaN     39247
Name: count, dtype: int64

Grade_1920:
Grade_1920
-1.0       654
 0.0      3394
 1.0      3478
 2.0      3447
 3.0      3403
 4.0      3339
 5.0      3292
 6.0      2554
 7.0      2516
 8.0      2646
 9.0      2827
 10.0     2648
 11.0     2580
 12.0     2787
 NaN     39895
Name: count, dtype: int64

Grade_2021:
Grade_2021
-1.0       608
 0.0      3078
 1.0      3266
 2.0      3287
 3.0      3279
 4.0      3279
 5.0      3170
 6.0     

Quailty of Special Ed, Language, Socioeconomical Disadvantage, Unique School Id

In [6]:
# Step 4: Check remaining categorical variables

print("Special Education status:")
sped_cols = [col for col in df.columns if 'SpEd_' in col]
print(df[sped_cols[0]].value_counts(dropna=False))

print("\n" + "="*50 + "\n")

print("English Fluency status:")
fluency_cols = [col for col in df.columns if 'Fluency_' in col]
print(df[fluency_cols[0]].value_counts(dropna=False))

print("\n" + "="*50 + "\n")

print("Socioeconomic Disadvantage (SED) status:")
sed_cols = [col for col in df.columns if 'SED_' in col]
print(df[sed_cols[0]].value_counts(dropna=False))

print("\n" + "="*50 + "\n")

print("Number of unique schools per year:")
school_cols = [col for col in df.columns if 'SiteName_' in col]
for col in school_cols:
    print(f"{col}: {df[col].nunique()} schools")

Special Education status:
SpEd_1718
NaN               38835
Not Special Ed    34939
Special Ed         5686
Name: count, dtype: int64


English Fluency status:
Fluency_1718
NaN     38835
EO      19842
EL      12671
RFEP     6767
IFEP     1135
TBD       210
Name: count, dtype: int64


Socioeconomic Disadvantage (SED) status:
SED_1718
Unknown    40625
NaN        38835
Name: count, dtype: int64


Number of unique schools per year:
SiteName_1718: 88 schools
SiteName_1819: 89 schools
SiteName_1920: 85 schools
SiteName_2021: 84 schools
SiteName_2122: 82 schools
SiteName_2223: 81 schools
SiteName_2324: 81 schools


1. Standardizing Gender 
a. capital M for Male throughout 
2. Creating Chroninic Absenteeism flags
a. Creating absenteeism rows for chronically absent students
b. Creating a standard definition for children missing 10% or more days.
c. NAN for students not enrolled that yr 
3. Calculating age each school 
a. example 1718(year)_7(age)
b. starts each school yr in september 
c. calculates age w/ 1 decimal 



In [7]:
# Step 5: Create cleaned dataset with standardized values

df_clean = df.copy()

# 1. Standardize gender (lowercase 'm' to 'M')
df_clean['Gen'] = df_clean['Gen'].replace({'m': 'M'})

print("Cleaned Gender distribution:")
print(df_clean['Gen'].value_counts(dropna=False))

print("\n" + "="*50 + "\n")

# 2. Create chronic absenteeism flags (< 90% attendance)
att_cols = [col for col in df_clean.columns if 'AttRate_' in col]

for col in att_cols:
    year = col.split('_')[1]
    df_clean[f'ChronicAbs_{year}'] = (df_clean[col] < 0.90).astype(float)
    # Keep as NaN where AttRate is NaN (not enrolled)
    df_clean.loc[df_clean[col].isna(), f'ChronicAbs_{year}'] = np.nan

print("Chronic absenteeism rates by year:")
for year in ['1718', '1819', '1920', '2021', '2122', '2223', '2324']:
    col = f'ChronicAbs_{year}'
    total = df_clean[col].notna().sum()
    chronic = df_clean[col].sum()
    rate = (chronic / total * 100) if total > 0 else 0
    print(f"{year}: {chronic:.0f}/{total:.0f} ({rate:.1f}%)")

print("\n" + "="*50 + "\n")

# 3. Calculate age at each school year
df_clean['Birthdate'] = pd.to_datetime(df_clean['Birthdate'])

for year in ['1718', '1819', '1920', '2021', '2122', '2223', '2324']:
    # School year starts in fall, so use Sept 1st of start year
    year_start = pd.to_datetime(f"20{year[:2]}-09-01")
    df_clean[f'Age_{year}'] = ((year_start - df_clean['Birthdate']).dt.days / 365.25).round(1)

print("Age distribution for 2023-24:")
print(df_clean['Age_2324'].describe())

print(f"\nCleaned dataset shape: {df_clean.shape}")

Cleaned Gender distribution:
Gen
M    41293
F    38029
N      138
Name: count, dtype: int64


Chronic absenteeism rates by year:
1718: 6488/39929 (16.2%)
1819: 13381/39579 (33.8%)
1920: 7442/38839 (19.2%)
2021: 7561/37558 (20.1%)
2122: 15820/36153 (43.8%)
2223: 21296/36552 (58.3%)
2324: 11992/36695 (32.7%)


Age distribution for 2023-24:
count    79460.000000
mean        14.138155
std          5.157163
min          4.000000
25%         10.000000
50%         14.100000
75%         18.100000
max         27.400000
Name: Age_2324, dtype: float64

Cleaned dataset shape: (79460, 136)


In [8]:
# Complete the cleaning from Step 5
print("Age distribution for 2023-24:")
print(df_clean['Age_2324'].describe())

print(f"\nCleaned dataset shape: {df_clean.shape}")
print(f"New columns added: {df_clean.shape[1] - df.shape[1]}")

Age distribution for 2023-24:
count    79460.000000
mean        14.138155
std          5.157163
min          4.000000
25%         10.000000
50%         14.100000
75%         18.100000
max         27.400000
Name: Age_2324, dtype: float64

Cleaned dataset shape: (79460, 136)
New columns added: 14


In [9]:
# Analyze missing data patterns by variable type

years = ['1718', '1819', '1920', '2021', '2122', '2223', '2324']

for year in years:
    enrolled = df_clean[f'AttRate_{year}'].notna().sum()
    total = len(df_clean)
    
    print(f"\n{year}: {enrolled} enrolled ({enrolled/total*100:.1f}%)")
    
    # For enrolled students, check other missing data
    enrolled_mask = df_clean[f'AttRate_{year}'].notna()
    
    print(f"  Among enrolled students:")
    print(f"    Ethnicity missing: {df_clean.loc[enrolled_mask, f'Eth_{year}'].isna().sum()}")
    print(f"    Fluency missing: {df_clean.loc[enrolled_mask, f'Fluency_{year}'].isna().sum()}")
    print(f"    SpEd missing: {df_clean.loc[enrolled_mask, f'SpEd_{year}'].isna().sum()}")
    print(f"    SED missing: {df_clean.loc[enrolled_mask, f'SED_{year}'].isna().sum()}")
    print(f"    Grade missing: {df_clean.loc[enrolled_mask, f'Grade_{year}'].isna().sum()}")


1718: 39929 enrolled (50.3%)
  Among enrolled students:
    Ethnicity missing: 0
    Fluency missing: 0
    SpEd missing: 0
    SED missing: 0
    Grade missing: 0

1819: 39579 enrolled (49.8%)
  Among enrolled students:
    Ethnicity missing: 0
    Fluency missing: 0
    SpEd missing: 0
    SED missing: 0
    Grade missing: 0

1920: 38839 enrolled (48.9%)
  Among enrolled students:
    Ethnicity missing: 0
    Fluency missing: 0
    SpEd missing: 0
    SED missing: 0
    Grade missing: 0

2021: 37558 enrolled (47.3%)
  Among enrolled students:
    Ethnicity missing: 0
    Fluency missing: 0
    SpEd missing: 0
    SED missing: 0
    Grade missing: 0

2122: 36153 enrolled (45.5%)
  Among enrolled students:
    Ethnicity missing: 0
    Fluency missing: 0
    SpEd missing: 0
    SED missing: 0
    Grade missing: 0

2223: 36552 enrolled (46.0%)
  Among enrolled students:
    Ethnicity missing: 0
    Fluency missing: 0
    SpEd missing: 0
    SED missing: 0
    Grade missing: 0

2324: 366

Checking if data is missing because students weren't enrolled or other?  Keeping Nan because student enrolled has complete data, and nan is a place holder for not enrolled. 

In [10]:
# Check suspension data for enrolled students

print("Suspension data check:\n")

for year in years:
    enrolled_mask = df_clean[f'AttRate_{year}'].notna()
    enrolled_count = enrolled_mask.sum()
    
    susp_missing = df_clean.loc[enrolled_mask, f'Susp_{year}'].isna().sum()
    susp_recorded = enrolled_count - susp_missing
    
    if susp_recorded > 0:
        susp_rate = (df_clean.loc[enrolled_mask, f'Susp_{year}'] > 0).sum() / enrolled_count * 100
        print(f"{year}: {susp_missing}/{enrolled_count} missing ({susp_missing/enrolled_count*100:.1f}%) | Suspension rate: {susp_rate:.1f}%")

Suspension data check:

1718: 38377/39929 missing (96.1%) | Suspension rate: 3.9%
1819: 38255/39579 missing (96.7%) | Suspension rate: 3.3%
1920: 37720/38839 missing (97.1%) | Suspension rate: 2.9%
2021: 37555/37558 missing (100.0%) | Suspension rate: 0.0%
2122: 34931/36153 missing (96.6%) | Suspension rate: 3.4%
2223: 35257/36552 missing (96.5%) | Suspension rate: 3.5%
2324: 35293/36695 missing (96.2%) | Suspension rate: 3.8%


In [11]:
# Fill suspension NaN with 0 for enrolled students only
# Keep NaN for non-enrolled students

for year in years:
    enrolled_mask = df_clean[f'AttRate_{year}'].notna()
    
    # Fill NaN with 0 (not suspended) only for enrolled students
    df_clean.loc[enrolled_mask, f'Susp_{year}'] = df_clean.loc[enrolled_mask, f'Susp_{year}'].fillna(0)

print("Suspension data after filling:\n")

for year in years:
    enrolled_mask = df_clean[f'AttRate_{year}'].notna()
    total_enrolled = enrolled_mask.sum()
    suspended = (df_clean.loc[enrolled_mask, f'Susp_{year}'] > 0).sum()
    
    print(f"{year}: {suspended}/{total_enrolled} suspended ({suspended/total_enrolled*100:.1f}%)")

Suspension data after filling:

1718: 1552/39929 suspended (3.9%)
1819: 1324/39579 suspended (3.3%)
1920: 1119/38839 suspended (2.9%)
2021: 3/37558 suspended (0.0%)
2122: 1222/36153 suspended (3.4%)
2223: 1295/36552 suspended (3.5%)
2324: 1402/36695 suspended (3.8%)


Confirmed amount of suspension data, calculated rates where data exsited. Filled nan with 0 as with pervious dataset when children were enrolled, NAN for children not enrolled. 

In [12]:
# Check SED categories for enrolled students

print("SED status distribution:\n")

for year in years:
    enrolled_mask = df_clean[f'AttRate_{year}'].notna()
    print(f"\n{year}:")
    print(df_clean.loc[enrolled_mask, f'SED_{year}'].value_counts(dropna=False))

SED status distribution:


1718:
SED_1718
Unknown    39929
Name: count, dtype: int64

1819:
SED_1819
Unknown    39579
Name: count, dtype: int64

1920:
SED_1920
SED        27269
Not SED    11570
Name: count, dtype: int64

2021:
SED_2021
SED        27632
Not SED     9926
Name: count, dtype: int64

2122:
SED_2122
SED        27360
Not SED     8793
Name: count, dtype: int64

2223:
SED_2223
SED        26996
Not SED     7249
Unknown     2307
Name: count, dtype: int64

2324:
SED_2324
SED        30015
Not SED     6680
Name: count, dtype: int64


Converting NAN for unknown socioeco. Creating binary flags SED 1 Not SED 0 Unknown nan. Better for training models 

In [13]:
# Create binary SED flag and data availability flag

for year in years:
    enrolled_mask = df_clean[f'AttRate_{year}'].notna()
    
    # Create binary flag: 1 = SED, 0 = Not SED, NaN = Unknown or not enrolled
    df_clean[f'SED_Binary_{year}'] = df_clean[f'SED_{year}'].map({
        'SED': 1,
        'Not SED': 0,
        'Unknown': np.nan
    })

print("SED Binary flags created. Summary:\n")

for year in years:
    enrolled_mask = df_clean[f'AttRate_{year}'].notna()
    sed_count = (df_clean.loc[enrolled_mask, f'SED_Binary_{year}'] == 1).sum()
    not_sed_count = (df_clean.loc[enrolled_mask, f'SED_Binary_{year}'] == 0).sum()
    unknown_count = df_clean.loc[enrolled_mask, f'SED_Binary_{year}'].isna().sum()
    total = enrolled_mask.sum()
    
    print(f"{year}: SED={sed_count} ({sed_count/total*100:.1f}%), Not SED={not_sed_count} ({not_sed_count/total*100:.1f}%), Unknown={unknown_count} ({unknown_count/total*100:.1f}%)")

print(f"\nNew dataset shape: {df_clean.shape}")

SED Binary flags created. Summary:

1718: SED=0 (0.0%), Not SED=0 (0.0%), Unknown=39929 (100.0%)
1819: SED=0 (0.0%), Not SED=0 (0.0%), Unknown=39579 (100.0%)
1920: SED=27269 (70.2%), Not SED=11570 (29.8%), Unknown=0 (0.0%)
2021: SED=27632 (73.6%), Not SED=9926 (26.4%), Unknown=0 (0.0%)
2122: SED=27360 (75.7%), Not SED=8793 (24.3%), Unknown=0 (0.0%)
2223: SED=26996 (73.9%), Not SED=7249 (19.8%), Unknown=2307 (6.3%)
2324: SED=30015 (81.8%), Not SED=6680 (18.2%), Unknown=0 (0.0%)

New dataset shape: (79460, 143)


In [14]:
# Summary of all cleaning steps completed

print("CLEANING SUMMARY")
print("="*60)
print(f"Original dataset: {df.shape}")
print(f"Cleaned dataset: {df_clean.shape}")
print(f"Columns added: {df_clean.shape[1] - df.shape[1]}")

print("\n" + "="*60)
print("CHANGES MADE:")
print("="*60)

print("\n1. Gender standardization:")
print(f"   - Changed lowercase 'm' to 'M': 2 records")

print("\n2. Chronic absenteeism flags (7 new columns):")
print("   - ChronicAbs_[year] = 1 if AttRate < 0.90, else 0")

print("\n3. Age calculations (7 new columns):")
print("   - Age_[year] = age on Sept 1 of each school year")

print("\n4. Suspension data:")
print("   - Filled NaN with 0 for enrolled students")
print("   - Kept NaN for non-enrolled students")

print("\n5. SED binary flags (7 new columns):")
print("   - SED_Binary_[year] = 1 (SED), 0 (Not SED), NaN (Unknown)")

print("\n" + "="*60)
print("DATA QUALITY CONFIRMED:")
print("="*60)
print("✓ No duplicate students")
print("✓ All birthdates valid")
print("✓ All attendance rates valid (0-1 range)")
print("✓ No cases of DaysAbs > DaysEnr")
print("✓ Missing data = students not enrolled (legitimate)")

print("\n" + "="*60)
print("READY FOR: EDA and modeling")
print("="*60)

CLEANING SUMMARY
Original dataset: (79460, 122)
Cleaned dataset: (79460, 143)
Columns added: 21

CHANGES MADE:

1. Gender standardization:
   - Changed lowercase 'm' to 'M': 2 records

2. Chronic absenteeism flags (7 new columns):
   - ChronicAbs_[year] = 1 if AttRate < 0.90, else 0

3. Age calculations (7 new columns):
   - Age_[year] = age on Sept 1 of each school year

4. Suspension data:
   - Filled NaN with 0 for enrolled students
   - Kept NaN for non-enrolled students

5. SED binary flags (7 new columns):
   - SED_Binary_[year] = 1 (SED), 0 (Not SED), NaN (Unknown)

DATA QUALITY CONFIRMED:
✓ No duplicate students
✓ All birthdates valid
✓ All attendance rates valid (0-1 range)
✓ No cases of DaysAbs > DaysEnr
✓ Missing data = students not enrolled (legitimate)

READY FOR: EDA and modeling


In [15]:
# Save cleaned dataset
df_clean.to_csv('ONGB_EvalData_CLEANED.csv', index=False)

print("✓ Cleaned dataset saved as: ONGB_EvalData_CLEANED.csv")
print(f"  Shape: {df_clean.shape}")
print(f"  File size: ~{df_clean.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

✓ Cleaned dataset saved as: ONGB_EvalData_CLEANED.csv
  Shape: (79460, 143)
  File size: ~271.3 MB


In [16]:
# Create .gitignore file to exclude data files

gitignore_content = """
# Data files
*.csv
*.xlsx
*.xls

# Jupyter Notebook checkpoints
.ipynb_checkpoints/

# Python cache
__pycache__/
*.pyc

# Environment
.env
venv/
env/

# OS files
.DS_Store
"""

with open('.gitignore', 'w') as f:
    f.write(gitignore_content)

print("✓ .gitignore file created")
print("\nThis will prevent git from tracking:")
print("  - All CSV and Excel files")
print("  - Jupyter checkpoints")
print("  - Python cache files")

✓ .gitignore file created

This will prevent git from tracking:
  - All CSV and Excel files
  - Jupyter checkpoints
  - Python cache files


In [17]:
# Clone the repository
!git clone https://github.com/kaiagaoo/ONGB-chronic-absenteeism.git

print("✓ Repository cloned")

Cloning into 'ONGB-chronic-absenteeism'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 6 (delta 0), reused 3 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (6/6), done.
✓ Repository cloned


In [1]:
# Create comprehensive data dictionary

print("ONGB CHRONIC ABSENTEEISM DATA DICTIONARY")
print("="*80)

# Core identifiers
print("\nCORE IDENTIFIERS:")
print("-" * 80)
print("ANON_ID          : Anonymous student ID (unique identifier)")
print("Birthdate        : Student date of birth")
print("Gen              : Gender (M=Male, F=Female, N=Non-binary)")

# Demographic variables (repeated for each year: _1718, _1819, _1920, _2021, _2122, _2223, _2324)
print("\nDEMOGRAPHIC VARIABLES (by year):")
print("-" * 80)
print("Eth_[year]       : Ethnicity categories:")
print("                   - Latino/Hispanic")
print("                   - African American")
print("                   - Asian")
print("                   - White")
print("                   - Two or More Races")
print("                   - Pacific Islander")
print("                   - Filipino")
print("                   - Native American")

print("\nFluency_[year]   : English language fluency status:")
print("                   - EO (English Only)")
print("                   - EL (English Learner)")
print("                   - RFEP (Reclassified Fluent English Proficient)")
print("                   - IFEP (Initially Fluent English Proficient)")
print("                   - TBD (To Be Determined)")

print("\nSpEd_[year]      : Special Education status:")
print("                   - Special Ed (receives special education services)")
print("                   - Not Special Ed")

print("\nSED_[year]       : Socioeconomically Disadvantaged status:")
print("                   - SED (socioeconomically disadvantaged)")
print("                   - Not SED")
print("                   - Unknown (data not collected in 2017-18, 2018-19)")

# School variables
print("\nSCHOOL VARIABLES (by year):")
print("-" * 80)
print("SiteName_[year]  : School name")
print("School Address_[year] : School street address")
print("City_[year]      : School city")
print("Zip_[year]       : School zip code")
print("Address_[year]   : Student home address")
print("City_[year].1    : Student home city")
print("Zip_[year].1     : Student home zip code")

# Academic variables
print("\nACADEMIC VARIABLES (by year):")
print("-" * 80)
print("Grade_[year]     : Grade level (-1=Pre-K, 0=Kindergarten, 1-12=grades)")
print("CurrWeightedTotGPA_[year] : Current weighted total GPA")

# Attendance variables
print("\nATTENDANCE VARIABLES (by year):")
print("-" * 80)
print("AttRate_[year]   : Attendance rate (0.0-1.0, where 1.0=100% attendance)")
print("DaysEnr_[year]   : Days enrolled")
print("DaysAbs_[year]   : Days absent")

# Discipline variables
print("\nDISCIPLINE VARIABLES (by year):")
print("-" * 80)
print("Susp_[year]      : Suspension count (0=not suspended, >0=number of suspensions)")
print("                   Note: NaN for non-enrolled students")
print("                   Note: 2020-21 has almost no suspensions (COVID remote learning)")

# Created/cleaned variables
print("\nCREATED VARIABLES (from cleaning process):")
print("-" * 80)
print("ChronicAbs_[year] : Chronic absenteeism flag:")
print("                    - 1 = chronically absent (attendance < 90%)")
print("                    - 0 = not chronically absent (attendance >= 90%)")
print("                    - NaN = not enrolled that year")

print("\nAge_[year]       : Student age as of September 1 of school year")
print("                   (calculated from birthdate)")

print("\nSED_Binary_[year] : Binary socioeconomic disadvantage flag:")
print("                    - 1 = SED")
print("                    - 0 = Not SED")
print("                    - NaN = Unknown (2017-18, 2018-19) or not enrolled")

# Year suffixes
print("\nYEAR SUFFIXES:")
print("-" * 80)
print("_1718            : School year 2017-2018")
print("_1819            : School year 2018-2019")
print("_1920            : School year 2019-2020 (COVID started March 2020)")
print("_2021            : School year 2020-2021 (full remote learning)")
print("_2122            : School year 2021-2022 (return to in-person)")
print("_2223            : School year 2022-2023")
print("_2324            : School year 2023-2024")

print("\n" + "="*80)
print("TOTAL COLUMNS: 143 (122 original + 21 created)")
print("TOTAL STUDENTS: 79,460")
print("="*80)

ONGB CHRONIC ABSENTEEISM DATA DICTIONARY

CORE IDENTIFIERS:
--------------------------------------------------------------------------------
ANON_ID          : Anonymous student ID (unique identifier)
Birthdate        : Student date of birth
Gen              : Gender (M=Male, F=Female, N=Non-binary)

DEMOGRAPHIC VARIABLES (by year):
--------------------------------------------------------------------------------
Eth_[year]       : Ethnicity categories:
                   - Latino/Hispanic
                   - African American
                   - Asian
                   - White
                   - Two or More Races
                   - Pacific Islander
                   - Filipino
                   - Native American

Fluency_[year]   : English language fluency status:
                   - EO (English Only)
                   - EL (English Learner)
                   - RFEP (Reclassified Fluent English Proficient)
                   - IFEP (Initially Fluent English Proficient)
     