In [2]:
import pandas as pd

# Load the chronic absenteeism file
df_chronic = pd.read_csv('RAWChronic Absent Students with DOB_2025-05-29.csv')

print("Shape:", df_chronic.shape)
print("\nColumns:", df_chronic.columns.tolist())
print("\nFirst 10 rows:")
print(df_chronic.head(10))
print("\nData types:")
print(df_chronic.dtypes)

Shape: (18638, 28)

Columns: ['ID', 'LastName', 'FirstName', 'Birthdate', 'DT', 'DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate', 'AttGrp', 'SiteName', 'Gr', 'Gen', 'Eth', 'Fluency', 'Home Language', 'Special Ed Status', 'Cumulative Weighted Total GPA (TP)', 'Current Weighted Total GPA (GT)', 'AddressResidence', 'CityResidence', 'ZipResidence', 'ParentName', 'Telephone', 'PG_Email_1', 'SED Status', 'NumSusp', 'NumDaysSusp']

First 10 rows:
       ID         LastName FirstName Birthdate       DT  DaysEnr  DaysAbs  \
0  443282            Aarif     Aslam    7/2/10  5/29/25      180       87   
1  436859           Abarca    Josiah   5/31/18  5/29/25      180       31   
2  435234  Abarca Carranza     Maura   8/26/04  5/29/25      180      121   
3  408468   Abarca Climaco   Valeria    3/9/16  5/29/25      167       17   
4  440496   Abarca Escobar   Genesis  11/24/15  5/29/25      180       20   
5  359660   Abarca-Climaco     Zully   5/31/08  5/29/25      180       41   
6  379380     Abbat

In [3]:
import numpy as np

# Load the chronic absenteeism file
df_chronic = pd.read_csv('RAWChronic Absent Students with DOB_2025-05-29.csv')

print("=" * 80)
print("STEP 1: UNDERSTANDING YOUR RAW CHRONIC ABSENTEEISM DATA")
print("=" * 80)

print(f"\nShape: {df_chronic.shape}")
print(f"  Rows: {df_chronic.shape[0]:,}")
print(f"  Columns: {df_chronic.shape[1]}")

print(f"\n\nData types:")
print(df_chronic.dtypes)

print(f"\n\nMissing values:")
print(df_chronic.isnull().sum())

print(f"\n\nSample data:")
print(df_chronic.head(5))

STEP 1: UNDERSTANDING YOUR RAW CHRONIC ABSENTEEISM DATA

Shape: (18638, 28)
  Rows: 18,638
  Columns: 28


Data types:
ID                                      int64
LastName                               object
FirstName                              object
Birthdate                              object
DT                                     object
DaysEnr                                 int64
DaysAbs                                 int64
DaysPresent                             int64
AttRate                                object
AttGrp                                 object
SiteName                               object
Gr                                      int64
Gen                                    object
Eth                                    object
Fluency                                object
Home Language                          object
Special Ed Status                      object
Cumulative Weighted Total GPA (TP)    float64
Current Weighted Total GPA (GT)       float64
Address

In [4]:
print("=" * 80)
print("PIVOTING CHRONIC ABSENTEEISM DATA FOR MODEL TESTING")
print("=" * 80)

print(f"\nOriginal shape: {df_chronic.shape}")
print(f"Original columns: {df_chronic.columns.tolist()}")

# Keep only the essential columns - drop PII (names, phone, email, addresses)
# Keep: ID, demographics, attendance metrics, academic info, suspension info
columns_to_keep = [
    'ID', 'Birthdate', 'DT', 'DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate', 
    'AttGrp', 'SiteName', 'Gr', 'Gen', 'Eth', 'Fluency', 'Home Language',
    'Special Ed Status', 'Cumulative Weighted Total GPA (TP)', 
    'Current Weighted Total GPA (GT)', 'SED Status', 'NumSusp', 'NumDaysSusp'
]

df_chronic_clean = df_chronic[columns_to_keep].copy()

print(f"\nAfter removing PII: {df_chronic_clean.shape}")

# Now pivot: each student gets multiple rows (one per metric)
df_chronic_pivoted = df_chronic_clean.melt(
    id_vars=['ID', 'Birthdate', 'DT', 'Gen', 'Eth', 'Fluency', 'Home Language', 
             'Special Ed Status', 'SED Status', 'SiteName', 'Gr'],
    var_name='Metric',
    value_name='Value'
)

print(f"After pivot: {df_chronic_pivoted.shape}")
print(f"Pivoted columns: {df_chronic_pivoted.columns.tolist()}")

print(f"\n\nFirst 15 rows of pivoted data:")
print(df_chronic_pivoted.head(15))

print(f"\n\nMetrics in pivoted data:")
print(df_chronic_pivoted['Metric'].unique())

# Save both versions
df_chronic.to_csv('RAWChronic_Cleaned.csv', index=False)
df_chronic_pivoted.to_csv('RAWChronic_Pivoted.csv', index=False)

print(f"\n\n✓ Saved: RAWChronic_Cleaned.csv (18,638 rows × {df_chronic.shape[1]} columns)")
print(f"✓ Saved: RAWChronic_Pivoted.csv ({df_chronic_pivoted.shape[0]:,} rows × {df_chronic_pivoted.shape[1]} columns)")

PIVOTING CHRONIC ABSENTEEISM DATA FOR MODEL TESTING

Original shape: (18638, 28)
Original columns: ['ID', 'LastName', 'FirstName', 'Birthdate', 'DT', 'DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate', 'AttGrp', 'SiteName', 'Gr', 'Gen', 'Eth', 'Fluency', 'Home Language', 'Special Ed Status', 'Cumulative Weighted Total GPA (TP)', 'Current Weighted Total GPA (GT)', 'AddressResidence', 'CityResidence', 'ZipResidence', 'ParentName', 'Telephone', 'PG_Email_1', 'SED Status', 'NumSusp', 'NumDaysSusp']

After removing PII: (18638, 20)
After pivot: (167742, 13)
Pivoted columns: ['ID', 'Birthdate', 'DT', 'Gen', 'Eth', 'Fluency', 'Home Language', 'Special Ed Status', 'SED Status', 'SiteName', 'Gr', 'Metric', 'Value']


First 15 rows of pivoted data:
        ID Birthdate       DT Gen                 Eth              Fluency  \
0   443282    7/2/10  5/29/25   M    African American         English Only   
1   436859   5/31/18  5/29/25   M              Latino         English Only   
2   435234   8/26/04 

In [5]:
print("=" * 80)
print("STEP 1: IDENTIFYING WHAT NEEDS CLEANING")
print("=" * 80)

print(f"\nShape: {df_chronic.shape}")

print(f"\n\nDATA TYPE ISSUES:")
print(f"\n1. AttRate column:")
print(f"   Type: {df_chronic['AttRate'].dtype}")
print(f"   Sample values: {df_chronic['AttRate'].head(5).tolist()}")
print(f"   Problem: Has '%' symbol, should be decimal (0.517 not 51.7%)")

print(f"\n2. Birthdate column:")
print(f"   Type: {df_chronic['Birthdate'].dtype}")
print(f"   Sample values: {df_chronic['Birthdate'].head(5).tolist()}")
print(f"   Problem: Text format, should be datetime")

print(f"\n3. GPA columns:")
print(f"   Cumulative GPA type: {df_chronic['Cumulative Weighted Total GPA (TP)'].dtype}")
print(f"   Current GPA type: {df_chronic['Current Weighted Total GPA (GT)'].dtype}")
print(f"   Sample cumulative: {df_chronic['Cumulative Weighted Total GPA (TP)'].head(5).tolist()}")

print(f"\n\nMISSING VALUES:")
missing = df_chronic.isnull().sum()
has_missing = missing[missing > 0]
for col in has_missing.index:
    pct = missing[col] / len(df_chronic) * 100
    print(f"  {col}: {missing[col]:,} ({pct:.1f}%)")

print(f"\n\nREDUNDANT COLUMNS (PII to remove):")
pii_cols = ['LastName', 'FirstName', 'ParentName', 'Telephone', 'PG_Email_1', 
            'AddressResidence', 'CityResidence', 'ZipResidence']
print(f"  These columns contain personal info we can drop:")
for col in pii_cols:
    print(f"    - {col}")

STEP 1: IDENTIFYING WHAT NEEDS CLEANING

Shape: (18638, 28)


DATA TYPE ISSUES:

1. AttRate column:
   Type: object
   Sample values: ['51.7%', '82.8%', '32.8%', '89.8%', '88.9%']
   Problem: Has '%' symbol, should be decimal (0.517 not 51.7%)

2. Birthdate column:
   Type: object
   Sample values: ['7/2/10', '5/31/18', '8/26/04', '3/9/16', '11/24/15']
   Problem: Text format, should be datetime

3. GPA columns:
   Cumulative GPA type: float64
   Current GPA type: float64
   Sample cumulative: [nan, nan, 0.3, nan, nan]


MISSING VALUES:
  Cumulative Weighted Total GPA (TP): 11,852 (63.6%)
  Current Weighted Total GPA (GT): 11,852 (63.6%)
  AddressResidence: 1 (0.0%)
  ParentName: 9 (0.0%)
  Telephone: 43 (0.2%)
  PG_Email_1: 123 (0.7%)
  NumSusp: 17,683 (94.9%)
  NumDaysSusp: 17,683 (94.9%)


REDUNDANT COLUMNS (PII to remove):
  These columns contain personal info we can drop:
    - LastName
    - FirstName
    - ParentName
    - Telephone
    - PG_Email_1
    - AddressResidence
    - 

In [6]:
print("=" * 80)
print("STEP 2: CONVERT ATTRATE FROM % TO DECIMAL")
print("=" * 80)

print(f"\nBefore:")
print(f"  Type: {df_chronic['AttRate'].dtype}")
print(f"  Sample values: {df_chronic['AttRate'].head(10).tolist()}")

# Remove '%' and convert to decimal
df_chronic['AttRate'] = df_chronic['AttRate'].str.replace('%', '').astype(float) / 100

print(f"\nAfter:")
print(f"  Type: {df_chronic['AttRate'].dtype}")
print(f"  Sample values: {df_chronic['AttRate'].head(10).tolist()}")
print(f"\n  ✓ AttRate converted: 51.7% → 0.517")

STEP 2: CONVERT ATTRATE FROM % TO DECIMAL

Before:
  Type: object
  Sample values: ['51.7%', '82.8%', '32.8%', '89.8%', '88.9%', '77.2%', '93.3%', '11.1%', '85.6%', '86.9%']

After:
  Type: float64
  Sample values: [0.517, 0.828, 0.32799999999999996, 0.898, 0.889, 0.772, 0.9329999999999999, 0.111, 0.856, 0.8690000000000001]

  ✓ AttRate converted: 51.7% → 0.517


In [7]:
print("=" * 80)
print("STEP 3: CONVERT BIRTHDATE TO DATETIME")
print("=" * 80)

print(f"\nBefore:")
print(f"  Type: {df_chronic['Birthdate'].dtype}")
print(f"  Sample values: {df_chronic['Birthdate'].head(10).tolist()}")

# Convert to datetime format
df_chronic['Birthdate'] = pd.to_datetime(df_chronic['Birthdate'], format='%m/%d/%y')

print(f"\nAfter:")
print(f"  Type: {df_chronic['Birthdate'].dtype}")
print(f"  Sample values: {df_chronic['Birthdate'].head(10).tolist()}")
print(f"\n  ✓ Birthdate converted: '7/2/10' → 2010-07-02")

STEP 3: CONVERT BIRTHDATE TO DATETIME

Before:
  Type: object
  Sample values: ['7/2/10', '5/31/18', '8/26/04', '3/9/16', '11/24/15', '5/31/08', '10/13/11', '1/22/10', '4/4/17', '7/28/11']

After:
  Type: datetime64[ns]
  Sample values: [Timestamp('2010-07-02 00:00:00'), Timestamp('2018-05-31 00:00:00'), Timestamp('2004-08-26 00:00:00'), Timestamp('2016-03-09 00:00:00'), Timestamp('2015-11-24 00:00:00'), Timestamp('2008-05-31 00:00:00'), Timestamp('2011-10-13 00:00:00'), Timestamp('2010-01-22 00:00:00'), Timestamp('2017-04-04 00:00:00'), Timestamp('2011-07-28 00:00:00')]

  ✓ Birthdate converted: '7/2/10' → 2010-07-02


In [8]:
print("=" * 80)
print("STEP 4: DROP PERSONAL IDENTIFYING INFORMATION (PII)")
print("=" * 80)

print(f"\nBefore: {df_chronic.shape}")
print(f"Columns: {df_chronic.columns.tolist()}")

# Drop PII columns (names, contact info, addresses)
pii_columns = ['LastName', 'FirstName', 'ParentName', 'Telephone', 'PG_Email_1',
               'AddressResidence', 'CityResidence', 'ZipResidence']

df_chronic = df_chronic.drop(columns=pii_columns)

print(f"\nAfter dropping PII: {df_chronic.shape}")
print(f"Columns kept: {df_chronic.columns.tolist()}")

print(f"\nDropped columns:")
for col in pii_columns:
    print(f"  - {col}")

print(f"\n✓ Removed 8 PII columns")

STEP 4: DROP PERSONAL IDENTIFYING INFORMATION (PII)

Before: (18638, 28)
Columns: ['ID', 'LastName', 'FirstName', 'Birthdate', 'DT', 'DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate', 'AttGrp', 'SiteName', 'Gr', 'Gen', 'Eth', 'Fluency', 'Home Language', 'Special Ed Status', 'Cumulative Weighted Total GPA (TP)', 'Current Weighted Total GPA (GT)', 'AddressResidence', 'CityResidence', 'ZipResidence', 'ParentName', 'Telephone', 'PG_Email_1', 'SED Status', 'NumSusp', 'NumDaysSusp']

After dropping PII: (18638, 20)
Columns kept: ['ID', 'Birthdate', 'DT', 'DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate', 'AttGrp', 'SiteName', 'Gr', 'Gen', 'Eth', 'Fluency', 'Home Language', 'Special Ed Status', 'Cumulative Weighted Total GPA (TP)', 'Current Weighted Total GPA (GT)', 'SED Status', 'NumSusp', 'NumDaysSusp']

Dropped columns:
  - LastName
  - FirstName
  - ParentName
  - Telephone
  - PG_Email_1
  - AddressResidence
  - CityResidence
  - ZipResidence

✓ Removed 8 PII columns


In [9]:
print("=" * 80)
print("CURRENT STATE OF df_chronic")
print("=" * 80)

print(f"\nCurrent shape: {df_chronic.shape}")
print(f"Current columns: {df_chronic.columns.tolist()}")

print(f"\nDo we have address columns?")
print(f"  'AddressResidence' in columns: {'AddressResidence' in df_chronic.columns}")
print(f"  'CityResidence' in columns: {'CityResidence' in df_chronic.columns}")
print(f"  'ZipResidence' in columns: {'ZipResidence' in df_chronic.columns}")

CURRENT STATE OF df_chronic

Current shape: (18638, 20)
Current columns: ['ID', 'Birthdate', 'DT', 'DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate', 'AttGrp', 'SiteName', 'Gr', 'Gen', 'Eth', 'Fluency', 'Home Language', 'Special Ed Status', 'Cumulative Weighted Total GPA (TP)', 'Current Weighted Total GPA (GT)', 'SED Status', 'NumSusp', 'NumDaysSusp']

Do we have address columns?
  'AddressResidence' in columns: False
  'CityResidence' in columns: False
  'ZipResidence' in columns: False


In [10]:
print("=" * 80)
print("RELOADING ORIGINAL FILE TO KEEP ADDRESS COLUMNS")
print("=" * 80)

# Reload the original file
df_chronic = pd.read_csv('RAWChronic Absent Students with DOB_2025-05-29.csv')

print(f"\nReloaded: {df_chronic.shape}")

# Convert AttRate (% to decimal)
df_chronic['AttRate'] = df_chronic['AttRate'].str.replace('%', '').astype(float) / 100
print(f"✓ Converted AttRate to decimal")

# Convert Birthdate to datetime
df_chronic['Birthdate'] = pd.to_datetime(df_chronic['Birthdate'], format='%m/%d/%y')
print(f"✓ Converted Birthdate to datetime")

# Drop ONLY sensitive PII (names, phone, email)
# KEEP address columns for distance calculations
pii_columns = ['LastName', 'FirstName', 'ParentName', 'Telephone', 'PG_Email_1']
df_chronic = df_chronic.drop(columns=pii_columns)

print(f"\nAfter cleaning: {df_chronic.shape}")
print(f"Columns: {df_chronic.columns.tolist()}")

print(f"\n✓ Address columns kept:")
print(f"  - AddressResidence")
print(f"  - CityResidence")
print(f"  - ZipResidence")

RELOADING ORIGINAL FILE TO KEEP ADDRESS COLUMNS

Reloaded: (18638, 28)
✓ Converted AttRate to decimal
✓ Converted Birthdate to datetime

After cleaning: (18638, 23)
Columns: ['ID', 'Birthdate', 'DT', 'DaysEnr', 'DaysAbs', 'DaysPresent', 'AttRate', 'AttGrp', 'SiteName', 'Gr', 'Gen', 'Eth', 'Fluency', 'Home Language', 'Special Ed Status', 'Cumulative Weighted Total GPA (TP)', 'Current Weighted Total GPA (GT)', 'AddressResidence', 'CityResidence', 'ZipResidence', 'SED Status', 'NumSusp', 'NumDaysSusp']

✓ Address columns kept:
  - AddressResidence
  - CityResidence
  - ZipResidence


In [11]:
print("=" * 80)
print("STEP 5: HANDLE MISSING VALUES")
print("=" * 80)

print(f"\nMissing values by column:")
missing = df_chronic.isnull().sum()
has_missing = missing[missing > 0].sort_values(ascending=False)

for col in has_missing.index:
    pct = missing[col] / len(df_chronic) * 100
    print(f"  {col}: {missing[col]:,} ({pct:.1f}%)")

print(f"\n\nDecisions:")
print(f"  GPA columns: Keep NaN (students without grades)")
print(f"  NumSusp: Keep NaN (students with 0 suspensions)")
print(f"  NumDaysSusp: Keep NaN (students with 0 suspension days)")
print(f"  Address columns: Keep NaN (some students may not have residential address on file)")

print(f"\nNo action needed - NaN values are meaningful here")
print(f"(They represent missing data, not data entry errors)")

print(f"\n✓ Step 5 complete - NaN values preserved")

STEP 5: HANDLE MISSING VALUES

Missing values by column:
  NumSusp: 17,683 (94.9%)
  NumDaysSusp: 17,683 (94.9%)
  Cumulative Weighted Total GPA (TP): 11,852 (63.6%)
  Current Weighted Total GPA (GT): 11,852 (63.6%)
  AddressResidence: 1 (0.0%)


Decisions:
  GPA columns: Keep NaN (students without grades)
  NumSusp: Keep NaN (students with 0 suspensions)
  NumDaysSusp: Keep NaN (students with 0 suspension days)
  Address columns: Keep NaN (some students may not have residential address on file)

No action needed - NaN values are meaningful here
(They represent missing data, not data entry errors)

✓ Step 5 complete - NaN values preserved


In [12]:
print("=" * 80)
print("STEP 5: HANDLE MISSING VALUES")
print("=" * 80)

print(f"\nBefore filling:")
print(f"  NumSusp NaN count: {df_chronic['NumSusp'].isna().sum():,}")
print(f"  NumDaysSusp NaN count: {df_chronic['NumDaysSusp'].isna().sum():,}")

# Fill suspension NaN values with 0 (no suspensions = 0)
df_chronic['NumSusp'] = df_chronic['NumSusp'].fillna(0)
df_chronic['NumDaysSusp'] = df_chronic['NumDaysSusp'].fillna(0)

print(f"\nAfter filling with 0:")
print(f"  NumSusp NaN count: {df_chronic['NumSusp'].isna().sum():,}")
print(f"  NumDaysSusp NaN count: {df_chronic['NumDaysSusp'].isna().sum():,}")

print(f"\nSample suspension values:")
print(df_chronic[['ID', 'NumSusp', 'NumDaysSusp']].head(15))

print(f"\n✓ Filled suspension NaN with 0 (no suspensions)")

# Show remaining NaN values
print(f"\n\nRemaining NaN values (expected):")
remaining_nan = df_chronic.isnull().sum()
has_nan = remaining_nan[remaining_nan > 0].sort_values(ascending=False)
for col in has_nan.index:
    pct = remaining_nan[col] / len(df_chronic) * 100
    print(f"  {col}: {remaining_nan[col]:,} ({pct:.1f}%)")

STEP 5: HANDLE MISSING VALUES

Before filling:
  NumSusp NaN count: 17,683
  NumDaysSusp NaN count: 17,683

After filling with 0:
  NumSusp NaN count: 0
  NumDaysSusp NaN count: 0

Sample suspension values:
        ID  NumSusp  NumDaysSusp
0   443282      0.0          0.0
1   436859      0.0          0.0
2   435234      0.0          0.0
3   408468      0.0          0.0
4   440496      0.0          0.0
5   359660      0.0          0.0
6   379380      0.0          0.0
7   360578      0.0          0.0
8   413269      0.0          0.0
9   447013      0.0          0.0
10  401312      0.0          0.0
11  375373      0.0          0.0
12  446777      0.0          0.0
13  434736      0.0          0.0
14  434951      0.0          0.0

✓ Filled suspension NaN with 0 (no suspensions)


Remaining NaN values (expected):
  Cumulative Weighted Total GPA (TP): 11,852 (63.6%)
  Current Weighted Total GPA (GT): 11,852 (63.6%)
  AddressResidence: 1 (0.0%)


In [14]:
print("=" * 80)
print("CHECKING GPA COLUMNS")
print("=" * 80)

print(f"\nGPA column names in dataset:")
for col in df_chronic.columns:
    if 'GPA' in col.upper():
        print(f"  {col}")

print(f"\nCumulative GPA:")
print(f"  Type: {df_chronic['Cumulative Weighted Total GPA (TP)'].dtype}")
print(f"  Sample values: {df_chronic['Cumulative Weighted Total GPA (TP)'].head(10).tolist()}")
print(f"  NaN count: {df_chronic['Cumulative Weighted Total GPA (TP)'].isna().sum():,}")

print(f"\nCurrent GPA:")
print(f"  Type: {df_chronic['Current Weighted Total GPA (GT)'].dtype}")
print(f"  Sample values: {df_chronic['Current Weighted Total GPA (GT)'].head(10).tolist()}")
print(f"  NaN count: {df_chronic['Current Weighted Total GPA (GT)'].isna().sum():,}")

print(f"\nNote: GPA columns have lots of NaN (63.6%)")
print(f"This is normal - not all students have GPA data on file")
print(f"(e.g., elementary students, students without grades yet)")

CHECKING GPA COLUMNS

GPA column names in dataset:
  Cumulative Weighted Total GPA (TP)
  Current Weighted Total GPA (GT)

Cumulative GPA:
  Type: float64
  Sample values: [nan, nan, 0.3, nan, nan, 3.0, 3.73, 4.0, nan, nan]
  NaN count: 11,852

Current GPA:
  Type: float64
  Sample values: [nan, nan, 0.0, nan, nan, 3.57, 0.0, 4.0, nan, nan]
  NaN count: 11,852

Note: GPA columns have lots of NaN (63.6%)
This is normal - not all students have GPA data on file
(e.g., elementary students, students without grades yet)


In [16]:
print("=" * 80)
print("STEP 8: PIVOT THE CLEANED DATA")
print("=" * 80)

print(f"\nOriginal shape: {df_chronic.shape}")

# Pivot: each student gets multiple rows (one per metric)
df_chronic_pivoted = df_chronic.melt(
    id_vars=['ID', 'Birthdate', 'DT', 'Gen', 'Eth', 'Fluency', 'Home Language', 
             'Special Ed Status', 'SED Status', 'SiteName', 'Gr',
             'AddressResidence', 'CityResidence', 'ZipResidence'],
    var_name='Metric',
    value_name='Value'
)

print(f"Pivoted shape: {df_chronic_pivoted.shape}")

print(f"\n\nMetrics in pivoted data:")
print(df_chronic_pivoted['Metric'].unique())

print(f"\n\nFirst 10 rows of pivoted data:")
print(df_chronic_pivoted.head(10))

# Save both versions
df_chronic.to_csv('RAWChronic_Cleaned.csv', index=False)
df_chronic_pivoted.to_csv('RAWChronic_Cleaned_Pivoted.csv', index=False)

print(f"\n\n✓ SAVED: RAWChronic_Cleaned.csv ({df_chronic.shape[0]:,} rows × {df_chronic.shape[1]} columns)")
print(f"✓ SAVED: RAWChronic_Cleaned_Pivoted.csv ({df_chronic_pivoted.shape[0]:,} rows × {df_chronic_pivoted.shape[1]} columns)")

STEP 8: PIVOT THE CLEANED DATA

Original shape: (18638, 23)
Pivoted shape: (167742, 16)


Metrics in pivoted data:
['DaysEnr' 'DaysAbs' 'DaysPresent' 'AttRate' 'AttGrp'
 'Cumulative Weighted Total GPA (TP)' 'Current Weighted Total GPA (GT)'
 'NumSusp' 'NumDaysSusp']


First 10 rows of pivoted data:
       ID  Birthdate       DT Gen                 Eth              Fluency  \
0  443282 2010-07-02  5/29/25   M    African American         English Only   
1  436859 2018-05-31  5/29/25   M              Latino         English Only   
2  435234 2004-08-26  5/29/25   F              Latino      English Learner   
3  408468 2016-03-09  5/29/25   F              Latino      English Learner   
4  440496 2015-11-24  5/29/25   F              Latino      English Learner   
5  359660 2008-05-31  5/29/25   F              Latino  Recl English Fluent   
6  379380 2011-10-13  5/29/25   M            Filipino         English Only   
7  360578 2010-01-22  5/29/25   M    African American         English Only  

In [18]:
# STEP 1: Create .gitignore file
gitignore_content = """# Data files (do NOT commit)
*.csv
*.xlsx
*.xls
*.parquet
*.json

# Large data files
**/data/
**/datasets/
RAWChronic*
ONGB*

# Python cache
__pycache__/
*.py[cod]
*$py.class
*.so

# Jupyter
.ipynb_checkpoints/
*.ipynb_checkpoints

# Environment
venv/
env/
.env
"""

with open('.gitignore', 'w') as f:
    f.write(gitignore_content)

print("✓ Created .gitignore file")

# STEP 2: Create README.md file
readme_content = """# Data Cleaning Pipelines

## Notebooks
- `ONGB_Data_Cleaning.ipynb` - OUSD evaluation data pipeline
- `Chronic_Data_Cleaning.ipynb` - Chronic absenteeism cleaning and pivoting

## Data Files
Data files are NOT included in this repository (see `.gitignore`).

To run these notebooks, you need:
1. `RAWChronic Absent Students with DOB_2025-05-29.csv`
2. `ONGB_EvalData_Complete_Anonymized.xlsx`

Place these files in the project root directory before running notebooks.

## What Each Notebook Does

### ONGB_Data_Cleaning.ipynb
- Pivots wide format → long format
- Removes duplicate demographic columns
- Converts numeric variables to proper data types
- Output: `ONGB_EvalData_Cleaned_Pivoted.csv`

### Chronic_Data_Cleaning.ipynb
- Converts AttRate from % to decimal
- Converts Birthdate to datetime
- Removes PII (keeps address for distance calculations)
- Fills suspension NaN with 0
- Pivots to long format
- Outputs: `RAWChronic_Cleaned.csv` and `RAWChronic_Cleaned_Pivoted.csv`

## Ready for Modeling
Both datasets are pivoted and cleaned, ready to feed into ML models to test which format works best.
"""

with open('README.md', 'w') as f:
    f.write(readme_content)

print("✓ Created README.md")
print("\nNow in your terminal, run:")
print("  git add .")
print("  git commit -m 'Add data cleaning notebooks'")
print("  git push origin main")

✓ Created .gitignore file
✓ Created README.md

Now in your terminal, run:
  git add .
  git commit -m 'Add data cleaning notebooks'
  git push origin main
