In [None]:
#
# Data merger for absenteeism data
#

#
# Need to add absenteeism reason loader
#

In [15]:
import os
import pandas as pd
import re

In [53]:
rename_columns = {
    'Academic Year': 'academic_year',
    'AcademicYear':  'academic_year',
    'school_code':   'school_code',
    'School Name':   'school_name',
    'SchoolName':    'school_name',
    'Reporting Category': 'cohort',
    'ReportingCategory':  'cohort',
    'ChronicAbsenteeismEligibleCumula': 'cohort_enrollment_count',
    'ChronicAbsenteeismEligibleCumulativeEnrollment': 'cohort_enrollment_count',
    'ChronicAbsenteeismCount': 'cohort_absenteeism_count'
}

raw_data_directory = '../raw/'
despace = lambda regex, df: [ col for col in df if re.search( regex, col ) ][0]

all_data = pd.DataFrame()

for filename in os.listdir( raw_data_directory ):
    full_filename = f'{raw_data_directory}{filename}'

    df = pd.read_csv( full_filename, sep = '\t', encoding = 'latin1' )

    county_name_col = despace( r'County\s*Name', df )
    aggregation_col = despace( r'Aggregate\s*Level', df )
    school_code_col = despace( r'School\s*Code', df )

    # Note: asterisks are censored, not zero (fix below)
    df = df \
        [lambda _df: _df[county_name_col] == 'San Francisco'] \
        [lambda _df: _df[aggregation_col] == 'S'] \
        .assign(
            school_code = lambda _df: _df[school_code_col].astype( int )
        ) \
        [lambda _df: [col for col in rename_columns.keys() if col in _df]] \
        .rename( columns = rename_columns ) \
        .assign( academic_year = lambda _df: _df.academic_year.apply( lambda _row: re.sub( r'-(\d+)$', '-20\\1', _row ) ) ) \
        .reset_index( drop = True )
    
    if df.dtypes['cohort_enrollment_count'] != 'float':
        df = df.assign(
            cohort_enrollment_count = lambda _df: _df.cohort_enrollment_count.apply( lambda _row: re.sub( r'[^\d]+', '', _row ) + '0' ).astype( int ),
            cohort_absenteeism_count = lambda _df: _df.cohort_absenteeism_count.apply( lambda _row: re.sub( r'[^\d]+', '', _row ) + '0' ).astype( int )
        )

    else:
        df = df.assign(
            cohort_enrollment_count = df.cohort_enrollment_count.fillna( 0 ),
            cohort_absenteeism_count = df.cohort_absenteeism_count.fillna( 0 )
        )
    
    all_data = pd.concat( [ all_data, df ], axis = 0, ignore_index = True )

In [78]:
cohort_renames = {
    'RB': [ 'race', 'black' ],
    'RI': [ 'race', 'american_indian_alaska_native' ],
    'RA': [ 'race', 'asian' ],
    'RF': [ 'race', 'filipino' ],
    'RH': [ 'race', 'hispanic_latino' ],
    'RD': [ 'race', 'not_reported' ],
    'RP': [ 'race', 'pacific_islander' ],
    'RT': [ 'race', 'mixed' ],
    'RW': [ 'race', 'white' ],
    'GM': [ 'gender', 'male' ],
    'GF': [ 'gender', 'female' ],
    'GX': [ 'gender', 'nonbinary' ],
    'GZ': [ 'gender', 'not_reported' ],
    'SE': [ 'student', 'english_learner' ],
    'SD': [ 'student', 'disabled' ],
    'SS': [ 'student', 'socioeconomically_disadvantaged' ],
    'SM': [ 'student', 'migrant' ],
    'SF': [ 'student', 'foster' ],
    'SH': [ 'student', 'homeless' ],
    'GRTKKN': [ 'grade', 'TK-K' ],
    'GRTK8': [ 'grade', 'TK-8' ],
    'GRKN': [ 'grade', 'K' ],
    'GRK': [ 'grade', 'K' ],
    'GRK8': [ 'grade', 'K-8' ],
    'GR13': [ 'grade', '1-3' ],
    'GR46': [ 'grade', '4-6' ],
    'GR78': [ 'grade', '7-8' ],
    'GR912': [ 'grade', '9-12' ],
    'GRUG': [ 'grade', 'ungraded' ],
    'TA': [ 'total', '' ]
}

#
# Todo: build in null checks
#
school_codes = all_data[['school_code','school_name']].drop_duplicates()

all_data \
    [[ col for col in all_data if col != 'school_name']] \
    .assign( cohort = lambda _df: _df.cohort.map( cohort_renames ) ) \
    .assign( cohort_dimension = lambda _df: _df.cohort.apply( lambda _row: _row[0] ) ) \
    .assign( cohort = lambda _df: _df.cohort.apply( lambda _row: _row[1] ) ) \
    .to_csv( './sfusd-absenteeism-data.csv', index = False )
school_codes.to_csv( './sfusd-school-codes.csv', index = False )

In [95]:
#
# Test the data
#
school_search_string = 'Parks'
school_data = pd.read_csv( './sfusd-school-codes.csv' )[lambda _df: _df.school_name.str.contains( school_search_string )]
absenteeism_data = pd.read_csv( './sfusd-absenteeism-data.csv' ).merge( school_data )

In [98]:
absenteeism_data \
    .sort_values( [ 'cohort_dimension', 'cohort', 'academic_year' ] ) \
    .assign( cohort_absenteeism_rate = lambda _df: _df.cohort_absenteeism_count / _df.cohort_enrollment_count ) \
    [lambda _df: _df.cohort_dimension.isin( [ 'race', 'total' ] )] \
    [lambda _df: _df.cohort.isin( [ 'black', 'white', 'hispanic_latino', '' ] ) | pd.isnull( _df.cohort )]

Unnamed: 0,academic_year,school_code,cohort,cohort_enrollment_count,cohort_absenteeism_count,cohort_dimension,school_name,cohort_absenteeism_rate
133,2016-2017,6041503,black,100.0,0.0,race,Parks (Rosa) Elementary,0.0
112,2017-2018,6041503,black,92.0,30.0,race,Parks (Rosa) Elementary,0.326087
70,2018-2019,6041503,black,88.0,35.0,race,Parks (Rosa) Elementary,0.397727
28,2020-2021,6041503,black,89.0,40.0,race,Parks (Rosa) Elementary,0.449438
7,2021-2022,6041503,black,870.0,640.0,race,Parks (Rosa) Elementary,0.735632
49,2022-2023,6041503,black,830.0,500.0,race,Parks (Rosa) Elementary,0.60241
91,2023-2024,6041503,black,760.0,400.0,race,Parks (Rosa) Elementary,0.526316
136,2016-2017,6041503,hispanic_latino,98.0,0.0,race,Parks (Rosa) Elementary,0.0
115,2017-2018,6041503,hispanic_latino,104.0,24.0,race,Parks (Rosa) Elementary,0.230769
73,2018-2019,6041503,hispanic_latino,88.0,15.0,race,Parks (Rosa) Elementary,0.170455
