In [71]:
#
# Data merger for SBAC data
#

In [95]:
import numpy as np
import os
import pandas as pd
import re
from zipfile import ZipFile

In [108]:
# this works for 2024
def parse_2023_2024 ( df, academic_year ):
    academic_year = f'{academic_year - 1}-{academic_year}'

    areas = set( [ re.search( r'((?:Composite )?Area \d+)', col ).group( 1 ) for col in df if re.search( r'Area \d', col ) ] )
    area_summary = pd.DataFrame()
    overall_summary = df[[
        'School Code','Test Type','Test ID','Student Group ID','Grade',
        'Total Students Enrolled','Total Students Tested','Total Students Tested with Scores',
        'Mean Scale Score','Count Standard Not Met','Count Standard Nearly Met','Count Standard Met','Count Standard Exceeded'
    ]]

    for area in areas:
        columns = [ col for col in df if re.match( area, col ) ]
        is_composite = area[0] == 'C'
        area_number = re.search( r'(\d+)', area ).group( 1 )

        area_df = df \
            [['School Code','Test Type','Test ID','Student Group ID','Grade'] + columns] \
            .assign( area_composite = 'composite' if is_composite else 'area' ) \
            .assign( area_id = area_number ) \
            .rename( columns = { col: re.sub( r'(?:composite )?area \d+\s*', '', col.lower() ) for col in columns } )
        area_df = area_df \
            .rename( columns = { col: re.sub( r'[^a-zA-Z]+', '_', col.lower() ) for col in area_df } ) \
            .rename( columns = { 'total': 'count_total' } )
        
        area_summary = pd.concat( [ area_summary, area_df ], axis = 0, ignore_index = True )

    overall_summary = overall_summary \
        .rename( columns = { col: re.sub( r'[^a-z]+', '_', col.lower() ) for col in overall_summary } )
    overall_summary = overall_summary \
        .rename( columns = { col: re.sub( r'^total_', 'count_', col ) for col in overall_summary } )
    
    return ( overall_summary.assign( academic_year = academic_year ), area_summary.assign( academic_year = academic_year ) )



def parse_2022_2023 ( df, academic_year ):
    academic_year = f'{academic_year - 1}-{academic_year}'

    areas = set( [ re.search( r'((?:Composite )?Area \d+)', col ).group( 1 ) for col in df if re.search( r'Area \d', col ) ] )
    area_summary = pd.DataFrame()
    overall_summary = df[[
        'School Code','Test Type','Test ID','Student Group ID','Grade',
        'Students Enrolled','Students Tested','Students with Scores','Mean Scale Score',
        'Percentage Standard Not Met','Percentage Standard Nearly Met','Percentage Standard Met','Percentage Standard Exceeded'
    ]]

    for area in areas:
        columns = [ col for col in df if re.match( area, col ) ]
        is_composite = area[0] == 'C'
        area_number = re.search( r'(\d+)', area ).group( 1 )

        area_df = df \
            [['School Code','Test Type','Test ID','Student Group ID','Grade'] + columns] \
            .assign( area_composite = 'composite' if is_composite else 'area' ) \
            .assign( area_id = area_number ) \
            .rename( columns = { col: re.sub( r'(?:composite )?area \d+\s*', '', col.lower() ) for col in columns } )
        area_df = area_df \
            .rename( columns = { col: re.sub( r'[^a-zA-Z]+', '_', col.lower() ) for col in area_df } ) \
            .rename( columns = { 'total': 'count_total' } )
        
        area_summary = pd.concat( [ area_summary, area_df ], axis = 0, ignore_index = True )

    overall_summary = overall_summary \
        .rename( columns = { col: re.sub( r'[^a-z]+', '_', col.lower() ) for col in overall_summary } )
    overall_summary = overall_summary \
        .assign( **{
            f'count_standard_{key}':
            np.where(
                overall_summary[f'percentage_standard_{key}'] == '*',
                0,
                overall_summary[f'percentage_standard_{key}']
            ).astype( float )
            * np.where( overall_summary['students_with_scores'] == '*', 0, overall_summary['students_with_scores'] ).astype( int )
            / 100
            for key in [ 'not_met', 'nearly_met', 'met', 'exceeded' ]
        } ) \
        .drop( [ column for column in overall_summary if re.match( r'percentage_', column ) ], axis = 1 )
    overall_summary = overall_summary \
        .rename( columns = { col: re.sub( r'^total_', 'count_', col ) for col in overall_summary } )
    
    return ( overall_summary.assign( academic_year = academic_year ), area_summary.assign( academic_year = academic_year ) )


parsers = {
    '2024': parse_2023_2024,
    '2023': parse_2022_2023,
    # '2022': parse_2021_2022,
    # '2021': parse_2020_2021
}

In [111]:
raw_data_directory = '../raw/'
san_francisco_county_code = 38

parsed_dfs = []
for filename in os.listdir( raw_data_directory ):
    print( f'Loading {filename}...' )

    if not re.match( r'sb_ca.*?\.zip', filename ):
        continue

    sbac_year = re.search( r'sb_ca(\d{4})', filename ).group( 1 )
    if sbac_year not in parsers:
        print( '> Cannot find parser; skipping...' )
        continue

    full_filename = f'{raw_data_directory}{filename}'
    with ZipFile( full_filename, 'r' ) as zip:
        sbac_filename = [ zip_filename for zip_filename in zip.namelist() if re.match( r'sb_ca.*?_all_csv.*?.txt', zip_filename ) ][0]
        with zip.open( sbac_filename ) as sbac_file:
            df = pd.read_csv( sbac_file, sep = '^', encoding = 'latin1' )[lambda _df: _df['County Code'] == san_francisco_county_code]
            parsed_dfs.append( parsers[sbac_year]( df, int( sbac_year ) ) )

Loading sb_ca2021_all_csv_v2.zip...
> Cannot find parser; skipping...
Loading .DS_Store...
Loading StudentGroups.zip...
Loading Tests.txt...
Loading StudentGroups (3).zip...
Loading sb_ca2022_all_csv_v1.zip...
> Cannot find parser; skipping...
Loading sb_ca2023_all_csv_v1.zip...
Loading StudentGroups (2).zip...
Loading sb_ca2024_all_csv_v1.zip...
Loading sb_ca2023entities_csv.zip...


IndexError: list index out of range

In [None]:
#
# Load and store essential keys
#

test_codes = pd \
    .read_csv( f'{raw_data_directory}Tests.txt', sep = '^' ) \
    .rename( columns = { 'Test ID': 'test_id', 'Test Name': 'test_name' } ) \
    [['test_id','test_name']] \
    .assign( test_name = lambda _df: _df.test_name.apply( lambda _row: _row.split( ' - ' ) ) ) \
    .assign( test_suite_name = lambda _df: _df.test_name.apply( lambda _row: _row[0] ) ) \
    .assign( test_name = lambda _df: _df.test_name.apply( lambda _row: _row[1] ) )
test_codes.to_csv( './sfusd-test-codes.csv', index = False )

student_groups = pd \
    .read_csv( f'{raw_data_directory}StudentGroups.txt', sep = '^', encoding = 'latin1' ) \
    .rename( columns = {
        'Demographic ID': 'demographic_id',
        'Demographic Name': 'demographic_name',
        'Student Group': 'demographic_dimension'
    } ) \
    [['demographic_id','demographic_dimension','demographic_name']]
student_groups.to_csv( './sfusd-student-groups.csv', index = False )

In [110]:
parsed_dfs

[(         school_code test_type  test_id  student_group_id  grade  \
  2975893            0         B        1                 1      3   
  2975894            0         B        1                 3      3   
  2975895            0         B        1                 4      3   
  2975896            0         B        1                 6      3   
  2975897            0         B        1                 7      3   
  ...              ...       ...      ...               ...    ...   
  3018866       137307         B        2               220     13   
  3018867       137307         B        2               226     13   
  3018868       137307         B        2               241     13   
  3018869       137307         B        2               251     13   
  3018870       137307         B        2               252     13   
  
          students_enrolled students_tested students_with_scores  \
  2975893              3979            3821                 3818   
  2975894            