In [3]:
#get Combined Df Cell
import pandas as pd

# Load the second sheet (sheet_name=1) of every district workbook, using the
# first column of that sheet as the index. The workbooks are read in the order
# listed and unpacked into the matching names.
# NOTE(review): DIST_ACHEIVE is read from the same workbook as DIST_VA
# ('data/2324_VA_DIST_DETAILS.xlsx'). That looks like a copy-paste slip --
# confirm which file holds the achievement data and point it there.
(
    DIST_VA,
    DIST_GRAD_RATE,
    DIST_RACE_DIS,
    DIST_DETAIL,
    DIST_ACHEIVE,
    DIST_SPEND,
) = (
    pd.read_excel(workbook_path, sheet_name=1, index_col=0)
    for workbook_path in (
        'data/2324_VA_DIST_DETAILS.xlsx',
        'data/2024_DISTRICT_GRAD_RATE.xlsx',
        'data/2324_DIST_RACE_DIS.xlsx',
        'data/2024_District_Details.xlsx',
        'data/2324_VA_DIST_DETAILS.xlsx',
        'data/2324_DISTRICT_SPEND_PER_PUPIL.xlsx',
    )
)

# Clean District Value Added DF: the 'Watermark' column is not needed.
DIST_VA = DIST_VA.drop('Watermark', axis='columns')

# Clean District Graduation Rate DF: drop 'Watermark' plus the descriptive
# columns listed below.
DIST_GRAD_RATE = DIST_GRAD_RATE.drop(
    ['District Name', 'County', 'Region', 'Watermark'],
    axis='columns',
)
# Clean District Detail DF
import pandas as pd
import numpy as np

def transform_district_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reshape long-format district rows into one row per district.

    The input has one row per (district, student group). The output has one
    row per district with the enrollment of every student group spread into
    its own column, plus the attendance rate reported for 'All Students'.

    Args:
        df (pd.DataFrame): Input data whose index identifies the district and
            which has the columns 'Student Group', 'Enrollment' and
            'Attendance Rate'. The frame is not modified.

    Returns:
        pd.DataFrame: One row per district with these columns:
            * the district id (named after the input index, or 'District ID'
              when the index is unnamed),
            * 'Total Students' -- enrollment of the 'All Students' group
              (present only when that group exists in the input),
            * 'Total Attendance Percent' -- attendance rate of the
              'All Students' group (NaN when the district has none),
            * one integer enrollment column per remaining student group
              (0 where a district has no numeric value for that group).

    Raises:
        KeyError: If any of the required columns is missing.

    Notes:
        Non-numeric 'Enrollment' / 'Attendance Rate' values are coerced to
        NaN. When a district has several rows for the same student group the
        first non-null value is used, for both enrollment and attendance.
        Enrollment values are cast to int, so fractional values are truncated.
    """
    required_cols = ('Student Group', 'Enrollment', 'Attendance Rate')
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Input DataFrame is missing required column(s): {missing_cols}")

    # Work on a copy so the caller's frame (and its index name) stays untouched.
    data = df.copy()

    # The index is the district identifier; give it a name if it has none so
    # that it becomes a regular, addressable column after reset_index().
    district_id_col = data.index.name or 'District ID'
    data.index.name = district_id_col
    data = data.reset_index()

    # Placeholder strings and other non-numeric cells become NaN.
    data['Enrollment'] = pd.to_numeric(data['Enrollment'], errors='coerce')
    data['Attendance Rate'] = pd.to_numeric(data['Attendance Rate'], errors='coerce')

    # Spread student groups into columns. aggfunc='first' is explicit because
    # pivot_table defaults to the mean, which would silently average
    # duplicated rows and then be truncated by the int cast.
    pivoted = (
        data.pivot_table(
            index=district_id_col,
            columns='Student Group',
            values='Enrollment',
            aggfunc='first',
        )
        .fillna(0)
        .astype(int)
    )

    # District-wide attendance comes from the 'All Students' rows. Grouping
    # guarantees a unique index for the lookup below; Index.map raises when
    # the mapping Series has duplicate labels.
    is_all_students = data['Student Group'] == 'All Students'
    attendance_by_district = (
        data.loc[is_all_students]
        .groupby(district_id_col)['Attendance Rate']
        .first()
    )
    pivoted['Total Attendance Percent'] = pivoted.index.map(attendance_by_district)

    # rename() is a no-op when there is no 'All Students' column.
    pivoted = pivoted.rename(columns={'All Students': 'Total Students'})

    # Put the two district-wide totals first, keeping the group columns in
    # their existing order.
    if 'Total Students' in pivoted.columns:
        leading_cols = ['Total Students', 'Total Attendance Percent']
        group_cols = [col for col in pivoted.columns if col not in leading_cols]
        pivoted = pivoted[leading_cols + group_cols]

    # Turn the district id back into an ordinary column.
    return pivoted.reset_index()
# Collapse the long-format detail sheet to one row per district and key it by
# 'District IRN' so it lines up with the other frames.
DIST_DETAIL = transform_district_data(DIST_DETAIL).set_index('District IRN')

# Clean District Spend DF: drop the descriptive columns and the state-level
# expenditure columns listed below.
DIST_SPEND = DIST_SPEND.drop(columns=[
    'District Name',
    'County',
    'Region',
    'Watermark',
    'State-Level Expenditures per Equivalent Pupil',
    'State-Level Expenditures per Equivalent Pupil - Federal Funds',
    'State-Level Expenditures per Equivalent Pupil - State and Local Funds',
])

# Join Dataframes
# merge() defaults to an inner join, so only districts present in every frame
# survive.
combined_df = (
    DIST_VA
    .merge(DIST_GRAD_RATE, on='District IRN')
    .merge(DIST_DETAIL, on='District IRN')
    .merge(DIST_SPEND, on='District IRN')
)

# One row is removed by its index label.
# NOTE(review): presumably this is a District IRN that should be excluded from
# the analysis; the reason is not recorded here -- please document it.
EXCLUDED_DISTRICT_IRN = 48975
combined_df = combined_df.drop(index=EXCLUDED_DISTRICT_IRN)

# Convert numbers: these columns are cast to float as-is.
FLOAT_COLUMNS = [
    'Overall Composite',
    'Overall Effect Size',
    'Graduation Rate Component Percent (Weighted Graduation Rate)',
    'Four Year Graduation Rate - Class of 2023',
    'Four Year Graduation Rate Numerator - Class of 2023',
    'Four Year Graduation Rate Denominator - Class of 2023',
    'Five Year Graduation Rate - Class of 2022',
    'Five Year Graduation Rate Numerator - Class of 2022',
    'Five Year Graduation Rate Denominator - Class of 2022',
]
combined_df = combined_df.astype(dict.fromkeys(FLOAT_COLUMNS, float))

# Convert money: strip currency symbols and thousands separators, then cast to
# float. The character class already removes commas, so one replace suffices.
MONEY_COLUMNS = [
    'Expenditures per Equivalent Pupil',
    'Expenditures per Equivalent Pupil - Federal Funds',
    'Expenditures per Equivalent Pupil - State and Local Funds',
]
combined_df = combined_df.assign(**{
    money_col: combined_df[money_col].str.replace(r'[$,€£]', '', regex=True).astype(float)
    for money_col in MONEY_COLUMNS
})

# Convert stars: remove the ' Star' / ' Stars' suffix and cast to int.
STAR_COLUMNS = [
    'Progress Component Star Rating',
    'Graduation Rate Component Rating',
]
combined_df = combined_df.assign(**{
    star_col: combined_df[star_col].str.replace(r' Stars?', '', regex=True).astype(int)
    for star_col in STAR_COLUMNS
})



  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")


In [21]:
# NOTE(review): this cell raises KeyError (see the recorded output below)
# because combined_df has no 'Performance Index Percent 2023-2024' column.
# combined_df is built by merging only DIST_VA, DIST_GRAD_RATE, DIST_DETAIL and
# DIST_SPEND; DIST_ACHEIVE and DIST_RACE_DIS are loaded but never merged.
# Presumably the column lives in the achievement workbook -- confirm, load the
# right file and merge it before sorting.
combined_df.sort_values(by='Performance Index Percent 2023-2024',ascending=False)

KeyError: 'Performance Index Percent 2023-2024'