In [75]:
import pandas as pd
import os

# 1. Import CDE Raw Data 

## 1.1 Reading Raw Data

In [85]:
# Path to the CDE data file you downloaded and saved.
# Replace "SampleDrive/SampleFolder/SampleFile" with the correct path.
file_path = 'SampleDrive/SampleFolder/SampleFile'

# Fail early with a message that names the path, rather than printing a
# True/False flag and letting the read below fail on its own.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"CDE data file not found: {file_path!r}. "
        "Update 'file_path' to point at the downloaded file."
    )

# Read the file into a DataFrame.
# The file is read as tab-delimited; change `delimiter` (for example to ',')
# if your download uses a different separator.
df_cde = pd.read_csv(file_path, delimiter='\t')

# Show the first few rows to verify that the columns were parsed as expected
print(df_cde.head())

## 1.2 Filter the dataset to the schools, reporting categories, and columns of interest

In [77]:
# High schools of interest.
# Placeholder: replace [...] with the names of the high schools served by the
# college, written as they appear in the CDE data.
school_cde_lst = [...]

# ReportingCategory codes to retain, laid out one code prefix per line
reporting_categories_to_keep = [
    # GN_* codes
    "GN_F", "GN_M", "GN_X",
    # RE_* codes
    "RE_A", "RE_B", "RE_D", "RE_F", "RE_H", "RE_I", "RE_P", "RE_T", "RE_W",
    # SG_* codes
    "SG_DS", "SG_SD",
]

# Columns carried forward from the raw file: identifiers first, then the
# per-grade enrollment counts
columns_to_keep = [
    'AcademicYear',
    'SchoolName',
    'DistrictName',
    'ReportingCategory',
    'GR_09',
    'GR_10',
    'GR_11',
    'GR_12',
]


In [78]:
# Grade-level columns whose enrollment counts are summed into a row total
GR_columns = ['GR_09', 'GR_10', 'GR_11', 'GR_12']

# Keep only the columns listed in 'columns_to_keep'
selected_columns = df_cde.loc[:, columns_to_keep]

# Convert each grade column to a nullable integer ('Int64'):
# errors='coerce' turns any value that cannot be parsed as a number into a
# missing value instead of raising.
numeric_grades = {
    grade: pd.to_numeric(selected_columns[grade], errors='coerce').astype('Int64')
    for grade in GR_columns
}
df_cde_filtered = selected_columns.assign(**numeric_grades)

# Row-wise total across GR_09..GR_12, stored in a new 'TOTAL_ENR' column.
# Missing grade values are skipped by sum(), so they contribute 0 to the total.
df_cde_filtered['TOTAL_ENR'] = df_cde_filtered[GR_columns].sum(axis=1)

# To inspect the result, uncomment:
#df_cde_filtered.head()

In [79]:
# Step 1: keep only rows whose 'ReportingCategory' is one of the codes in
# 'reporting_categories_to_keep'
is_kept_category = df_cde_filtered['ReportingCategory'].isin(reporting_categories_to_keep)
df_cde_filtered_by_ReptCat = df_cde_filtered.loc[is_kept_category]

# Step 2: of those, keep only rows whose 'SchoolName' is in 'school_cde_lst'
# NOTE(review): the match is on SchoolName alone; if two districts have a
# school with the same name, both would be retained - confirm this cannot
# happen for the schools listed.
is_listed_school = df_cde_filtered_by_ReptCat['SchoolName'].isin(school_cde_lst)
df_cde_filtered_by_ReptCat_school = df_cde_filtered_by_ReptCat.loc[is_listed_school]

# The fully filtered long-format table, under the name used by the reshaping step
df_cde_long = df_cde_filtered_by_ReptCat_school

# Optional preview - uncomment to see the first 15 rows and the shape:
# print(df_cde_long.head(15))
# print("Shape of the final DataFrame:", df_cde_long.shape)



## 1.3 Pivot the data table from long format to wide format

In [80]:
# Readable names for the identifier columns and the ReportingCategory codes
columns_rename_dict = {
    # identifier columns
    'AcademicYear': 'Academic Year',
    'SchoolName': 'High School (CDE name)',
    # GN_* codes
    'GN_F': 'Gender_F',
    'GN_M': 'Gender_M',
    'GN_X': 'Gender_Non_Binary',
    # RE_* codes
    'RE_A': 'Asian',
    'RE_B': 'African American',
    'RE_D': 'Not Reported',
    'RE_F': 'Filipino',
    'RE_H': 'Hispanic or Latino',
    'RE_I': 'American Indian',
    'RE_P': 'Pacific Islander',
    'RE_T': 'Two or More Races',
    'RE_W': 'White',
    # SG_* codes
    'SG_DS': 'Students_with_Disabilities',
    'SG_SD': 'Socioeconomically Disadvantaged'
}

# Long -> wide: one row per (AcademicYear, SchoolName), one column per
# ReportingCategory, each cell holding the summed 'TOTAL_ENR'
enrollment_pivot = df_cde_long.pivot_table(
    values='TOTAL_ENR',
    index=['AcademicYear', 'SchoolName'],
    columns='ReportingCategory',
    aggfunc='sum',
)

# Replace the column index with a plain list of its labels; this drops the
# 'ReportingCategory' name that pivot_table attaches to the columns axis
enrollment_pivot.columns = list(enrollment_pivot.columns)

# Turn the (AcademicYear, SchoolName) index back into ordinary columns, then
# apply the readable column names
df_cde_wide = enrollment_pivot.reset_index().rename(columns=columns_rename_dict)

# To inspect the result, uncomment:
#print(df_cde_wide.head())


In [82]:
# Lookup table that pairs each high school's CDE name with the name used for
# the same school in the college data system. The two lists must have the same
# length and be in the same order (entry i of one corresponds to entry i of the other).
high_school_mapping_df = pd.DataFrame({
    "High School (CDE name)": [
        # List of CDE names of High Schools served by the college SA programs
        # These are the names as they appear in the source dataset, similar to "school_cde_lst"
    ],
    "HighSchool": [
        # List of names of the same High Schools, but as they are recognized in the college data system
        # These will be used for mapping in the merge operation
    ]
})

# Attach the college-system name to every row via a left merge on
# 'High School (CDE name)'; schools without a mapping entry keep their row and
# get a missing 'HighSchool' value.
df_cde_wide = (
    df_cde_wide
    # Drop a 'HighSchool' column left over from a previous run of this cell, so
    # re-running does not produce 'HighSchool_x' / 'HighSchool_y' columns.
    .drop(columns='HighSchool', errors='ignore')
    .merge(
        high_school_mapping_df,
        how='left',
        on='High School (CDE name)',
        # Each CDE name may appear only once in the mapping; a duplicate would
        # silently duplicate enrollment rows, so raise a MergeError instead.
        validate='many_to_one',
    )
)

# To verify that the 'HighSchool' column was added, uncomment:
#print(df_cde_wide.head())
#print(df_cde_wide.info())


In [None]:
# Ethnicity columns that make up the enrollment total
ethnicity_columns = [
    'African American', 'American Indian', 'Asian',
    'Filipino', 'Hispanic or Latino', 'Pacific Islander',
    'White', 'Two or More Races', 'Not Reported'
]

# Ethnicity counts with missing values replaced by 0.
# - Only these count columns are filled: filling the whole frame would also
#   write the number 0 into 'HighSchool' for schools without a mapping entry,
#   and sorting a column that mixes text and numbers raises a TypeError.
# - reindex() also creates (as all-zero) any ethnicity column that is absent
#   because no row in the data carried that category.
ethnicity_counts = df_cde_wide.reindex(columns=ethnicity_columns).fillna(0)

# Final column layout; any column of df_cde_wide not listed here is dropped
desired_columns_order = [
    'Academic Year', 'Total Enrollment', 'African American', 'American Indian', 'Asian',
    'Filipino', 'Hispanic or Latino', 'Pacific Islander', 'White',
    'Two or More Races', 'Not Reported', 'High School (CDE name)',
    'HighSchool'
]

df_cde_wide = (
    df_cde_wide
    # write the zero-filled counts back (adds any column that was missing)
    .assign(**{name: ethnicity_counts[name] for name in ethnicity_columns})
    # 'Total Enrollment' is the row-wise sum of the ethnicity columns
    .assign(**{'Total Enrollment': ethnicity_counts.sum(axis=1)})
    # keep and order the output columns
    .loc[:, desired_columns_order]
    # order by 'HighSchool', then 'Academic Year'; rows with a missing
    # 'HighSchool' are placed last
    .sort_values(by=['HighSchool', 'Academic Year'])
)

# To inspect the result, uncomment:
#print(df_cde_wide.head())

# 2. Output CDE data to an Excel File

In [84]:
# Path of the Excel file that will hold the cleaned-up CDE data.
# Replace "SampleDrive/SampleFolder" with the correct folder; the workbook is
# named "SpecialAdmit_Demos.xlsx".
# A dedicated variable is used (rather than reusing 'file_path') so the input
# path defined when reading the raw data is not overwritten.
output_file_path = 'SampleDrive/SampleFolder/SpecialAdmit_Demos.xlsx'

# Write the wide table to the Excel file, without the DataFrame index
df_cde_wide.to_excel(output_file_path, index=False)
