In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Import Special Admit Foundation Data

In [2]:
# Define the file path to import Special Admit Foundation Data
# Replace "sampledrive\samplefolder\samplefile" with correct path ("drive\folder\file")
file_path = r'C:\Project Files\CDE 2024 Python\RP_Conference_Share\Special Admit Foundation_sample.csv'

#check to see if the file path exists  
print(os.path.exists(file_path))  # Return "True" or "Flase"

# Read the file into a DataFrame named "df_ecc_2324"
# Assuming the file is delimited by a specific character, for example, a comma. 
# Adjust the delimiter as needed.
df_ecc_2324 = pd.read_csv(file_path, delimiter=',', low_memory=False)  # Change delimiter if necessary

# (Optional) Display the first few rows and the shape of the DataFrame to verify
print(df_ecc_2324.head())

True
   StudentID         Ethnicity     AcYr  EnrollType AdmitType HighSchool  \
0          1            Latinx  2023-24  Concurrent      Conc          A   
1          2             Asian  2023-24        CCAP      Dual          B   
2          3   Multi-Ethnicity  2023-24  Concurrent      Conc          C   
3          4            Latinx  2023-24  Concurrent      Conc          D   
4          5  African American  2023-24  Concurrent      Conc          D   

               District College  
0  High School District    ABCD  
1  High School District    ABCD  
2  High School District    ABCD  
3  High School District    ABCD  
4  High School District    ABCD  


### Check on Required Columns/Values

In [3]:
# Verify the required columns and values for processing

# Required column names
required_columns = ['StudentID', 'Ethnicity', 'AcYr', 'AdmitType', 'EnrollType', 'HighSchool']

# Check for missing columns
missing_columns = [col for col in required_columns if col not in df_ecc_2324.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {missing_columns}")
else:
    print("All required columns are present.")

# Verify required values in 'EnrollType'
required_enroll_types = {"NCD", "CCAP", "Concurrent"}
actual_enroll_types = set(df_ecc_2324['EnrollType'].unique())
missing_enroll_types = required_enroll_types - actual_enroll_types
if missing_enroll_types:
    raise ValueError(f"Missing required values in 'EnrollType': {missing_enroll_types}")
else:
    print("All required 'EnrollType' values are present.")

# Verify required values in 'AdmitType'
required_admit_types = {"Conc", "Dual"}
actual_admit_types = set(df_ecc_2324['AdmitType'].unique())
missing_admit_types = required_admit_types - actual_admit_types
if missing_admit_types:
    raise ValueError(f"Missing required values in 'AdmitType': {missing_admit_types}")
else:
    print("All required 'AdmitType' values are present.")

# Optionally rename columns if needed
i_renameCol = 0  # Set to 1 if renaming is required

if i_renameCol == 1:
    # Define renaming map
    rename_dict = {
        'OldColumnName1': 'NewColumnName1',
        'OldColumnName2': 'NewColumnName2',
        'OldColumnName3': 'NewColumnName3',
        # Add more mappings if needed
    }
    
    # Rename columns
    df_ecc_2324 = df_ecc_2324.rename(columns=rename_dict)

    # Display updated dataframe
    print("Updated Dataframe Preview:")
    print(df_ecc_2324.head())

# Output to confirm successful validation
print("Data validation completed successfully.")

All required columns are present.
All required 'EnrollType' values are present.
All required 'AdmitType' values are present.
Data validation completed successfully.


# 2. Aggragate Students by 'AcYr', 'EnrollType'

In [4]:
# Step 1: Filter out rows with missing values in the 'HighSchool' column.
# Students without high school information are excluded from the analysis.
filtered_data = df_ecc_2324.dropna(subset=['HighSchool'])

# Step 2: Aggregate the number of unique students ('StudentID') by 'AcYr' (Academic Year) and 'EnrollType'.
aggregated_data = filtered_data.groupby(['AcYr', 'EnrollType']).agg({'StudentID': 'nunique'}).reset_index()

# Step 3: Rename the aggregated 'StudentID' column to 'HeadCount' to represent the student count.
aggregated_data = aggregated_data.rename(columns={'StudentID': 'HeadCount'})

# Step 4: Pivot the aggregated data to a wide format with 'EnrollType' as columns and student counts ('HeadCount') as values.
wide_table_1 = aggregated_data.pivot_table(index='AcYr', columns=['EnrollType'], values='HeadCount', fill_value=0)

# Step 5: Aggregate the number of unique students ('StudentID') by 'AcYr' to get the total headcount for all special admits (SA), across all 'EnrollType'.
temp = filtered_data.groupby(['AcYr']).agg({'StudentID': 'nunique'})

# Step 6: Rename the aggregated 'StudentID' column in the temporary table to 'SA' to represent special admit totals.
temp = temp.rename(columns={'StudentID': 'SA'})

# Step 7: Combine the aggregated 'EnrollType' data and the 'SA' totals into a single wide-format table.
wide_table_1 = pd.concat([wide_table_1, temp], axis=1)

# Step 8: Replace "ABCD" with your college Abbreviation (e.g., "SDCCD").
# Add "ABCD_' as a prefix and '_Total' as a suffix to all column names to standardize the naming format.
wide_table_1.columns = ['ABCD_' + ''.join(col).strip() + '_Total' for col in wide_table_1.columns.values]

# Step 9: Reset the index to move 'AcYr' from the index to a regular column for better accessibility.
wide_table_1.reset_index(inplace=True)

# (Optional) Display the first few rows and the shape of the final wide-format table for verification.
# Uncomment the lines below to inspect the output.
print(wide_table_1.head())
# print(wide_table_1.shape)

      AcYr  ABCD_CCAP_Total  ABCD_Concurrent_Total  ABCD_NCD_Total  \
0  2021-22            236.0                    9.0           192.0   
1  2022-23             63.0                    3.0           166.0   
2  2023-24            374.0                   45.0            50.0   

   ABCD_SA_Total  
0            437  
1            232  
2            469  


# 3: Aggregate students by'AcYr', 'EnrollType', and 'Ethnicity'

In [5]:
# Aggregate students by 'AcYr', 'EnrollType', and 'Ethnicity'.
aggregated_data = filtered_data.groupby(['AcYr', 'EnrollType', 'Ethnicity']).agg({'StudentID': 'nunique'}).reset_index()

# Rename the 'StudentID' column to 'HeadCount' to indicate the number of unique students.
aggregated_data = aggregated_data.rename(columns={'StudentID': 'HeadCount'})

# Pivot the aggregated data to a wide format with combined 'EnrollType' and 'Ethnicity' as column headers.
wide_table_2 = aggregated_data.pivot_table(index='AcYr', columns=['EnrollType', 'Ethnicity'], values='HeadCount', fill_value=0)

# Rename columns by combining 'EnrollType' and 'Ethnicity' into a single string and adding the 'ABCD_' prefix.
wide_table_2.columns = ['ABCD_' + '_'.join(col).strip() for col in wide_table_2.columns.values]

# Aggregate students by 'AcYr' and 'Ethnicity' to get the headcounts of all special admit (SA) students by ethnicity.
temp = filtered_data.groupby(['AcYr', 'Ethnicity']).agg({'StudentID': 'nunique'})

# Rename the 'StudentID' column in the temporary dataframe to 'HeadCount'.
temp = temp.rename(columns={'StudentID': 'HeadCount'})

# Pivot the temporary data to a wide format with 'Ethnicity' as column headers.
temp = temp.pivot_table(index='AcYr', columns='Ethnicity', values='HeadCount', fill_value=0)

# Rename columns in the temporary dataframe by adding the 'ABCD_SA_' prefix to the ethnicity names.
temp.columns = ['ABCD_SA_' + ''.join(col).strip() for col in temp.columns.values]

# Combine the two aggregated dataframes (wide_table_2 and temp) side-by-side.
wide_table_2 = pd.concat([wide_table_2, temp], axis=1)

# Reset the index to make 'AcYr' a regular column.
wide_table_2.reset_index(inplace=True)

# (Optional) Display the first few rows and the shape of the final wide-format table for verification.
# Uncomment the lines below to inspect the output.
print(wide_table_2.head())
#print(wide_table_2.shape)

      AcYr  ABCD_CCAP_African American  ABCD_CCAP_Asian  ABCD_CCAP_Filipino  \
0  2021-22                        11.0             43.0                39.0   
1  2022-23                         4.0             12.0                 9.0   
2  2023-24                        28.0             76.0                44.0   

   ABCD_CCAP_Latinx  ABCD_CCAP_Multi-Ethnicity  ABCD_CCAP_Native American  \
0              74.0                       17.0                        0.0   
1              23.0                        5.0                        0.0   
2             112.0                       30.0                        2.0   

   ABCD_CCAP_Pacific Islander  ABCD_CCAP_Unreported  ABCD_CCAP_White  ...  \
0                         0.0                   1.0             51.0  ...   
1                         1.0                   3.0              6.0  ...   
2                         2.0                   4.0             76.0  ...   

   ABCD_NCD_White  ABCD_SA_African American  ABCD_SA_Asian  ABCD_

# 4. Aggragate Students by 'AcYr', 'AdmitType'

In [6]:
# Aggregate the number of unique students ('StudentID') by 'AcYr' (Academic Year) and 'AdmitType'.
aggregated_data = filtered_data.groupby(['AcYr', 'AdmitType']).agg({'StudentID': 'nunique'}).reset_index()

# Rename the aggregated 'StudentID' column to 'HeadCount' to indicate student counts.
aggregated_data = aggregated_data.rename(columns={'StudentID': 'HeadCount'})

# Pivot the data to a wide format where each 'AdmitType' becomes a column, with student counts as values.
wide_table_3 = aggregated_data.pivot_table(index='AcYr', columns=['AdmitType'], values='HeadCount', fill_value=0)

# Rename columns by prefixing them with 'ABCD_' and suffixing them with '_Total' for clarity.
wide_table_3.columns = ['ABCD_' + col + '_Total' for col in wide_table_3.columns]

# Reset the index to convert 'AcYr' from the index back into a regular column.
wide_table_3.reset_index(inplace=True)

# (Optional) Display the first few rows and the shape of the final wide-format table for verification.
# Uncomment the lines below to inspect the output.
print(wide_table_3.head())
#print(wide_table_3.shape)

      AcYr  ABCD_Conc_Total  ABCD_Dual_Total
0  2021-22             15.0            422.0
1  2022-23              4.0            228.0
2  2023-24             45.0            424.0


# 5. Aggragate Students by 'AcYr', 'AdmitType', 'Ethnicity'

In [7]:
# Aggregate the number of unique students ('StudentID') by 'AcYr' (Academic Year), 'AdmitType', and 'Ethnicity'.
aggregated_data = filtered_data.groupby(['AcYr', 'AdmitType', 'Ethnicity']).agg({'StudentID': 'nunique'}).reset_index()

# Rename the aggregated 'StudentID' column to 'HeadCount' to represent the count of students.
aggregated_data = aggregated_data.rename(columns={'StudentID': 'HeadCount'})

# Pivot the data to a wide format where each combination of 'AdmitType' and 'Ethnicity' becomes a column, 
# with student counts ('HeadCount') as values.
wide_table_4 = aggregated_data.pivot_table(index='AcYr', columns=['AdmitType', 'Ethnicity'], values='HeadCount', fill_value=0)

# Flatten the hierarchical columns by combining 'AdmitType' and 'Ethnicity' into a single string.
# Add a 'ABCD_' prefix to the column names for better identification.
wide_table_4.columns = ['ABCD_' + '_'.join(col).strip() for col in wide_table_4.columns.values]

# Reset the index to move 'AcYr' from the index back into a regular column.
wide_table_4.reset_index(inplace=True)

# (Optional) Display the first few rows and the shape of the final wide-format table for verification.
# Uncomment the lines below to inspect the output.
print(wide_table_4.head())
# print(wide_table_4.shape)



      AcYr  ABCD_Conc_African American  ABCD_Conc_Asian  ABCD_Conc_Filipino  \
0  2021-22                         0.0              1.0                 0.0   
1  2022-23                         0.0              0.0                 1.0   
2  2023-24                         2.0              3.0                 0.0   

   ABCD_Conc_Latinx  ABCD_Conc_Multi-Ethnicity  ABCD_Conc_Unreported  \
0               5.0                        3.0                   0.0   
1               1.0                        0.0                   0.0   
2              17.0                        5.0                   2.0   

   ABCD_Conc_White  ABCD_Dual_African American  ABCD_Dual_Asian  \
0              6.0                        20.0             69.0   
1              2.0                        13.0             40.0   
2             16.0                        29.0             80.0   

   ABCD_Dual_Filipino  ABCD_Dual_Latinx  ABCD_Dual_Multi-Ethnicity  \
0                76.0             126.0                

# 6. Combine All above Aggretated Data Together

In [8]:
# Concatenate tables horizontally based on common columns
combined_table = pd.concat([wide_table_1.set_index("AcYr"),
                            wide_table_2.set_index("AcYr"),
                            wide_table_3.set_index("AcYr"),
                            wide_table_4.set_index("AcYr")],
                            axis=1)

# Reset index to convert 'AcYr' back to columns
combined_table.reset_index(inplace=True)
combined_table.shape

#drop duplicated columns, i.e,.'ABCD_Concurrent_African America' and 'ABCD_Conc_African America ' are duplicates, and so on 
#for example, 'SDCCD_Concurrent_African America' is a duplicate of 'SDCCD_Conc_African America ' 
# List of potential columns to drop
columns_to_drop = [
    'ABCD_Conc_Total', 'ABCD_Concurrent_African American',
    'ABCD_Concurrent_Asian', 'ABCD_Concurrent_Filipino',
    'ABCD_Concurrent_Latinx', 'ABCD_Concurrent_Multi-Ethnicity',
    'ABCD_Concurrent_Native American', 'ABCD_Concurrent_Pacific Islander',
    'ABCD_Concurrent_Unreported', 'ABCD_Concurrent_White'
]

# Filter the list to only include columns that exist in combined_table.columns
columns_to_drop = [col for col in columns_to_drop if col in combined_table.columns]


# Drop the specified columns
combined_table.drop(columns=columns_to_drop, inplace=True)
# Rename columns as needed
combined_table = combined_table.rename(columns={'ABCD_Concurrent_Total': 'ABCD_Conc_Total'})



# Adding the constant column "ABCD" with the value "Overall" as the first column
combined_table.insert(0, "ABCD","Overall")

# 6. Import HighSchool Aggregated Data
#### The HighSchool Aggregated Data was Produced by "Sample_SpecialAdmit_Headcount_by_HighSchool.ipynb"

In [9]:
# Define the file path where your Highschool aggreated data is stored. 
# Replace "sampledrive\samplefolder\samplefile" with correct path ("drive\folder\file")
file_path = r'C:\Project Files\CDE 2024 Python\RP_Conference_Share\Special Admit Heaadcount_by_HioghSchool.xlsx'

# Check if file exists before reading
if os.path.exists(file_path):
    # Load data into a DataFrame
    df_agg_HS = pd.read_excel(file_path)
    print("File loaded successfully.")
else:
    print(f"Error: The file at {file_path} does not exist.")

File loaded successfully.


#### Formatiing HighSchool Aggregated Data and the ablove Combined_table for concatenation

In [10]:
# Step 1: Add "Level of Aggregation" column with value "HighSchool" in df_agg_HS
df_agg_HS.insert(0, "Level of Aggregation", "HighSchool")

# Step 2: Rename the 'ABCD' column in combined_table to 'School_ABCD' for consistency
combined_table = combined_table.rename(columns={'ABCD': 'School_ABCD'})

# Step 3: Add "Level of Aggregation" column with value "Overall" in combined_table
combined_table.insert(0, "Level of Aggregation", "Overall")

# Step 4: Concatenate the two dataframes df_agg_HS and combined_table vertically (along rows)
df_combined = pd.concat([df_agg_HS, combined_table], ignore_index=True)

# Optionally, display the first few rows of the combined dataframe
# Uncomment the lines below to inspect the output.
#print(df_combined.head())


# 7. Output the concatenated table 

In [11]:
# Step 1: Define the file path where the Excel file will be saved
# Replace "sampledrive\samplefolder\samplefile" with correct path ("drive\folder\file")

file_path = r'C:\Project Files\CDE 2024 Python\RP_Conference_Share\Special Admit Heaadcount_combined.xlsx'

# Step 2: Save the aggregated data (df_combined) to an Excel file
df_combined.to_excel(file_path, index=False)

# Optionally, print a confirmation message
print(f"Combined data has been saved to: {file_path}")

Combined data has been saved to: C:\Project Files\CDE 2024 Python\RP_Conference_Share\Special Admit Heaadcount_combined.xlsx
