# 1. Import Python Packages

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Import "Special Admit Foundation" Data

In [1]:
# Define the file path to import Special Admit Foundation Data.
# Replace "sampledrive\samplefolder\samplefile" with the correct path ("drive\folder\file").
# The literal is a raw string (r'...') so that backslashes in a Windows path are kept
# verbatim instead of being interpreted as escape sequences (e.g. "\n", "\t").
file_path = r'sampledrive\samplefolder\samplefile'

# Check to see if the file path exists
print(os.path.exists(file_path))  # Prints "True" or "False"

# Read the file into a DataFrame named "df_ecc_2324".
# The file is assumed to be comma-delimited; adjust the delimiter as needed.
# low_memory=False makes pandas read the file in one pass when inferring column types.
df_ecc_2324 = pd.read_csv(file_path, delimiter=',', low_memory=False)  # Change delimiter if necessary

# (Optional) Display the first few rows of the DataFrame to verify the import
print(df_ecc_2324.head())

### Check on Required Columns/Values

In [2]:
# Print a summary of the imported DataFrame: one line per column with its
# non-null count and dtype, followed by the overall memory usage.
# Use it to spot columns with unexpected types or many missing values.
df_ecc_2324.info()

In [None]:
# Verify the required columns and values for processing.
# The cell stops with a ValueError at the first failed check.

# Optionally rename columns if needed.
# This runs BEFORE the checks below so that a source file whose headers differ
# from the expected names can be mapped onto them first; otherwise the
# missing-column check would raise before the rename could ever take effect.
i_renameCol = 0  # Set to 1 if renaming is required

if i_renameCol == 1:
    # Define renaming map (source header -> expected header)
    rename_dict = {
        'OldColumnName1': 'NewColumnName1',
        'OldColumnName2': 'NewColumnName2',
        'OldColumnName3': 'NewColumnName3',
        # Add more mappings if needed
    }

    # Rename columns
    df_ecc_2324 = df_ecc_2324.rename(columns=rename_dict)

    # Display updated dataframe
    print("Updated Dataframe Preview:")
    print(df_ecc_2324.head())

# Required column names
required_columns = ['Emplid', 'Ethnicity', 'AcYr', 'AdmitType', 'EnrollType', 'HighSchool']

# Check for missing columns
missing_columns = [col for col in required_columns if col not in df_ecc_2324.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {missing_columns}")
print("All required columns are present.")

# Verify required values in 'EnrollType'
required_enroll_types = {"non-CCAP Dual", "CCAP", "Concurrent"}
actual_enroll_types = set(df_ecc_2324['EnrollType'].unique())
missing_enroll_types = required_enroll_types - actual_enroll_types
if missing_enroll_types:
    raise ValueError(f"Missing required values in 'EnrollType': {missing_enroll_types}")
print("All required 'EnrollType' values are present.")

# Verify required values in 'AdmitType'
required_admit_types = {"Conc", "Dual"}
actual_admit_types = set(df_ecc_2324['AdmitType'].unique())
missing_admit_types = required_admit_types - actual_admit_types
if missing_admit_types:
    raise ValueError(f"Missing required values in 'AdmitType': {missing_admit_types}")
print("All required 'AdmitType' values are present.")

# Output to confirm successful validation
print("Data validation completed successfully.")

 ### Clean up "HighSchool" for missing values, if needed

In [3]:
# Frequency of every 'HighSchool' value before cleanup (NaN included), to expose
# missing or inconsistently coded school names.
print(df_ecc_2324['HighSchool'].value_counts(dropna=False))

# Placeholder strings found in the data that stand for "no high school reported":
# an empty string, a pair of double quotes, and a single space.
# They are collapsed into the single label 'Unreported' so that missing
# schools are counted consistently in the aggregations.
missing_markers = ['', '""', ' ']
df_ecc_2324['HighSchool'] = df_ecc_2324['HighSchool'].replace(
    to_replace=missing_markers,
    value='Unreported',
)

# Frequency table again, to confirm the placeholders were replaced.
print(df_ecc_2324['HighSchool'].value_counts(dropna=False))

 ### Check Columns "AcYr", "EnrollType", "Ethnicity", "AdmitType"

In [4]:
# Print the frequency table (NaN included) of each key categorical column, in turn:
#   EnrollType - distribution of enroll types
#   AcYr       - academic years present in the dataset
#   Ethnicity  - distribution of ethnicities
#   AdmitType  - distribution of admit types
# Including NaN makes missing values visible in every table.
for column_name in ('EnrollType', 'AcYr', 'Ethnicity', 'AdmitType'):
    print(df_ecc_2324[column_name].value_counts(dropna=False))

 ### Two-way contingency tables to examine distribution of data, if needed

In [5]:
# Build a two-way frequency table: one row per 'HighSchool', one column per
# 'EnrollType', each cell holding the number of records for that pair.
school_by_enroll_type = pd.crosstab(
    index=df_ecc_2324['HighSchool'],
    columns=df_ecc_2324['EnrollType'],
)
print(school_by_enroll_type)

 ### Visually Examine Overall Missing Data 

In [9]:
# Change i_missing to 1 to visually check missing data across all variables
i_missing = 0

if i_missing == 1:
    # Reshape the boolean "is missing" mask to long format (one row per cell, with
    # columns "variable" and "missing") and draw, for every variable, a bar filled
    # proportionally by missing vs. present values.
    # sns.displot is a figure-level function: it creates its own figure and is sized
    # through `height`/`aspect`, so no plt.figure() call is made beforehand (it would
    # only leave an empty extra figure behind).
    sns.displot(
        data=df_ecc_2324.isna().melt(value_name="missing"),
        y="variable",
        hue="missing",
        multiple="fill",
        aspect=1.25,
    )

# 3. Data Aggregation

## 3.1. Aggregate Students by 'HighSchool', 'AcYr', 'EnrollType'

In [10]:
# Step 1: Keep only the records that carry a 'HighSchool' value (NaN rows are excluded).
# 'filtered_data' is reused by the later aggregation cells.
filtered_data = df_ecc_2324[df_ecc_2324['HighSchool'].notna()]

# Step 2: Unique student headcount per 'HighSchool' / 'AcYr' / 'EnrollType'.
# Counting distinct 'Emplid' values means a student is counted once per combination.
aggregated_data = (
    filtered_data.groupby(['HighSchool', 'AcYr', 'EnrollType'])['Emplid']
    .nunique()
    .reset_index(name='HeadCount')
)

# Step 3: Reshape to wide format: one row per school/year, one column per 'EnrollType',
# with 0 where a school/year has no students of that type.
wide_table_1 = pd.pivot_table(
    aggregated_data,
    index=['HighSchool', 'AcYr'],
    columns=['EnrollType'],
    values='HeadCount',
    fill_value=0,
)

# Step 4: Unique student headcount per school/year across all enroll types, stored as 'SA'.
total_headcount = (
    filtered_data.groupby(['HighSchool', 'AcYr'])['Emplid']
    .nunique()
    .to_frame(name='SA')
)

# Step 5: Place the per-type headcounts and the overall 'SA' headcount side by side,
# aligned on the shared (HighSchool, AcYr) index.
wide_table_1 = pd.concat([wide_table_1, total_headcount], axis=1)

# Step 6: Standardize the column names as "SampleCollege_<name>_Total".
# Replace "SampleCollege" with your college abbreviation (e.g., "SDCCD"),
# so that "CCAP" becomes "SampleCollege_CCAP_Total" (e.g., "SDCCD_CCAP_Total").
wide_table_1.columns = [
    f"SampleCollege_{''.join(col).strip()}_Total" for col in wide_table_1.columns
]

# Steps 7-8: Turn 'HighSchool' and 'AcYr' back into regular columns, then rename
# 'HighSchool' to 'School_SampleCollege' to match the key used in later joins.
wide_table_1 = wide_table_1.reset_index().rename(
    columns={'HighSchool': 'School_SampleCollege'}
)

# Step 9: (Optional)
# Display the first few rows and the shape of the final wide-format table for verification.
# Uncomment the lines below to inspect the output.
# print(wide_table_1.head())
# print(wide_table_1.shape)


## 3.2. Aggregate students by 'HighSchool', 'AcYr', 'EnrollType', and 'Ethnicity'

In [11]:
# Step 1: Unique student headcount per 'HighSchool' / 'AcYr' / 'EnrollType' / 'Ethnicity'.
# Counting distinct 'Emplid' values means a student is counted once per combination.
aggregated_data = (
    filtered_data.groupby(['HighSchool', 'AcYr', 'EnrollType', 'Ethnicity'])['Emplid']
    .nunique()
    .reset_index(name='HeadCount')
)

# Step 2: Reshape to wide format: one row per school/year, one column per
# (EnrollType, Ethnicity) pair, with 0 where a pair has no students.
wide_table_2 = pd.pivot_table(
    aggregated_data,
    index=['HighSchool', 'AcYr'],
    columns=['EnrollType', 'Ethnicity'],
    values='HeadCount',
    fill_value=0,
)

# Step 3: Flatten the two-level column labels into "SampleCollege_<EnrollType>_<Ethnicity>".
wide_table_2.columns = [
    f"SampleCollege_{'_'.join(col).strip()}" for col in wide_table_2.columns
]

# Step 4: Unique student headcount per school/year/ethnicity across all enroll types.
# The grouping keys stay in the index; the pivot below refers to them by name.
temp = (
    filtered_data.groupby(['HighSchool', 'AcYr', 'Ethnicity'])['Emplid']
    .nunique()
    .to_frame(name='HeadCount')
)

# Step 5: Reshape those totals to wide format with one column per 'Ethnicity'.
temp = pd.pivot_table(
    temp,
    index=['HighSchool', 'AcYr'],
    columns='Ethnicity',
    values='HeadCount',
    fill_value=0,
)

# Step 6: Label the total-by-ethnicity columns as "SampleCollege_SA_<Ethnicity>".
temp.columns = [f"SampleCollege_SA_{col.strip()}" for col in temp.columns]

# Step 7: Place the EnrollType-by-Ethnicity counts and the SA-by-Ethnicity totals
# side by side, aligned on the shared (HighSchool, AcYr) index.
wide_table_2 = pd.concat([wide_table_2, temp], axis=1)

# Steps 8-9: Turn 'HighSchool' and 'AcYr' back into regular columns, then rename
# 'HighSchool' to 'School_SampleCollege' for consistency and future joins.
wide_table_2 = wide_table_2.reset_index().rename(
    columns={'HighSchool': 'School_SampleCollege'}
)

# Step 10: (Optional)
# Display the first few rows and the shape of the final wide-format table for verification.
# Uncomment the lines below to inspect the output.
# print(wide_table_2.head())
# print(wide_table_2.shape)


## 3.3. Aggregate Students by 'HighSchool', 'AcYr', 'AdmitType'

In [12]:
# Steps 1-2: Unique student headcount per 'HighSchool' / 'AcYr' / 'AdmitType',
# stored in a column named 'HeadCount'.
aggregated_data = (
    filtered_data.groupby(['HighSchool', 'AcYr', 'AdmitType'])['Emplid']
    .nunique()
    .reset_index(name='HeadCount')
)

# Step 3: Reshape to wide format: one row per school/year, one column per 'AdmitType',
# with 0 where a school/year has no students of that type.
wide_table_3 = pd.pivot_table(
    aggregated_data,
    index=['HighSchool', 'AcYr'],
    columns='AdmitType',
    values='HeadCount',
    fill_value=0,
)

# Step 4: Label each admit-type column as "SampleCollege_<AdmitType>_Total".
wide_table_3.columns = [
    ''.join(('SampleCollege_', col, '_Total')) for col in wide_table_3.columns
]

# Steps 5-6: Turn 'HighSchool' and 'AcYr' back into regular columns, then rename
# 'HighSchool' to 'School_SampleCollege' for consistency in joins.
wide_table_3 = wide_table_3.reset_index().rename(
    columns={'HighSchool': 'School_SampleCollege'}
)

# Step 7: (Optional)
# Display the first few rows and the shape of the final wide-format table for verification.
# Uncomment the lines below to inspect the output.
# print(wide_table_3.head())
# print(wide_table_3.shape)


## 3.4. Aggregate Students by 'HighSchool', 'AcYr', 'AdmitType', 'Ethnicity'

In [13]:
# Steps 1-2: Unique student headcount per 'HighSchool' / 'AcYr' / 'AdmitType' / 'Ethnicity',
# stored in a column named 'HeadCount'.
aggregated_data = (
    filtered_data.groupby(['HighSchool', 'AcYr', 'AdmitType', 'Ethnicity'])['Emplid']
    .nunique()
    .reset_index(name='HeadCount')
)

# Step 3: Reshape to wide format: one row per school/year, one column per
# (AdmitType, Ethnicity) pair, with 0 where a pair has no students.
wide_table_4 = pd.pivot_table(
    aggregated_data,
    index=['HighSchool', 'AcYr'],
    columns=['AdmitType', 'Ethnicity'],
    values='HeadCount',
    fill_value=0,
)

# Step 4: Flatten the two-level column labels into "SampleCollege_<AdmitType>_<Ethnicity>".
wide_table_4.columns = [
    f"SampleCollege_{'_'.join(col).strip()}" for col in wide_table_4.columns
]

# Steps 5-6: Turn 'HighSchool' and 'AcYr' back into regular columns, then rename
# 'HighSchool' to 'School_SampleCollege' for consistency in naming.
wide_table_4 = wide_table_4.reset_index().rename(
    columns={'HighSchool': 'School_SampleCollege'}
)

# Step 7: (Optional)
# Display the first few rows and the shape of the final wide-format table for verification.
# Uncomment the lines below to inspect the output.
# print(wide_table_4.head())
# print(wide_table_4.shape)

# 4. Combine all the aggregation tables

In [None]:
# Concatenate the four wide tables horizontally, aligned on their common key columns.
key_columns = ["School_SampleCollege", "AcYr"]
combined_table = pd.concat(
    [
        table.set_index(key_columns)
        for table in (wide_table_1, wide_table_2, wide_table_3, wide_table_4)
    ],
    axis=1,
)

# Reset index to convert 'School_SampleCollege' and 'AcYr' back to columns
combined_table.reset_index(inplace=True)

# Show (rows, columns) of the combined table. It is printed explicitly because a bare
# expression in the middle of a cell produces no output.
print(combined_table.shape)

# Drop duplicated columns: the 'Concurrent' (EnrollType) columns repeat the 'Conc'
# (AdmitType) columns, e.g. 'SampleCollege_Concurrent_African America' duplicates
# 'SampleCollege_Conc_African America' (for SDCCD: 'SDCCD_Concurrent_African America'
# duplicates 'SDCCD_Conc_African America').
# List of columns to drop
columns_to_drop = [
    'SampleCollege_Conc_Total', 'SampleCollege_Concurrent_African America',
    'SampleCollege_Concurrent_Asian', 'SampleCollege_Concurrent_Filipino',
    'SampleCollege_Concurrent_Latinx', 'SampleCollege_Concurrent_Multi-Ethnicity',
    'SampleCollege_Concurrent_Native American', 'SampleCollege_Concurrent_Pacific Islande',
    'SampleCollege_Concurrent_Unreported', 'SampleCollege_Concurrent_White'
]

# A column only exists when the data contains at least one student in that category,
# so some listed columns may be absent. Report them and drop the rest instead of
# stopping with a KeyError.
absent_columns = [col for col in columns_to_drop if col not in combined_table.columns]
if absent_columns:
    print(f"Columns not found and therefore not dropped: {absent_columns}")

# Drop the specified columns that are present
combined_table.drop(
    columns=[col for col in columns_to_drop if col in combined_table.columns],
    inplace=True,
)

# Rename the remaining concurrent total so it follows the 'Conc' naming
combined_table = combined_table.rename(columns={'SampleCollege_Concurrent_Total': 'SampleCollege_Conc_Total'})

# 5. Output Aggregated High School Data into an Excel File for Later Import into "RP_SpecialAdmit_Headcount_Combined.ipynb"

In [None]:
# Define the file path where you want your aggregated high school data to be stored.
# Replace "sampledrive\samplefolder\samplefile" with the correct path ("drive\folder\file").
# The literal is a raw string (r'...') so that backslashes in a Windows path are kept
# verbatim instead of being interpreted as escape sequences (e.g. "\n", "\t").
# NOTE: pandas picks the Excel writer from the file extension, so the real path
# should end in an Excel extension such as ".xlsx".
file_path = r'sampledrive\samplefolder\samplefile'

# Save the aggregated data to an Excel file (index=False omits the row index column)
combined_table.to_excel(file_path, index=False)

print(f"Combined data has been saved to: {file_path}")