# 1. Import Python Packages

In [6]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Import "Special Admit Foundation" Data

In [8]:
# Location of the "Special Admit Foundation" extract on the shared drive
file_path = r'G:\Shared\SS\From RPNet\Early College Credit\Special Admit Dashboard\SDCCD Special Admit Foundation.csv'

# Print True/False so it is obvious whether the path can be reached
print(os.path.exists(file_path))

# Load the extract into a DataFrame named "df_ecc_2324".
# The file is expected to be comma-separated; change `sep` if yours uses
# another delimiter. low_memory=False turns off chunked parsing so that each
# column's type is inferred from the whole file.
df_ecc_2324 = pd.read_csv(file_path, sep=',', low_memory=False)

# Preview the first rows to confirm the load worked
print(df_ecc_2324.head())

True


 ## Examine "Special Admit Foundation" Data

In [1]:
# Print a summary of the dataframe: every column with its non-null count and
# data type, followed by the overall memory usage.
df_ecc_2324.info()

In [13]:
# Show the dataframe's column labels (rendered as this cell's output).
df_ecc_2324.columns

In [None]:
# Print the frequency of every distinct value (NaN included) for the columns
# that drive the aggregations further down.
#
# Note on 'HighSchool': for SDCCD, missing values were found to be represented
# by a blank, "", or a single space.
for column_name in ('EnrollType', 'HighSchool', 'AcYr', 'Ethnicity'):
    print(df_ecc_2324[column_name].value_counts(dropna=False))

In [10]:
# Placeholder strings that stand for a missing high school: an empty string,
# a literal pair of double quotes, and a single space.
missing_markers = ['', '""', ' ']

# Relabel those placeholders as 'Unreported' (real NaN values are not strings
# and are left untouched by this replacement).
df_ecc_2324['HighSchool'] = df_ecc_2324['HighSchool'].replace(missing_markers, 'Unreported')

In [None]:
# Translate numeric college codes into college names
# (1 = City, 2 = Mesa, 3 = Miramar, 4 = SDCCE).
# SDCCD has several colleges and its extract stores the college as a number;
# skip this cell if that situation does not apply to your data.
college_mapping = {
    1: 'City',
    2: 'Mesa',
    3: 'Miramar',
    4: 'SDCCE',
}

# Codes found in the mapping are swapped for their name; any other value is kept as-is
college_labels = df_ecc_2324['College'].replace(college_mapping)
df_ecc_2324['College'] = college_labels


In [None]:
# Two-way frequency table: one row per 'HighSchool', one column per
# 'EnrollType', each cell counting the records with that combination.
school_by_enroll_type = pd.crosstab(
    index=df_ecc_2324['HighSchool'],
    columns=df_ecc_2324['EnrollType'],
)
print(school_by_enroll_type)

 ## Examine Overall Missing Data 

In [12]:
# Optional visual check of missing data across all variables.
# Set i_missing to 1 to draw the chart; leave it at 0 to skip plotting.
i_missing = 0

if i_missing == 1:
    # sns.displot is a figure-level function that builds its own figure, so no
    # plt.figure() call is made beforehand (it would only leave an empty extra
    # figure behind). The chart size is governed by displot's own
    # `height`/`aspect` arguments.
    #
    # isna().melt(...) produces one row per (variable, cell) with a boolean
    # "missing" flag; multiple="fill" normalises each variable's bar so it shows
    # the share of missing vs. present values.
    sns.displot(
        data=df_ecc_2324.isna().melt(value_name="missing"),
        y="variable",
        hue="missing",
        multiple="fill",
        aspect=1.25,
    )

# 3. Data Aggregation

## 3.1. Aggregate Students by 'HighSchool', 'AcYr', 'EnrollType'

In [14]:
# Step 1: keep only the rows that have a 'HighSchool' value
# (rows where 'HighSchool' is NaN are dropped)
filtered_data = df_ecc_2324.dropna(subset=['HighSchool'])

# Headcount (distinct 'Emplid') for every 'HighSchool' / 'AcYr' / 'EnrollType'
# combination, with the count column named 'HeadCount'
aggregated_data = (
    filtered_data
    .groupby(['HighSchool', 'AcYr', 'EnrollType'])
    .agg({'Emplid': 'nunique'})
    .reset_index()
    .rename(columns={'Emplid': 'HeadCount'})
)

# Reshape to wide format: one row per school/year, one column per 'EnrollType'
wide_table_1 = aggregated_data.pivot_table(
    index=['HighSchool', 'AcYr'],
    columns=['EnrollType'],
    values='HeadCount',
    fill_value=0,
)

# Headcount across all enrollment types together, stored in a column named 'SA'
temp = (
    filtered_data
    .groupby(['HighSchool', 'AcYr'])
    .agg({'Emplid': 'nunique'})
    .rename(columns={'Emplid': 'SA'})
)

# Place the per-type counts and the overall count side by side
wide_table_1 = pd.concat([wide_table_1, temp], axis=1)

# Column names take the form SDCCD_<EnrollType>_Total (and SDCCD_SA_Total)
wide_table_1.columns = [
    f"SDCCD_{''.join(col).strip()}_Total"
    for col in wide_table_1.columns.values
]

# Turn the index back into columns and rename 'HighSchool' to 'School_SDCCD'
# so the table can be joined with the other tables later
wide_table_1 = (
    wide_table_1
    .reset_index()
    .rename(columns={'HighSchool': 'School_SDCCD'})
)

# Show a preview and the table dimensions
print(wide_table_1.head())
print(wide_table_1.shape)


## 3.2. Aggregate students by 'HighSchool', 'AcYr', 'EnrollType', and 'Ethnicity'

In [22]:
# Headcount (distinct 'Emplid') for every 'HighSchool' / 'AcYr' / 'EnrollType' /
# 'Ethnicity' combination, with the count column named 'HeadCount'
aggregated_data = (
    filtered_data
    .groupby(['HighSchool', 'AcYr', 'EnrollType', 'Ethnicity'])
    .agg({'Emplid': 'nunique'})
    .reset_index()
    .rename(columns={'Emplid': 'HeadCount'})
)

# Reshape to wide format: one column per ('EnrollType', 'Ethnicity') pair
wide_table_2 = aggregated_data.pivot_table(
    index=['HighSchool', 'AcYr'],
    columns=['EnrollType', 'Ethnicity'],
    values='HeadCount',
    fill_value=0,
)
# Flatten the two-level column labels into SDCCD_<EnrollType>_<Ethnicity>
wide_table_2.columns = [
    f"SDCCD_{'_'.join(col).strip()}"
    for col in wide_table_2.columns.values
]

# Headcount by 'Ethnicity' for all enrollment types together,
# reshaped to one column per 'Ethnicity'
temp = (
    filtered_data
    .groupby(['HighSchool', 'AcYr', 'Ethnicity'])
    .agg({'Emplid': 'nunique'})
    .rename(columns={'Emplid': 'HeadCount'})
    .pivot_table(
        index=['HighSchool', 'AcYr'],
        columns='Ethnicity',
        values='HeadCount',
        fill_value=0,
    )
)
# These columns are labelled SDCCD_SA_<Ethnicity>
temp.columns = [
    f"SDCCD_SA_{''.join(col).strip()}"
    for col in temp.columns.values
]

# Place both sets of counts side by side, then turn the index back into columns
wide_table_2 = pd.concat([wide_table_2, temp], axis=1).reset_index()

# Rename 'HighSchool' to 'School_SDCCD' for the later join with the other tables
wide_table_2 = wide_table_2.rename(columns={'HighSchool': 'School_SDCCD'})

# Show a preview and the table dimensions
print(wide_table_2.head())
print(wide_table_2.shape)

## 3.3. Aggregate Students by 'HighSchool', 'AcYr', 'AdmitType'

In [24]:
# Headcount (distinct 'Emplid') for every 'HighSchool' / 'AcYr' / 'AdmitType'
# combination, with the count column named 'HeadCount'
aggregated_data = (
    filtered_data
    .groupby(['HighSchool', 'AcYr', 'AdmitType'])
    .agg({'Emplid': 'nunique'})
    .reset_index()
    .rename(columns={'Emplid': 'HeadCount'})
)

# Reshape to wide format: one row per school/year, one column per 'AdmitType'
wide_table_3 = aggregated_data.pivot_table(
    index=['HighSchool', 'AcYr'],
    columns=['AdmitType'],
    values='HeadCount',
    fill_value=0,
)

# Column names take the form SDCCD_<AdmitType>_Total
wide_table_3.columns = [
    'SDCCD_' + admit_type + '_Total'
    for admit_type in wide_table_3.columns
]

# Turn the index back into columns and rename 'HighSchool' to 'School_SDCCD'
# for the later join with the other tables
wide_table_3 = (
    wide_table_3
    .reset_index()
    .rename(columns={'HighSchool': 'School_SDCCD'})
)

# Show a preview and the table dimensions
print(wide_table_3.head())
print(wide_table_3.shape)


## 3.4. Aggregate Students by 'HighSchool', 'AcYr', 'AdmitType', 'Ethnicity'

In [26]:
# Headcount (distinct 'Emplid') for every 'HighSchool' / 'AcYr' / 'AdmitType' /
# 'Ethnicity' combination, with the count column named 'HeadCount'
aggregated_data = (
    filtered_data
    .groupby(['HighSchool', 'AcYr', 'AdmitType', 'Ethnicity'])
    .agg({'Emplid': 'nunique'})
    .reset_index()
    .rename(columns={'Emplid': 'HeadCount'})
)

# Reshape to wide format: one column per ('AdmitType', 'Ethnicity') pair
wide_table_4 = aggregated_data.pivot_table(
    index=['HighSchool', 'AcYr'],
    columns=['AdmitType', 'Ethnicity'],
    values='HeadCount',
    fill_value=0,
)

# Flatten the two-level column labels into SDCCD_<AdmitType>_<Ethnicity>
wide_table_4.columns = [
    f"SDCCD_{'_'.join(col).strip()}"
    for col in wide_table_4.columns.values
]

# Turn the index back into columns and rename 'HighSchool' to 'School_SDCCD'
wide_table_4 = (
    wide_table_4
    .reset_index()
    .rename(columns={'HighSchool': 'School_SDCCD'})
)

# Show a preview and the table dimensions
print(wide_table_4.head())
print(wide_table_4.shape)


# 4. Combine all the aggregation tables and output as one wide table into an excel file

In [28]:
# Join the four wide tables side by side, aligned on school and academic year
join_keys = ["School_SDCCD", "AcYr"]
combined_table = pd.concat(
    [
        table.set_index(join_keys)
        for table in (wide_table_1, wide_table_2, wide_table_3, wide_table_4)
    ],
    axis=1,
)

# Convert 'School_SDCCD' and 'AcYr' from the index back into regular columns
combined_table.reset_index(inplace=True)
# A bare `combined_table.shape` in the middle of a cell is never displayed,
# so print it explicitly (as the other cells do)
print(combined_table.shape)

# Columns that duplicate another column under a different spelling, e.g.
# 'SDCCD_Concurrent_African America' carries the same counts as
# 'SDCCD_Conc_African America', and so on
columns_to_drop = [
    'SDCCD_Conc_Total', 'SDCCD_Concurrent_African America',
    'SDCCD_Concurrent_Asian', 'SDCCD_Concurrent_Filipino',
    'SDCCD_Concurrent_Latinx', 'SDCCD_Concurrent_Multi-Ethnicity',
    'SDCCD_Concurrent_Native American', 'SDCCD_Concurrent_Pacific Islande',
    'SDCCD_Concurrent_Unreported', 'SDCCD_Concurrent_White'
]

# The pivoted columns only exist for combinations present in the data, so a
# listed column may legitimately be absent. Report such columns and skip them
# instead of aborting the export with a KeyError.
missing_columns = [col for col in columns_to_drop if col not in combined_table.columns]
if missing_columns:
    print(f"Columns not found in combined table (skipped): {missing_columns}")
combined_table.drop(columns=columns_to_drop, errors='ignore', inplace=True)

# Rename the remaining total column to the 'Conc' spelling
combined_table = combined_table.rename(columns={'SDCCD_Concurrent_Total': 'SDCCD_Conc_Total'})


# Destination of the Excel export (note: this reuses the name `file_path`,
# replacing the input CSV path defined earlier)
file_path = r'G:\Shared\SS\From RPNet\Early College Credit\Special Admit Dashboard\Working Files\SDCCD_SpecialAdmit_Headcount Aggregation Python Files\SDCCD_SpecialAdmit_Headcount_agg_HS.xlsx'

# Write the combined table to Excel without the row index
combined_table.to_excel(file_path, index=False)

print(f"Combined data has been saved to: {file_path}")