## Section 1 of this script loads in enrollment data for AY 2022-2023 and outputs the number of teacher preparation program enrollments by  campus by race and ethnicity.

## Section 2 of this script focuses on completers.

# Section 1

In [2]:
import os
os.getcwd()

'z:\\02 EdQ DataView and TPDM\\New Pipeline Data\\scripts'

In [58]:
os.chdir("Z:\\02 EdQ DataView and TPDM\\New Pipeline Data\\scripts")

In [3]:
# import pandas and pandasql libraries
import pandas as pd
from pandasql import sqldf

# Load in enrollment data

In [4]:
df_erss = pd.read_csv(
    "..\data\erss\ERSS_20223_20234profileSLASH_N_REMOVED_240123.csv",
    dtype={
        "erss_cred_stat": str,
        "erss_ethnic_old": str,
        "erss_cred_emph": str,
        "erss_spec_prog": str,
    },
)

  "..\data\erss\ERSS_20223_20234profileSLASH_N_REMOVED_240123.csv",


In [139]:
## AY 2021-2022
df_erss_21_22 = pd.read_csv(
    "..\data\erss\ERSS_20213_20222_221215.csv",
    dtype={
        "erss_cred_stat": str,
        "erss_ethnic_old": str,
        "erss_cred_emph": str,
        "erss_spec_prog": str,
    },
)

  "..\data\erss\ERSS_20213_20222_221215.csv",


In [5]:
# load in credential objective lookup table
df_lookup = pd.read_excel(
    "..\data\credential_objective_lookup\erss_cred_obj_lookup.xlsx",
    sheet_name="appendix_c_ir",
)

  "..\data\credential_objective_lookup\erss_cred_obj_lookup.xlsx",


In [6]:
# change datatype of erss_cred_obj to int so it can be joined on the erss table
df_lookup["erss_cred_obj"] = df_lookup["erss_cred_obj"].astype(int)

In [7]:
# load in campus names and campus codes
df_campus_codes = pd.read_excel("..\data\campus_codes\campus_codes_and_names.xlsx")

  df_campus_codes = pd.read_excel("..\data\campus_codes\campus_codes_and_names.xlsx")


---------

- filter down to Long Beach and SLO
- filter down to Fall and Spring semesters for Long Beach
- filter down to Fall, Winter, and Spring terms for SLO


- 1	Winter Quarter or Term
- 2	Spring Semester, Quarter, or Term
- 3	Summer Semester, Quarter, or Term
- 4	Fall Semester, Quarter, or Term
- 9	All Terms

- ers_year for 2022 has erss_term 3 and 4
- ers_year for 2023 has erss_term 1, 2, 3, and 4

WANT
- 2022 Fall: erss_year = 2022 & erss_term = 4
- 2023 Winter: erss_year = 2023 & erss_term =  1 
- 2023 Spring: erss_year = 2023 & erss_term =  2

## 2a: INTERN

In [144]:
# recreate intern only
## filter down to teacher credential enrollments
df_2022_fall_intern = df_erss[
    (df_erss["erss_year"] == 2022)
    & (df_erss["erss_term"] == 4)
    & (df_erss["erss_cred_stat"].isin(["8"]))
    & (df_erss["erss_stud_lev"] == 5)
]

In [143]:
# AY 2021-2022
# recreate intern only
## filter down to teacher credential enrollments
df_2021_fall_intern = df_erss_21_22[
    (df_erss_21_22["erss_year"] == 2021)
    & (df_erss_21_22["erss_term"] == 4)
    & (df_erss_21_22["erss_cred_stat"].isin(["8"]))
    & (df_erss_21_22["erss_stud_lev"] == 5)
]

In [148]:
len(df_2022_fall_intern)

738

In [149]:
len(df_2021_fall_intern)

744

In [145]:
# recreate intern only
## filter down to teacher credential enrollments
df_2023_winter_spring_intern = df_erss[
    (df_erss["erss_year"] == 2023)
    & (df_erss["erss_term"].isin([1,2]))
    & (df_erss["erss_cred_stat"].isin(["8"]))
    & (df_erss["erss_stud_lev"] == 5)
]

In [146]:
# AY 2021-2022
# recreate intern only
## filter down to teacher credential enrollments
df_2022_winter_spring_intern = df_erss_21_22[
    (df_erss_21_22["erss_year"] == 2022)
    & (df_erss_21_22["erss_term"].isin([1,2]))
    & (df_erss_21_22["erss_cred_stat"].isin(["8"]))
    & (df_erss_21_22["erss_stud_lev"] == 5)
]

In [147]:
len(df_2023_winter_spring_intern)

720

In [150]:
len(df_2022_winter_spring_intern)

764

In [68]:
# combine intern df's together
df_22_23_intern = sqldf(
    """
    SELECT *
    FROM df_2022_fall_intern
    UNION
    SELECT * FROM df_2023_winter_spring_intern
"""
)

In [151]:
# AY 2021-2022
# combine intern df's together
df_21_22_intern = sqldf(
    """
    SELECT *
    FROM df_2021_fall_intern
    UNION
    SELECT * FROM df_2022_winter_spring_intern
"""
)

In [152]:
print(len(df_22_23_intern))
len(df_21_22_intern)

966


1508

In [70]:
# Note: Dropping duplicates can be problematic here because records with missing SSN's get the same masked calstateEduPersonUID
df_22_23_intern.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)


In [154]:
# AY 2021-2022
df_21_22_intern.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)


In [156]:
print(len(df_22_23_intern))
len(df_21_22_intern)
# Next: Do the same for AY 2021-2022 for part 2d

966


991

## 2d: POSTBACC AND GRADUATES IN REGULAR PROGRAMS

In [73]:
## Postbacc and graduates
## filter down to teacher credential enrollments

df_2022_fall_postbacc = df_erss[
    (df_erss["erss_year"] == 2022)
    & (df_erss["erss_term"] == 4)
    & (df_erss["erss_stud_lev"].isin([5]))
    & (df_erss["erss_cred_stat"].isin(["4","5", "6", "V", "H", "I", "J", "K"]))
    & (df_erss["erss_stud_stand"].isin(["C", "5", "1", "2", "3", "6", "7", "8"]))
]
len(df_2022_fall_postbacc)

11083

In [157]:
# AY 2021-2022
## Postbacc and graduates
## filter down to teacher credential enrollments

df_2021_fall_postbacc = df_erss_21_22[
    (df_erss_21_22["erss_year"] == 2021)
    & (df_erss_21_22["erss_term"] == 4)
    & (df_erss_21_22["erss_stud_lev"].isin([5]))
    & (df_erss_21_22["erss_cred_stat"].isin(["4","5", "6", "V", "H", "I", "J", "K"]))
    & (df_erss_21_22["erss_stud_stand"].isin(["C", "5", "1", "2", "3", "6", "7", "8"]))
]
len(df_2021_fall_postbacc)

12243

In [74]:
## Postbacc and graduates
## filter down to teacher credential enrollments

df_2023_winter_spring_postbacc = df_erss[
    (df_erss["erss_year"] == 2023)
    & (df_erss["erss_term"].isin([1,2]))
    & (df_erss["erss_stud_lev"].isin([5]))
    & (df_erss["erss_cred_stat"].isin(["4","5", "6", "V", "H", "I", "J", "K"]))
    & (df_erss["erss_stud_stand"].isin(["C", "5", "1", "2", "3", "6", "7", "8"]))
]
len(df_2023_winter_spring_postbacc)

9980

In [158]:
# AY 2021-2022
## Postbacc and graduates
## filter down to teacher credential enrollments

df_2022_winter_spring_postbacc = df_erss_21_22[
    (df_erss_21_22["erss_year"] == 2022)
    & (df_erss_21_22["erss_term"].isin([1,2]))
    & (df_erss_21_22["erss_stud_lev"].isin([5]))
    & (df_erss_21_22["erss_cred_stat"].isin(["4","5", "6", "V", "H", "I", "J", "K"]))
    & (df_erss_21_22["erss_stud_stand"].isin(["C", "5", "1", "2", "3", "6", "7", "8"]))
]
len(df_2022_winter_spring_postbacc)

11091

In [75]:
df_2022_fall_postbacc.erss_cred_stat.value_counts()

erss_cred_stat
5    10166
V      437
4      420
6       60
Name: count, dtype: int64

In [159]:
df_2021_fall_postbacc.erss_cred_stat.value_counts()

erss_cred_stat
5    11349
4      456
V      408
6       30
Name: count, dtype: int64

In [76]:
# check MS, ES, SS counts
# left join the program type onto the main DataFrame
df_2022_2023_merge_test = pd.merge(
    df_2022_fall_postbacc,
    df_lookup,
    left_on="erss_cred_obj",
    right_on="erss_cred_obj",
    how="left",
)
len(df_2022_2023_merge_test)

11083

In [77]:
df_2022_2023_merge_test["code_value"].value_counts()

code_value
MS       4344
SS       3544
Other    1623
ES       1572
Name: count, dtype: int64

In [78]:
(4344+3544+1572)/0.89

10629.213483146068

In [79]:
# combine postbacc df's together
df_22_23_postbacc = sqldf(
    """
    SELECT *
    FROM df_2022_fall_postbacc
    UNION
    SELECT * FROM df_2023_winter_spring_postbacc
"""
)
len(df_22_23_postbacc)

21063

In [160]:
# AY 2021-2022
# combine postbacc df's together
df_21_22_postbacc = sqldf(
    """
    SELECT *
    FROM df_2021_fall_postbacc
    UNION
    SELECT * FROM df_2022_winter_spring_postbacc
"""
)
len(df_21_22_postbacc)

23334

In [80]:
df_22_23_postbacc.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)
len(df_22_23_postbacc)

13828

In [161]:
df_21_22_postbacc.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)
len(df_21_22_postbacc)

14326

----------

# Part 3: Combine 2a: Intern and 2d: Postbacc and Graduates in regular programs

In [81]:
# combine A) INTERN and D) postbacc/graduate table together
# Why? Because this will only include (df["erss_stud_lev"].isin([5])) aka those NOT in undergraduate programs.
df_2022_2023 = sqldf(
    """
    SELECT *
    FROM df_22_23_intern
    UNION
    SELECT * FROM df_22_23_postbacc
"""
)
len(df_2022_2023)

14794

In [82]:
df_2022_2023.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)
len(df_2022_2023)

14608

In [83]:
# left join the program type onto the main DataFrame
df_2022_2023_merge = pd.merge(
    df_2022_2023,
    df_lookup,
    left_on="erss_cred_obj",
    right_on="erss_cred_obj",
    how="left",
)
len(df_2022_2023_merge)

14608

In [162]:
df_2022_2023_merge["code_value"].value_counts()

code_value
MS       5920
SS       4595
ES       2168
Other    1925
Name: count, dtype: int64

In [85]:
df_2022_2023_merge["erss_stud_lev"].value_counts()

erss_stud_lev
5    14608
Name: count, dtype: int64

## 4a: Load in IPEDS Race Lookup table

In [86]:
# read in ipeds lookup table
ipeds_race_df = pd.read_excel("..\data\ipeds_race_lookup\ipeds_race_lookup.xlsx")

  ipeds_race_df = pd.read_excel("..\data\ipeds_race_lookup\ipeds_race_lookup.xlsx")


In [87]:
# join race descriptions onto the DataFrame
df_2022_2023_race = pd.merge(
    df_2022_2023_merge,
    ipeds_race_df,
    left_on="erss_ipeds_race_catg",
    right_on="ipeds_value",
    how="left",
)

In [88]:
# left join the campus names onto the main DataFrame

df_2022_2023_campus_names = pd.merge(
    df_2022_2023_race,
    df_campus_codes,
    left_on="erss_campus",
    right_on="campus_code",
    how="left",
)

# Part 3: For AY 2021-2022 Combine 2a: Intern and 2d: Postbacc and Graduates in regular programs

In [163]:
# combine A) INTERN and D) postbacc/graduate table together
# Why? Because this will only include (df["erss_stud_lev"].isin([5])) aka those NOT in undergraduate programs.
df_2021_2022 = sqldf(
    """
    SELECT *
    FROM df_21_22_intern
    UNION
    SELECT * FROM df_21_22_postbacc
"""
)
len(df_2021_2022)

15317

In [164]:
df_2021_2022.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)
len(df_2021_2022)

15129

In [165]:
# left join the program type onto the main DataFrame
df_2021_2022_merge = pd.merge(
    df_2021_2022,
    df_lookup,
    left_on="erss_cred_obj",
    right_on="erss_cred_obj",
    how="left",
)
len(df_2021_2022_merge)

15129

In [166]:
df_2021_2022_merge["code_value"].value_counts()

code_value
MS       5774
SS       4975
ES       2551
Other    1829
Name: count, dtype: int64

In [167]:
df_2021_2022_merge["erss_stud_lev"].value_counts()

erss_stud_lev
5    15129
Name: count, dtype: int64

## 4a: Load in IPEDS Race Lookup table

In [None]:
# read in ipeds lookup table
ipeds_race_df = pd.read_excel("..\data\ipeds_race_lookup\ipeds_race_lookup.xlsx")

  ipeds_race_df = pd.read_excel("..\data\ipeds_race_lookup\ipeds_race_lookup.xlsx")


In [168]:
# join race descriptions onto the DataFrame
df_2021_2022_race = pd.merge(
    df_2021_2022_merge,
    ipeds_race_df,
    left_on="erss_ipeds_race_catg",
    right_on="ipeds_value",
    how="left",
)

In [169]:
# left join the campus names onto the main DataFrame

df_2021_2022_campus_names = pd.merge(
    df_2021_2022_race,
    df_campus_codes,
    left_on="erss_campus",
    right_on="campus_code",
    how="left",
)

-----

# 5: Determine enrollment rate

- want total enrollments for this campus
- want total applications for this campus for the same years and terms

In [89]:
# filter down to MS, SS, ES only

df_2022_2023_campus_names = sqldf(
    """
    SELECT *
    FROM df_2022_2023_campus_names
    WHERE code_value IN ('MS', 'SS', 'ES')
"""
)
print(len(df_2022_2023_campus_names))

12683


In [90]:
df_2022_2023_campus_names.columns

Index(['calstateEduPersonUID', 'erss_year', 'erss_term', 'erss_campus',
       'erss_birth_date', 'erss_sex', 'erss_ethnic_old', 'erss_cit',
       'erss_country', 'erss_res', 'erss_res_stat', 'erss_inst_orig',
       'erss_matric_per', 'erss_adm_basis_old', 'erss_enroll_stat',
       'erss_stud_lev', 'erss_deg_obj', 'erss_conc', 'erss_cred_stat',
       'erss_cred_obj', 'erss_deg_held', 'erss_stud_stand', 'erss_transf_gpa',
       'erss_campus_gpa', 'erss_total_ue', 'erss_total_gpa', 'erss_eop_stat',
       'erss_dss', 'erss_dss_prog', 'erss_cip', 'erss_cred_emph',
       'erss_tua_ld', 'erss_tua_ud', 'erss_tua_gd', 'erss_hs_gpa',
       'erss_imm_yr', 'erss_spec_prog', 'erss_start_date_CST', 'erss_sufw',
       'erss_matric_type', 'erss_tua_pc', 'erss_mil_stat', 'erss_hl_stat',
       'erss_hl_catg', 'erss_multi_race_catg', 'erss_ipeds_race_catg',
       'erss_emplid', 'erss_major', 'erss_cur_mil_stat', 'erss_mil_dep_stat',
       'erss_ccc_control', 'erss_cum_ue_campus', 'erss_deg_p

In [133]:
df_2022_2023_campus_names.campus_name.value_counts()

campus_name
CalStateTEACH      1534
Long Beach         1467
Northridge         1091
Los Angeles         910
San Bernardino      776
Fullerton           731
Fresno              647
San Jose            638
San Diego           537
Stanislaus          515
Bakersfield         514
Pomona              503
San Francisco       466
Dominguez Hills     454
Sacramento          387
Chico               298
East Bay            244
Sonoma              228
San Marcos          197
Monterey Bay        176
San Luis Obispo     142
Channel Islands     124
Humboldt            104
Name: count, dtype: int64

# From here, you can get counts for all campuses - use this for the CTEPP report.

In [92]:
# race by campus
sql_all_df = sqldf(
    """
    SELECT 
        campus_name,
        race_description,
        COUNT(calstateEduPersonUID) as total_records
        FROM df_2022_2023_campus_names
        GROUP BY 
            campus_name,
            race_description
        ORDER BY campus_name
            """
)

In [93]:
sql_all_df.head()

Unnamed: 0,campus_name,race_description,total_records
0,Bakersfield,American Indian or Alaska Native,5
1,Bakersfield,Asian,10
2,Bakersfield,Black or African American,16
3,Bakersfield,Hispanic/Latino,343
4,Bakersfield,Two or More Races,11


In [94]:
sql_all_df.to_csv("..\data\output\ctepp_enrollment_22_23.csv", index = False)

  sql_all_df.to_csv("..\data\output\ctepp_enrollment_22_23.csv", index = False)


In [95]:
# not used: label cohort 1 and cohort 2
cohort_1 = ['Bakersfield','Humboldt','Northridge','San Luis Obispo']
cohort_2 = ['Chico','Los Angeles','Long Beach','Pomona','Sacramento','Stanislaus']

In [96]:
# filter down to the two campuses of interest
df_LB_SLO = df_2022_2023_campus_names[
    (df_2022_2023_campus_names["campus_name"].isin(["Long Beach", "San Luis Obispo"]))
]
len(df_LB_SLO)

1609

In [97]:
df_LB_SLO.campus_name.value_counts()

campus_name
Long Beach         1467
San Luis Obispo     142
Name: count, dtype: int64

In [98]:
# race by campus
sql_df = sqldf(
    """
    SELECT 
        campus_name,
        race_description,
        COUNT(calstateEduPersonUID) as total_records
        FROM df_LB_SLO
        GROUP BY 
            campus_name,
            race_description
        ORDER BY campus_name
            """
)

In [99]:
sql_df

Unnamed: 0,campus_name,race_description,total_records
0,Long Beach,American Indian or Alaska Native,2
1,Long Beach,Asian,217
2,Long Beach,Black or African American,37
3,Long Beach,Hispanic/Latino,735
4,Long Beach,Native Hawaiian or Other Pacific Islander,1
5,Long Beach,Two or More Races,56
6,Long Beach,Unknown,79
7,Long Beach,White,340
8,San Luis Obispo,Asian,4
9,San Luis Obispo,Hispanic/Latino,51


---------

# Section 2

- **Number of candidates** can be determined by the number of new enrollments by campus by race/ethnicity in AY 2022-2023.
    - Add this to the candidates count
- When dropping duplicates, did it keep the newest or oldest record? To avoid this, filter by the new enrollment record first, THEN drop duplicates.
- **Number of completers** can be determined by counting the number of completer records by campus by race/ethnicity in the completer records.

### Intern students

In [100]:
# combine intern df's together
# filter to new students only using "WHERE erss_enroll_stat IN (4,5)"
df_22_23_intern_new = sqldf(
    """
    SELECT *
    FROM df_2022_fall_intern
    WHERE erss_enroll_stat IN (4,5)
    UNION
    SELECT * FROM df_2023_winter_spring_intern
    WHERE erss_enroll_stat IN (4,5)
"""
)

In [101]:
df_22_23_intern_new.erss_enroll_stat.value_counts()

erss_enroll_stat
5    80
Name: count, dtype: int64

In [102]:
# drop duplicates
df_22_23_intern_new.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)
len(df_22_23_intern_new)


80

### Postbacc students

In [103]:
# combine postbacc df's together
df_22_23_postbacc_new = sqldf(
    """
    SELECT *
    FROM df_2022_fall_postbacc
    WHERE erss_enroll_stat IN (4,5)
    UNION
    SELECT * FROM df_2023_winter_spring_postbacc
    WHERE erss_enroll_stat IN (4,5)
"""
)
len(df_22_23_postbacc_new)

6648

In [104]:
df_22_23_postbacc_new.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)
len(df_22_23_postbacc)

13828

### Combine intern and postbacc records

In [105]:
# combine A) INTERN and D) postbacc/graduate table together
# Why? Because this will only include (df["erss_stud_lev"].isin([5])) aka those NOT in undergraduate programs.
df_2022_2023_new = sqldf(
    """
    SELECT *
    FROM df_22_23_intern_new
    UNION
    SELECT * FROM df_22_23_postbacc_new
"""
)
print(len(df_2022_2023_new))
df_2022_2023_new.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)
print(len(df_2022_2023_new))

6697
6695


In [106]:
# left join the program type onto the main DataFrame
df_2022_2023_merge_new = pd.merge(
    df_2022_2023_new,
    df_lookup,
    left_on="erss_cred_obj",
    right_on="erss_cred_obj",
    how="left",
)
len(df_2022_2023_merge_new)

6695

In [107]:
df_2022_2023_merge_new["code_value"].value_counts()

code_value
MS       2721
SS       2429
Other     790
ES        755
Name: count, dtype: int64

In [108]:
df_2022_2023_merge_new["erss_stud_lev"].value_counts()

erss_stud_lev
5    6695
Name: count, dtype: int64

## 4a: Load in IPEDS Race Lookup table

In [109]:
# read in ipeds lookup table
ipeds_race_df = pd.read_excel("..\data\ipeds_race_lookup\ipeds_race_lookup.xlsx")

  ipeds_race_df = pd.read_excel("..\data\ipeds_race_lookup\ipeds_race_lookup.xlsx")


In [110]:
# join race descriptions onto the DataFrame
df_2022_2023_race = pd.merge(
    df_2022_2023_merge_new,
    ipeds_race_df,
    left_on="erss_ipeds_race_catg",
    right_on="ipeds_value",
    how="left",
)

In [111]:
# left join the campus names onto the main DataFrame

df_2022_2023_campus_names_new = pd.merge(
    df_2022_2023_race,
    df_campus_codes,
    left_on="erss_campus",
    right_on="campus_code",
    how="left",
)

In [112]:
# filter down to MS, SS, ES only

df_2022_2023_final = sqldf(
    """
    SELECT *
    FROM df_2022_2023_campus_names_new
    WHERE code_value IN ('MS', 'SS', 'ES')
"""
)
print(len(df_2022_2023_final))

5905


In [113]:
# use this for all campuses candidate counts
# these are new students at each campus
# race by campus
sql_all_df_new_candidates = sqldf(
    """
    SELECT 
        campus_name,
        race_description,
        COUNT(calstateEduPersonUID) as total_records
        FROM df_2022_2023_final
        GROUP BY 
            campus_name,
            race_description
        ORDER BY campus_name
            """
)

In [114]:
sql_all_df_new_candidates.to_csv("..\data\output\ctepp_new_candidates.csv", index = False)

  sql_all_df_new_candidates.to_csv("..\data\output\ctepp_new_candidates.csv", index = False)


In [115]:
# race by campus LB and SLO only
sql_new_candidates_LB_SLO = sqldf(
    """
    SELECT 
        campus_name,
        race_description,
        total_records as new_candidates
        FROM sql_all_df_new_candidates
        WHERE campus_name IN ('Long Beach','San Luis Obispo')
    """
)


In [116]:
sql_new_candidates_LB_SLO

Unnamed: 0,campus_name,race_description,new_candidates
0,Long Beach,American Indian or Alaska Native,2
1,Long Beach,Asian,104
2,Long Beach,Black or African American,24
3,Long Beach,Hispanic/Latino,357
4,Long Beach,Two or More Races,31
5,Long Beach,Unknown,44
6,Long Beach,White,159
7,San Luis Obispo,Asian,2
8,San Luis Obispo,Hispanic/Latino,24
9,San Luis Obispo,Two or More Races,2


In [117]:
# next steps:
# [Complete] modify query for 1 and 2 to only include MS, ES, and SS students
# [Complete] add these new enrollments numbers to the Number of Candidates numbers
# calculate completers by race/ethnicity for AY 2022/2023 and add them to the Number of Completers numbers
# recalculate completion rate

# time to completion can be calculated using matriculation date and weighting the completers for AY 2022/2023.

# Section 3: Completers
- connect completers records with enrollment records to get
    - race/ethnicity
    - matriculation period

In [179]:
# read in AY 2022-2023
df_22_23 = pd.read_csv("..\\data\\completer lists\\EDQ_COMB_COMP_2223_120424.csv")
print(len(df_22_23))

5169


In [180]:
# drop unneeded columns

df_22_23 = df_22_23.drop(columns = [
    'Personal Non-CSU Email',
    'Other Email',
    'Phone Number',
    'Noyce Grant Recipients',
    'Noyce Code',
    'campus_name',
    'campus_code'
])

### Join enrollment onto completers

In [181]:
# combine the AY 2021-2022 and AY 2022-2023 in an attempt to get more CalStateEduPersonUID to match on.
df_2021_2023_enrollment = sqldf(
    """
    SELECT *
    FROM df_2022_2023_campus_names
    UNION
    SELECT * FROM df_2021_2022_campus_names
"""
)
len(df_2021_2023_enrollment)


27812

In [182]:
df_2021_2023_enrollment.drop_duplicates(subset=['calstateEduPersonUID'], inplace = True)
len(df_2021_2023_enrollment)

22600

In [None]:
# Data issue: San Diego is not matching many completers with enrollments, compared to all other campuses.

In [183]:
df_2021_2023_enrollment.campus_name.value_counts()

campus_name
Long Beach         2412
CalStateTEACH      2212
Northridge         1856
Fullerton          1482
San Bernardino     1445
Los Angeles        1361
San Jose           1354
San Diego          1202
Fresno             1106
Bakersfield         987
San Francisco       977
Stanislaus          778
Sacramento          749
Dominguez Hills     746
Pomona              732
East Bay            664
Chico               542
San Marcos          410
Sonoma              406
Monterey Bay        393
San Luis Obispo     297
Channel Islands     282
Humboldt            207
Name: count, dtype: int64

In [138]:
# use the enrollment df for all enrollments for AY 2022-23

# 96 is CalStateTEACH
# there are 20 completers who may not match with df_2021_new_race because the SSN was blank
# after joining these tables, 17 records were duplicated because they were found in 2 different completer lists
# 1 student, DEDF71281814529, showed an enrollment at 2 campuses, but only completed at Long Beach that year.
# business rule: Only keep the first record (earliest record) of completion
## What does this mean? This means that some campuses may have included a completer in both the 2021-2022 AND 2022-2023 lists.
# If someone completed multiple programs, we look at their first completed program for the 2-year completion rate.
df_all = sqldf(
    """
    SELECT *
    FROM df_22_23 AS a
    LEFT JOIN df_2022_2023_campus_names AS b
    USING(calstateEduPersonUID)
    """
)

In [184]:
# use the enrollment df for all enrollments for AY 2021-2022 AND AY 2022-23

# 96 is CalStateTEACH
# there are 20 completers who may not match with df_2021_new_race because the SSN was blank
# after joining these tables, 17 records were duplicated because they were found in 2 different completer lists
# 1 student, DEDF71281814529, showed an enrollment at 2 campuses, but only completed at Long Beach that year.
# business rule: Only keep the first record (earliest record) of completion
## What does this mean? This means that some campuses may have included a completer in both the 2021-2022 AND 2022-2023 lists.
# If someone completed multiple programs, we look at their first completed program for the 2-year completion rate.
df_all_21_23 = sqldf(
    """
    SELECT *
    FROM df_22_23 AS a
    LEFT JOIN df_2021_2023_enrollment AS b
    USING(calstateEduPersonUID)
    """
)

In [185]:
# Including AY 2021-2022 records yieled about 300 more matches.
# Including summer terms did not yield a significant increase in matches.
df_all_21_23.campus_name.value_counts().sum()

4301

In [174]:
df_all_21_23.to_csv("..\data\output\ctepp_completers_21_23_enrollment.csv", index = False)

  df_all_21_23.to_csv("..\data\output\ctepp_completers_21_23_enrollment.csv", index = False)


In [121]:
len(df_all)
len

5169

In [122]:
def csv_reader(file_name):
  for row in open(file_name, 'r'):
    yield row

csv_gen = csv_reader("..\\data\\output\\ctepp_completers.csv")
row_count = 0

for row in csv_gen:
  row_count += 1

print(f"Row count is : {row_count}")
csv_gen.close()

Row count is : 5171


In [123]:
csv_gen2 = csv_reader("..\\data\\output\\ctepp_completers.csv")
row_count = 0

for row in csv_gen2:
  row_count += 1

print(f"Row count is : {row_count}")
csv_gen2.close()

Row count is : 5171


In [124]:
df_all.to_csv("..\data\output\ctepp_completers.csv", index = False)

  df_all.to_csv("..\data\output\ctepp_completers.csv", index = False)


In [125]:
df_all.campus_name.value_counts().sum()

4003

In [126]:
# LB: 509 completers --> 103 unmatched completers
# SLO: 120 completers --> 0 unmatched completers
df_all.campus_name.value_counts()

campus_name
Fullerton          411
Long Beach         406
Fresno             382
San Jose           302
Northridge         256
Bakersfield        255
Sacramento         240
Los Angeles        218
Chico              212
San Bernardino     204
San Francisco      139
Pomona             132
San Marcos         131
Stanislaus         130
Sonoma             129
San Luis Obispo    120
Dominguez Hills    112
East Bay            91
Humboldt            68
Monterey Bay        65
Name: count, dtype: int64

In [127]:
df_all.drop(labels= ['Campus ID Number', 'Campus Letter Code'], axis = 1, inplace = True)

In [128]:
# race by campus
completers_df = sqldf(
    """
    SELECT 
        campus_name,
        race_description,
        COUNT(calstateEduPersonUID) as total_records
        FROM df_all
        WHERE campus_name IN ('Long Beach', 'San Luis Obispo')
        GROUP BY 
            campus_name,
            race_description
        ORDER BY campus_name
            """
)

In [129]:
completers_df

Unnamed: 0,campus_name,race_description,total_records
0,Long Beach,Asian,61
1,Long Beach,Black or African American,6
2,Long Beach,Hispanic/Latino,211
3,Long Beach,Two or More Races,18
4,Long Beach,Unknown,14
5,Long Beach,White,96
6,San Luis Obispo,Asian,3
7,San Luis Obispo,Hispanic/Latino,40
8,San Luis Obispo,Two or More Races,1
9,San Luis Obispo,Unknown,5


- Time to completion was calculated in Excel using an estimated completion date of 1/1/2023.
- Outliers removed:
    - Time to completion over 9 years
    - Time to completion under 0.5 years

In [134]:
# dataframe for all records by race for all campuses
# race by campus
completers_df_all = sqldf(
    """
    SELECT 
        campus_name,
        race_description,
        COUNT(calstateEduPersonUID) as total_records
        FROM df_all
        GROUP BY 
            campus_name,
            race_description
        ORDER BY campus_name
            """
)

In [135]:
completers_df_all

Unnamed: 0,campus_name,race_description,total_records
0,,,1166
1,Bakersfield,American Indian or Alaska Native,2
2,Bakersfield,Asian,6
3,Bakersfield,Black or African American,4
4,Bakersfield,Hispanic/Latino,168
...,...,...,...
123,Stanislaus,Asian,9
124,Stanislaus,Hispanic/Latino,70
125,Stanislaus,Two or More Races,2
126,Stanislaus,Unknown,2


In [None]:
# consider matching 2022-2023 completers with 2 years of enrollment data, AND/OR include the summer session for enrollment.