In [143]:
# Business logic applied to ERSS file:
# We take a distinct count of a primary key based on an obfuscated SSN.
# 0. Deduplicate the records based on calStateEduPersonUID (there doesn't seem to be any duplicates based on ID, Year, and Term.)
# 1. erss_campus NOT IN (7, 97, 93): This includes all campuses including CalStateTEACH with an educator preparation program
# 2. erss_cred_stat IN ('4','5','6','8', 'V'): This identifies students in a teaching credential program
# 3. erss_year = 2021: Chosen to align with erss year 2021 in the Teaching Credentials Enrollment Dashboard
# 4. erss_term = 4: Chosen to align with the Fall term in the Teaching Credentials Enrollment Dashboard
# 5. erss_cred_obj mapped to SourceCode (MS, ES, and SS flag) using the "Credential and Subject Matter Waiver Objective" data dictionary definition


In [144]:
# import pandas
import pandas as pd

In [145]:
# read in raw data
df = pd.read_csv("..\data\erss\ERSS_20213_20222_221215.csv",
dtype={'erss_cred_stat': str, 'erss_ethnic_old': str, 'erss_cred_emph': str, 'erss_spec_prog' :str})

In [146]:
# confirm that only CSU campuses are in the dataset

# create a DataFrame with CSU Name and CSU Campus Code
# Dict = dict([(1, 'Geeks'), (2, 'For')])

csu_campus_dictionary = dict(
    [
('35' ,'Bakersfield'),
('73' ,'Channel Islands'),
('20' ,'Chico'),
('55' ,'Dominguez Hills'),
('05' ,'East Bay'),
('25' ,'Fresno'),
('50' ,'Fullerton'),
('30' ,'Humboldt'),
('40' ,'Long Beach'),
('45' ,'Los Angeles'),
('06' ,'Monterey Bay'),
('70' ,'Northridge'),
('10' ,'Pomona'),
('60' ,'Sacramento'),
('63' ,'San Bernardino'),
('65' ,'San Diego'),
('75' ,'San Francisco'),
('80' ,'San Jose'),
('15' ,'San Luis Obispo'),
('68' ,'San Marcos'),
('85' ,'Sonoma'),
('90' ,'Stanislaus'),
('96' ,'CalStateTEACH')
    ]
)

In [147]:
csu_campus_dictionary 

{'35': 'Bakersfield',
 '73': 'Channel Islands',
 '20': 'Chico',
 '55': 'Dominguez Hills',
 '05': 'East Bay',
 '25': 'Fresno',
 '50': 'Fullerton',
 '30': 'Humboldt',
 '40': 'Long Beach',
 '45': 'Los Angeles',
 '06': 'Monterey Bay',
 '70': 'Northridge',
 '10': 'Pomona',
 '60': 'Sacramento',
 '63': 'San Bernardino',
 '65': 'San Diego',
 '75': 'San Francisco',
 '80': 'San Jose',
 '15': 'San Luis Obispo',
 '68': 'San Marcos',
 '85': 'Sonoma',
 '90': 'Stanislaus',
 '96': 'CalStateTEACH'}

In [148]:
# create DataFrame from the dictionary

# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html

csu_campus_name_df = pd.DataFrame.from_dict(csu_campus_dictionary, orient='index',
                       columns=['campus_name'])

In [149]:
csu_campus_name_df.reset_index()
csu_campus_name_df.rename(columns={"index": "campus_code"}).reset_index()
csu_campus_name_df.rename(columns={"index": "campus_code"})

Unnamed: 0,campus_name
35,Bakersfield
73,Channel Islands
20,Chico
55,Dominguez Hills
5,East Bay
25,Fresno
50,Fullerton
30,Humboldt
40,Long Beach
45,Los Angeles


In [150]:
df_2021_fall = df[(df['erss_year'] == 2021) &
 (df['erss_term'] == 4) &
 (df['erss_cred_stat'].isin(['4', '5', '6', '8', 'V']))]

In [151]:
# load in credential objective lookup table
df_lookup = pd.read_excel("..\data\credential_objective_lookup\erss_cred_obj_lookup.xlsx", sheet_name = "edq_version")

In [152]:
# load in campus names and campus codes
df_campus_codes = pd.read_excel("..\data\campus_codes\campus_codes_and_names.xlsx")

In [153]:
df_lookup['erss_cred_obj']=df_lookup['erss_cred_obj'].astype(int)

In [154]:
# left join the program type onto the main DataFrame
df_2021_combined = pd.merge(df_2021_fall, df_lookup, left_on = "erss_cred_obj", right_on = "erss_cred_obj",how = 'left')

In [155]:
df_2021_combined["code_value"].value_counts()

MS    5327
SS    4384
ES    2323
Name: code_value, dtype: int64

In [156]:
# left join the campus names onto the main DataFrame
df_2021_combined_names = pd.merge(df_2021_combined, df_campus_codes, left_on = "erss_campus", right_on = "campus_code",how = 'left')

In [157]:
df_2021_combined_names_MS_SS_ES_only = df_2021_combined_names[df_2021_combined_names["code_value"].isin(['MS','SS','ES'])]

In [158]:
df_2021_combined_names_MS_SS_ES_only.to_csv("..\data\output\erss_2021_2022.csv")

In [159]:
# look at ES only
df_es_only = df_2021_combined_names_MS_SS_ES_only[(df_2021_combined_names_MS_SS_ES_only["code_value"] == 'ES')]
len(df_es_only)

2323

In [186]:
# 400, 436, 481. 482, 483, 484, 485, 486, 487, 490
df_es_only.erss_cred_obj.value_counts()
# 410, 415, 430 (early childhood), 463 

481.0    1340
482.0     528
436.0     338
483.0      63
485.0      34
410.0       4
490.0       4
415.0       3
430.0       2
484.0       1
463.0       1
Name: erss_cred_obj, dtype: int64

In [187]:
1340+528+338+63+35+4+1

2309

In [163]:
len(df_2021_combined_names_MS_SS_ES_only)

12034

In [164]:
df_2021_combined_names_MS_SS_ES_only.erss_cred_stat.value_counts()

5    11002
8      724
4      194
V       96
6       18
Name: erss_cred_stat, dtype: int64

In [165]:
# Ethnicity
df_2021_combined_names_MS_SS_ES_only.erss_ipeds_race_catg.value_counts()

7    5955
1    3635
4    1162
8     536
6     395
2     310
3      28
5      13
Name: erss_ipeds_race_catg, dtype: int64

In [166]:
df_2021_combined_names_MS_SS_ES_only.erss_sex.value_counts()

F    8926
M    3097
N      11
Name: erss_sex, dtype: int64

In [167]:
df_ms_ss_only.erss_enroll_stat.value_counts()

1    4969
5    4148
2     452
4     129
3      13
Name: erss_enroll_stat, dtype: int64

In [174]:
# take out ES students, then look at first-time numbers
# look at ES only
df_ms_ss_only = df_2021_combined_names_MS_SS_ES_only[(df_2021_combined_names_MS_SS_ES_only["code_value"] == 'SS') | (df_2021_combined_names_MS_SS_ES_only["code_value"] == 'MS') ]
len(df_ms_ss_only)

9711

In [175]:
# Mapping of erss fields to dashboard filters.

# CAMPUS: erss_campus which is a 2-digit code
# CREDENTIAL TYPE: erss_cred_obj mapped to SourceCode (MS, ES, and SS flag) using the "Credential and Subject Matter Waiver Objective" data dictionary definition
# ***CREDENTIAL OBJECTIVE***: erss_cred_obj CREDENTIAL TYPE is built off of this. Provides specific credential objective. primary California public school credential or Subject Matter waiver objective of the applicant
# PATHWAY: REGULAR OR INTERNSHIP - erss_cred_stat depends on CRED_STAT field
# ENTRY COHORT: erss_year and erss_term. Depends on earliest enrollment record. 
# STUDENT LEVEL: erss_stud_lev."Grade Level" in the EnrollmentProgression view. ITEP == Integrated Teacher Educator Education Program.
# GENDER: erss_sex. M, F, or N. IR & A does NOT show non-binary as an option.
# FIRST GENERATION STATUS: Comes from "PARENT/GUARDIAN #1 EDUCATION CODE" in ERSA. Not suggested for use.
# BREAKDOWN BY: "Race/Ethnicity". erss_ipeds_race_catg. Differes from IR & A who have a value for "International Student"

## Post Completion Page only
# CREDENTIAL LEVEL: Hard to understand where this is coming from.
# COMPLETION YEAR: I suggest that we use the year of completion from the completer lists.

### New or continuing student? erss_enroll_stat

In [176]:
df_2020_2021_benchmark = pd.read_csv("..\data\erss\ERSS_20203_20212_211020.csv")

  df_2020_2021_benchmark = pd.read_csv("..\data\erss\ERSS_20203_20212_211020.csv")


In [177]:
df_2020_2021 = pd.read_csv("..\data\erss\ERSS_20203_20212_211020.csv", dtype={'erss_cred_stat': str, 'erss_ethnic_old': str, 'erss_cred_emph': str, 'erss_spec_prog' :str})

In [178]:
# check: one more year of data for Fall 2020

df_2020_2021 = pd.read_csv("..\data\erss\ERSS_20203_20212_211020.csv", dtype={'erss_cred_stat': str, 'erss_ethnic_old': str, 'erss_cred_emph': str, 'erss_spec_prog' :str})

In [179]:
df_2020_fall = df_2020_2021[(df_2020_2021['erss_year'] == 2020) &
 (df_2020_2021['erss_term'] == 4) &
 (df_2020_2021['erss_cred_stat'].isin(['4', '5', '6', '8', 'V']))]

In [180]:
# left join the program type onto the main DataFrame
df_2020_combined = pd.merge(df_2020_fall, df_lookup, left_on = "erss_cred_obj", right_on = "erss_cred_obj",how = 'left')

In [181]:
df_2020_combined["code_value"].value_counts()

MS    5162
SS    3889
ES    2318
Name: code_value, dtype: int64

In [182]:
df_es_only = df_2020_combined[(df_2020_combined["code_value"] == 'ES')]

In [183]:
df_es_only["description"].value_counts()

Mild/Moderate Disabilities (2010)                              1340
Moderate/Severe Disabilities (2010)                             528
Early Childhood Special Education (2010)                        338
Visual Impairments (2010)                                        63
Deaf and Hard of Hearing (2010)                                  34
Reading Language Arts Specialist                                  4
Adapted Physical Education                                        4
Reading Certificate                                               3
Early Childhood Education Specialist (retiring Summer 2023)       2
Physical and Health Impairments (2010)                            1
Learning Handicapped                                              1
Name: description, dtype: int64

In [184]:
df_es_only["erss_cred_obj"].value_counts()

481.0    1340
482.0     528
436.0     338
483.0      63
485.0      34
410.0       4
490.0       4
415.0       3
430.0       2
484.0       1
463.0       1
Name: erss_cred_obj, dtype: int64

In [185]:
df_es_only_2 = df_2021_combined[(df_2021_combined["Descriptor.CodeValue"] == 'ES')]

KeyError: 'Descriptor.CodeValue'

In [None]:
df_es_only_2["SourceCode"].value_counts()

481    1339
482     503
436     329
483      76
485      36
410      23
490      14
Name: SourceCode, dtype: int64

In [None]:
# Could SourceCode == 410 not be counted by IR & A for the Education Specialist counts?