# Data Preparation

Focus on school years 2021-2022


In [11]:
# import libraries
import importlib
import importlib.util
import os
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [12]:
# Make sure the helper package `jcds` is importable before pulling in its
# submodules; if it is missing, install it into the environment of the
# interpreter running this kernel (sys.executable).
package_name = "jcds"
# pip only clones a repository when the URL carries the `git+` scheme; a bare
# https URL is treated as a direct archive/wheel download and fails for a
# GitHub repository address.
package_source = "git+https://github.com/junclemente/jcds.git"

if importlib.util.find_spec(package_name) is None:
    print(f" '{package_name}' not found. Installing from Github... ")
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", package_source]
    )
    # Let the import system notice the package that was just installed.
    importlib.invalidate_caches()
else:
    print(f" '{package_name}' is already installed.")

# These imports must stay below the check so they run after a fresh install.
from jcds import eda as jeda
from jcds import reports as jrep

 'jcds' is already installed.


In [13]:
# Data locations, expressed relative to the current working directory.
data_folder = Path("../data")
raw_data = data_folder / "raw"

# One raw sub-folder per data source.
ca_doe = raw_data / "ca_doe"
ca_schls = raw_data / "ca_schls"
civil_rights = raw_data / "civil_rights_data"

# CA Dept of Education

**Adjusted Cohort Graduation Rate and Outcome Data**
Four-year Adjusted Cohort Graduation Rate (ACGR) and Outcome data reported by race/ethnicity, student group, and gender.  
Source: [https://www.cde.ca.gov/ds/ad/filesacgr.asp](https://www.cde.ca.gov/ds/ad/filesacgr.asp)

**Note:** To protect student privacy, data are suppressed (\*) on the data file if the cell size within a selected student population (cohort students) is 10 or less. Additionally, the “Not Reported” race/ethnicity is suppressed, regardless of actual cell size, if the student population for one or more other race/ethnicity groups is suppressed.

Raw datasets may not be included in the repository due to their size.
The datasets can be downloaded from here: [raw datasets](https://drive.google.com/drive/folders/1OUvpL1nY1uTu2PmjIHJ7wZmY4YgW4onM?usp=sharing)


In [14]:
# Load the tab-delimited ACGR file. Every column is read as text so that
# zero-padded codes (e.g. CountyCode "01") and the "*" suppression marker
# come through unchanged.
dataset = "acgr21.txt"
acgr_path = ca_doe / dataset
df = pd.read_csv(acgr_path, dtype=str, sep="\t")
df

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,CharterSchool,DASS,...,SPED Certificate (Count),SPED Certificate (Rate),GED Completer (Count),GED Completer (Rate),Other Transfer (Count),Other Transfer (Rate),Dropout (Count),Dropout (Rate),Still Enrolled (Count),Still Enrolled (Rate)
0,2020-21,C,01,,,Alameda,,,All,All,...,57,0.7,0,0.0,15,0.2,449,5.2,305,3.5
1,2020-21,C,01,,,Alameda,,,All,All,...,97,1.1,5,0.1,29,0.3,849,9.3,472,5.2
2,2020-21,C,01,,,Alameda,,,All,All,...,*,*,*,*,*,*,*,*,*,*
3,2020-21,C,01,,,Alameda,,,All,All,...,18,0.4,1,0.0,2,0.0,112,2.5,47,1.1
4,2020-21,C,01,,,Alameda,,,All,All,...,27,1.6,0,0.0,3,0.2,172,10.1,139,8.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254933,2020-21,T,00,,,State,,,Yes,Yes,...,0,0.0,1,0.1,6,0.8,273,37.6,269,37.1
254934,2020-21,T,00,,,State,,,Yes,Yes,...,2,0.1,28,0.7,62,1.6,2007,52.1,1056,27.4
254935,2020-21,T,00,,,State,,,Yes,Yes,...,0,0.0,0,0.0,2,1.3,33,22.1,55,36.9
254936,2020-21,T,00,,,State,,,Yes,Yes,...,7,0.0,142,0.6,376,1.6,10306,44.2,6911,29.6


| **Column Name**       | **Description**                                                                                                                                                                                                                                                           | **Values Kept** | **Reason for Filter**                                                                                                              |
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| **AggregateLevel**    | Indicates the data aggregation level within the ACGR dataset.<br><br>Possible values:<br>• `T` = State<br>• `C` = County<br>• `D` = District<br>• `S` = School                                                                                                            | `S` (School)    | Focuses analysis on **individual schools**, providing more data points and allowing finer-grained modeling of graduation outcomes. |
| **CharterSchool**     | Indicates whether the record includes data for charter schools.<br><br>Possible values:<br>• `All` = All schools combined<br>• `Yes` = Charter only<br>• `No` = Non-charter only                                                                                          | `No`            | Excludes **charter schools** to focus on **traditional public high schools**.                                                      |
| **DASS**              | Dashboard Alternative School Status (DASS) flag.<br><br>Possible values:<br>• `All` = All schools combined<br>• `Yes` = DASS only<br>• `No` = Non-DASS only                                                                                                               | `No`            | Removes **alternative/continuation programs** so graduation rates reflect typical comprehensive high schools.                      |
| **ReportingCategory** | Identifies which subgroup the record represents.<br><br>Examples:<br>• Race/Ethnicity (`RB` = African American, `RH` = Hispanic, etc.)<br>• Gender (`GM` = Male, `GF` = Female)<br>• Program groups (`SD` = Students with Disabilities, etc.)<br>• `TA` = Total Aggregate | `TA`            | Keeps **aggregate totals** for each school (not broken down by subgroup) to simplify modeling.                                     |


In [15]:
# Filter the dataset down to the total ("TA") rows of non-charter, non-DASS
# schools. Every compared column is whitespace-stripped first, so stray
# padding in the raw text file cannot silently drop a row; ReportingCategory
# is normalized the same way as the other three columns.
is_school_level = df["AggregateLevel"].str.strip() == "S"
is_non_charter = df["CharterSchool"].str.strip() == "No"
is_non_dass = df["DASS"].str.strip() == "No"
is_total_row = df["ReportingCategory"].str.strip() == "TA"

df_filtered = df[is_school_level & is_non_charter & is_non_dass & is_total_row]

df_filtered

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,CharterSchool,DASS,...,SPED Certificate (Count),SPED Certificate (Rate),GED Completer (Count),GED Completer (Rate),Other Transfer (Count),Other Transfer (Rate),Dropout (Count),Dropout (Rate),Still Enrolled (Count),Still Enrolled (Rate)
66594,2020-21,S,01,31609,0131755,Alameda,California School for the Blind (State Special...,California School for the Blind,No,No,...,4,36.4,0,0.0,0,0.0,7,63.6,0,0.0
66654,2020-21,S,01,31617,0131763,Alameda,California School for the Deaf-Fremont (State ...,California School for the Deaf-Fremont,No,No,...,2,5.3,0,0.0,0,0.0,1,2.6,11,28.9
66718,2020-21,S,01,61119,0000001,Alameda,Alameda Unified,"Nonpublic, Nonsectarian Schools",No,No,...,*,*,*,*,*,*,*,*,*,*
66782,2020-21,S,01,61119,0106401,Alameda,Alameda Unified,Alameda Science and Technology Institute,No,No,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
66910,2020-21,S,01,61119,0130229,Alameda,Alameda Unified,Alameda High,No,No,...,11,2.8,0,0.0,1,0.3,9,2.3,4,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254262,2020-21,S,58,72736,0000000,Yuba,Marysville Joint Unified,District Office,No,No,...,0,0.0,0,0.0,0,0.0,30,43.5,0,0.0
254314,2020-21,S,58,72736,0000001,Yuba,Marysville Joint Unified,"Nonpublic, Nonsectarian Schools",No,No,...,*,*,*,*,*,*,*,*,*,*
254426,2020-21,S,58,72736,5830013,Yuba,Marysville Joint Unified,Lindhurst High,No,No,...,0,0.0,0,0.0,2,0.9,14,6.2,13,5.8
254630,2020-21,S,58,72736,5835202,Yuba,Marysville Joint Unified,Marysville High,No,No,...,0,0.0,0,0.0,1,0.5,14,7.0,4,2.0


## Save dataset as pickle file


In [20]:
# Persist the filtered frame as a pickle. The write is unconditional, so a
# file already present at this path is replaced each time the cell runs.
# NOTE(review): the file name says "21_22" while the loaded rows report
# AcademicYear 2020-21 — confirm the intended naming.
output_path = data_folder / "acgr21_22_raw.pkl"
df_filtered.to_pickle(output_path)

In [19]:
# Print the jcds summary report for the filtered frame: shape, duplicate-row
# count and a per-dtype column breakdown (see the output below).
# show_columns=True presumably adds the column-name lists — confirm in jcds.
jrep.data_info(df_filtered, show_columns=True)


SHAPE:
There are 1502 rows and 34 columns (2.97 MB).

DUPLICATES:
There are 0 duplicated rows.

COLUMNS/VARIABLES:
Column dType Summary:
 * object: 34
There are 0 numerical (int/float/bool) variables.
 * Columns: []
There are 34 categorical (nominal/ordinal) variables.
 * Columns: ['AcademicYear', 'AggregateLevel', 'CountyCode', 'DistrictCode', 'SchoolCode', 'CountyName', 'DistrictName', 'SchoolName', 'CharterSchool', 'DASS', 'ReportingCategory', 'CohortStudents', 'Regular HS Diploma Graduates (Count)', 'Regular HS Diploma Graduates (Rate)', "Met UC/CSU Grad Req's (Count)", "Met UC/CSU Grad Req's (Rate)", 'Seal of Biliteracy (Count)', 'Seal of Biliteracy (Rate)', 'Golden State Seal Merit Diploma (Count)', 'Golden State Seal Merit Diploma (Rate', 'CHSPE Completer (Count)', 'CHSPE Completer (Rate)', 'Adult Ed. HS Diploma (Count)', 'Adult Ed. HS Diploma (Rate)', 'SPED Certificate (Count)', 'SPED Certificate (Rate)', 'GED Completer (Count)', 'GED Completer (Rate)', 'Other Transfer (Count)

# California School Climate, Health, and Learning Surveys

[California School Climate, Health, and Learning Surveys](https://calschls.org/)  
A tool supported by the California Department of Education to help districts meet Local Control Accountability Plan priorities to improve school climate, pupil engagement, parent involvement, and academic achievement.

**Query CalSCHLS**  
Query statewide data on a variety of topics organized by gender, grade-level, race/ethnicity, and level of connectedness to school. Click a topic, select the indicator you are interested in, and choose how you would like the data organized.

- Child and Youth Safety
  - Bullying and Harassment at School
  - Gang Involvement
  - School Safety
    [https://calschls.org/reports-data/query-calschls/](https://calschls.org/reports-data/query-calschls/)


In [None]:
# Gang-membership spreadsheets, one per breakdown, keyed by a short label.
# File names are used exactly as they exist on disk (the second one looks
# truncated — presumably by the download; leave it as is unless renamed).
schls_sources = {
    "grade": "Kidsdata-Gang-Membership--by-Grade-Level-(All-Years).xls",
    "gender_grade": "Kidsdata-Gang-Membership--by-Gender-and-Grade-Level-(All-Ye.xls",
    "orientation": "Kidsdata-Gang-Membership--by-Sexual-Orientation-(All-Years).xls",
}

# Read each workbook into a DataFrame, in the order listed above.
schls_df = {}
for label, file_name in schls_sources.items():
    schls_df[label] = pd.read_excel(ca_schls / file_name)

# Civil Rights Data Collection Office for Civil Rights

[Civil Rights Data Collection Office for Civil Rights - DoE](https://civilrightsdata.ed.gov/data)  
Data collected through CRDC allows researchers, analysts, and journalists to identify new and important insights into the state of civil rights in education.


In [None]:
# Holder for the Civil Rights Data Collection tables; it starts out empty.
# NOTE(review): presumably filled keyed by table label like `schls_df` — confirm.
cr_df = {}