# Data Preparation


In [1]:
# import libraries
import importlib
import importlib.util
import os
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [2]:
# import other libraries
# Make sure the `jcds` helper package is importable; install it from GitHub
# into the kernel's own interpreter (sys.executable) when it is missing.
package_name = "jcds"

if importlib.util.find_spec(package_name) is None:
    print(f" '{package_name}' not found. Installing from Github... ")
    subprocess.check_call(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            # pip only clones a repository when the URL carries the `git+`
            # scheme; a bare https URL is treated as a file to download.
            "git+https://github.com/junclemente/jcds.git",
        ]
    )
    # Let the running kernel's import system notice the new package.
    importlib.invalidate_caches()
else:
    print(f" '{package_name}' is already installed.")

from jcds import eda as jeda
from jcds import reports as jrep

 'jcds' is already installed.


In [3]:
# Raw-data locations, all relative to the notebook's working directory.
data_folder = Path("../data")
ca_doe = data_folder / "raw" / "ca_doe"
ca_schls = data_folder / "raw" / "ca_schls"
civil_rights = data_folder / "raw" / "civil_rights_data"

## CA Dept of Education

**Adjusted Cohort Graduation Rate and Outcome Data**
Four-year Adjusted Cohort Graduation Rate (ACGR) and Outcome data reported by race/ethnicity, student group, and gender.  
Source: [https://www.cde.ca.gov/ds/ad/filesacgr.asp](https://www.cde.ca.gov/ds/ad/filesacgr.asp)

**Note:** To protect student privacy, data are suppressed (*) on the data file if the cell size within a selected student population (cohort students) is 10 or less. Additionally, the “Not Reported” race/ethnicity is suppressed, regardless of actual cell size, if the student population for one or more other race/ethnicity groups is suppressed.

The raw datasets may not be included in the repository because of their size.
The datasets can be downloaded from here: [raw datasets](https://drive.google.com/drive/folders/1OUvpL1nY1uTu2PmjIHJ7wZmY4YgW4onM?usp=sharing)

This section produces a combined dataset: `combined_acgr.pkl`


In [4]:
# Tab-delimited ACGR extracts expected inside the `ca_doe` folder.
datasets = [
    "acgr17.txt",
    "acgr18.txt",
    "acgr19.txt",
    "acgr20.txt",
    "acgr21.txt",
]

# Maps the file name without its ".txt" suffix to the loaded frame.
dfs = {}

for dataset in datasets:
    frame_key = dataset.replace(".txt", "")
    # dtype=str keeps every column as text, so no value is coerced on load.
    df = pd.read_csv(
        ca_doe / dataset,
        sep="\t",
        dtype=str,
    )
    dfs[frame_key] = df

In [5]:
# Short positional aliases for the five loaded frames, in file-name order.
df1, df2, df3, df4, df5 = (
    dfs[key] for key in ("acgr17", "acgr18", "acgr19", "acgr20", "acgr21")
)

### Compare columns

In [6]:
# Check that df2..df5 carry the same column names as df1 before stacking them.
# The comparison is set-based: column order and duplicated names are ignored.
col1 = list(df1.columns)
col2 = list(df2.columns)
col3 = list(df3.columns)
col4 = list(df4.columns)
col5 = list(df5.columns)

clist = [col2, col3, col4, col5]
names = ["df2", "df3", "df4", "df5"]
for name, c in zip(names, clist):
    # Sorted so the printed lists are stable from run to run.
    missing = sorted(set(col1) - set(c))
    extra = sorted(set(c) - set(col1))

    print(f"\nComparing {name} to df1:")

    if not missing and not extra:
        print("✅ Columns match exactly.")
    elif not missing:
        print("⚠️ No missing columns, but there are extras.")
        print(f"Extra in {name}: {extra}")
    elif not extra:
        print("⚠️ No extra columns, but some are missing.")
        print(f"Missing in {name}: {missing}")
    else:
        # Both kinds of mismatch at once: report both so this case is never silent.
        print("⚠️ Some columns are missing and there are extras.")
        print(f"Missing in {name}: {missing}")
        print(f"Extra in {name}: {extra}")


Comparing df2 to df1:
✅ Columns match exactly.

Comparing df3 to df1:
✅ Columns match exactly.

Comparing df4 to df1:
✅ Columns match exactly.

Comparing df5 to df1:
✅ Columns match exactly.


### Combine to one dataset

In [7]:
# Stack the yearly frames row-wise; ignore_index=True discards each frame's
# own index and numbers the combined rows from 0.
combined_df = pd.concat(dfs.values(), ignore_index=True)

# Row-wise concatenation must neither drop nor add rows.
assert len(combined_df) == sum(len(frame) for frame in dfs.values())

# Seeded so the preview shows the same rows on every Restart & Run All.
combined_df.sample(10, random_state=42)

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,CharterSchool,DASS,...,SPED Certificate (Count),SPED Certificate (Rate),GED Completer (Count),GED Completer (Rate),Other Transfer (Count),Other Transfer (Rate),Dropout (Count),Dropout (Rate),Still Enrolled (Count),Still Enrolled (Rate)
650104,2019-20,D,44,10447.0,0.0,Santa Cruz,Santa Cruz County Office of Education,District Office,All,Yes,...,0,0.0,1,0.4,0,0.0,114,43.8,0,0.0
60264,2016-17,S,1,61309.0,111799.0,Alameda,San Lorenzo Unified,East Bay Arts High,All,All,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
646642,2019-20,D,41,10413.0,0.0,San Mateo,San Mateo County Office of Education,District Office,All,All,...,1,9.1,0,0.0,0,0.0,3,27.3,0,0.0
1737,2016-17,C,14,,,Inyo,,,Yes,Yes,...,*,*,*,*,*,*,*,*,*,*
580729,2018-19,S,48,70573.0,135095.0,Solano,Vacaville Unified,Ernest Kimme Charter Academy for Independent L...,All,No,...,0,0.0,0,0.0,0,0.0,2,4.8,0,0.0
350649,2017-18,S,37,68031.0,3731478.0,San Diego,Coronado Unified,Coronado High,All,All,...,0,0.0,0,0.0,3,1.0,19,6.1,0,0.0
921954,2020-21,S,1,61259.0,118224.0,Alameda,Oakland Unified,Aspire Golden State College Preparatory Academy,Yes,No,...,0,0.0,0,0.0,0,0.0,2,2.9,4,5.9
643363,2019-20,D,37,73569.0,0.0,San Diego,Oceanside Unified,District Office,No,No,...,14,8.3,0,0.0,5,3.0,16,9.5,12,7.1
670138,2019-20,S,4,61408.0,430827.0,Butte,Biggs Unified,Biggs High,No,No,...,0,0.0,0,0.0,0,0.0,3,10.7,0,0.0
760351,2019-20,S,31,66894.0,102293.0,Placer,Placer Union High,Foresthill High,All,All,...,*,*,*,*,*,*,*,*,*,*


### Save combined dataset as pickle file

In [12]:
# Write the combined frame next to the raw files, but only when no pickle is
# there yet; an existing file is left untouched.
output_path = ca_doe / "combined_acgr.pkl"

if not output_path.exists():
    combined_df.to_pickle(output_path)
    print(f"File saved successfully: {output_path}")
else:
    print(f"File already exists: {output_path}")

File already exists: ../data/raw/ca_doe/combined_acgr.pkl


In [9]:
# Print the jcds summary report (shape, duplicates, dtype breakdown) for the
# combined frame. The source files are read with dtype=str, which is why the
# recorded run reports every column as object/categorical.
jrep.data_info(combined_df)


SHAPE:
There are 1106018 rows and 34 columns (2162.36 MB).

DUPLICATES:
There are 0 duplicated rows.

COLUMNS/VARIABLES:
Column dType Summary:
 * object: 34
There are 0 numerical (int/float/bool) variables.
There are 34 categorical (nominal/ordinal) variables.

DATETIME COLUMNS:
There are 0 datetime variables and 0 possible datetime variables.

OTHER COLUMN/VARIABLE INFO:
ID Like Columns (threshold = 95.0%): 0
Columns with mixed datatypes: 4


## California School Climate, Health, and Learning Surveys

[California School Climate, Health, and Learning Surveys](https://calschls.org/)
A tool supported by the California Department of Education to help districts meet Local Control Accountability Plan priorities to improve school climate, pupil engagement, parent involvement, and academic achievement.

**Query CalSCHLS**  
Query statewide data on a variety of topics organized by gender, grade-level, race/ethnicity, and level of connectedness to school. Click a topic, select the indicator you are interested in, and choose how you would like the data organized.  

- Child and Youth Safety
  - Bullying and Harassment at School
  - Gang Involvement
  - School Safety
[https://calschls.org/reports-data/query-calschls/](https://calschls.org/reports-data/query-calschls/)