# Data Preparation

In [1]:
# import libraries
import importlib
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess
import sys

from pathlib import Path

In [2]:
# import other libraries
# jcds is installed from its GitHub repository on first use when the kernel's
# environment does not already provide it.
# `import importlib` alone does not guarantee that the `util` submodule is
# loaded, so import it explicitly before calling find_spec.
import importlib.util

package_name = "jcds"

if importlib.util.find_spec(package_name) is None:
    print(f" '{package_name}' not found. Installing from Github... ")
    # Run pip through the kernel's own interpreter so the package lands in the
    # environment this notebook is actually executing in.
    subprocess.check_call(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            # pip only clones a repository when the URL carries the `git+`
            # scheme prefix; a bare https URL is fetched as an archive file.
            "git+https://github.com/junclemente/jcds.git",
        ]
    )
    # Let the import system notice the package that was just installed.
    importlib.invalidate_caches()
else:
    print(f" '{package_name}' is already installed.")

from jcds import eda as jeda
from jcds import reports as jrep

 'jcds' is already installed.


In [3]:
# Data sub-folders, resolved relative to the notebook's working directory.
# The `/` operator already yields a Path, so no extra Path(...) wrapper is needed.
data_folder = Path("../data")
ca_doe = data_folder / "ca_doe"
ca_schls = data_folder / "ca_schls"
civil_rights = data_folder / "civil_rights_data"

## CA Dept of Education

**Adjusted Cohort Graduation Rate and Outcome Data**<br>
Four-year Adjusted Cohort Graduation Rate (ACGR) and Outcome data reported by race/ethnicity, student group, and gender.  
Source: [https://www.cde.ca.gov/ds/ad/filesacgr.asp](https://www.cde.ca.gov/ds/ad/filesacgr.asp)

**Note:** To protect student privacy, data are suppressed (*) on the data file if the cell size within a selected student population (cohort students) is 10 or fewer. Additionally, the “Not Reported” race/ethnicity group is suppressed, regardless of actual cell size, if the student population for one or more other race/ethnicity groups is suppressed.

In [4]:
# Tab-delimited ACGR extracts to read from the ca_doe folder.
datasets = [
    "acgr17.txt",
    "acgr18.txt",
    "acgr19.txt",
    "acgr20.txt",
    "acgr21.txt",
]

# Loaded frames, keyed by file name without its extension ("acgr17" ... "acgr21").
dfs = {}

for dataset in datasets:
    # dtype=str reads every column as text, with no numeric type inference.
    df = pd.read_csv(
        ca_doe / dataset,
        sep="\t",
        dtype=str,
    )
    dfs[Path(dataset).stem] = df

In [5]:
# Short aliases for the loaded frames, in the order the files were listed.
df1, df2, df3, df4, df5 = (
    dfs[key] for key in ("acgr17", "acgr18", "acgr19", "acgr20", "acgr21")
)

### Compare columns

In [36]:
# Check that df2..df5 carry the same set of column names as df1 before the
# frames are concatenated.
col1 = list(df1.columns)
col2 = list(df2.columns)
col3 = list(df3.columns)
col4 = list(df4.columns)
col5 = list(df5.columns)

clist = [col2, col3, col4, col5]
names = ["df2", "df3", "df4", "df5"]
for name, c in zip(names, clist):
    # Set differences compare names only (column order and repeats are
    # ignored); sorting keeps the printed lists stable between runs.
    missing = sorted(set(col1) - set(c))
    extra = sorted(set(c) - set(col1))

    print(f"\nComparing {name} to df1:")

    if not missing and not extra:
        print("✅ Columns match exactly.")
    elif not missing:
        print("⚠️ No missing columns, but there are extras.")
    elif not extra:
        print("⚠️ No extra columns, but some are missing.")
    else:
        # Both kinds of mismatch at once; previously this case printed nothing.
        print("⚠️ Some columns are missing and there are extras.")

    # Name the offending columns so a mismatch can be acted on.
    if missing:
        print(f"Missing in {name}: {missing}")
    if extra:
        print(f"Extra in {name}: {extra}")


Comparing df2 to df1:
✅ Columns match exactly.

Comparing df3 to df1:
✅ Columns match exactly.

Comparing df4 to df1:
✅ Columns match exactly.

Comparing df5 to df1:
✅ Columns match exactly.


### Combine to one dataset

In [37]:
# Stack the per-file frames into a single table; ignore_index=True discards
# each file's own row labels and renumbers the result from 0.
combined_df = pd.concat(dfs.values(), ignore_index=True)

# Preview ten random rows. The fixed seed makes the same rows appear on every
# Restart & Run All instead of a different sample each time.
combined_df.sample(10, random_state=42)

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,CharterSchool,DASS,...,SPED Certificate (Count),SPED Certificate (Rate),GED Completer (Count),GED Completer (Rate),Other Transfer (Count),Other Transfer (Rate),Dropout (Count),Dropout (Rate),Still Enrolled (Count),Still Enrolled (Rate)
1019485,2020-21,S,33,66993,3330479,Riverside,Beaumont Unified,Beaumont Senior High,All,All,...,0,0.0,0,0.0,2,1.2,7,4.2,1,0.6
485126,2018-19,S,19,64436,1,Los Angeles,Covina-Valley Unified,"Nonpublic, Nonsectarian Schools",No,All,...,*,*,*,*,*,*,*,*,*,*
705101,2019-20,S,19,64451,1932441,Los Angeles,Downey Unified,Downey High,All,All,...,17,1.8,0,0.0,6,0.6,9,0.9,2,0.2
749101,2019-20,S,27,75440,2730190,Monterey,Soledad Unified,Soledad High,All,All,...,*,*,*,*,*,*,*,*,*,*
607007,2019-20,D,4,61432,0,Butte,Durham Unified,District Office,No,No,...,0,0.0,0,0.0,0,0.0,1,2.1,0,0.0
28987,2016-17,D,27,75440,0,Monterey,Soledad Unified,District Office,All,No,...,0,0.0,0,0.0,2,8.3,2,8.3,1,4.2
930546,2020-21,S,7,61739,730549,Contra Costa,Martinez Unified,Alhambra Senior High,No,No,...,1,0.8,0,0.0,0,0.0,2,1.7,0,0.0
388233,2017-18,S,50,75739,5037700,Stanislaus,Turlock Unified,Turlock High,No,All,...,5,1.5,0,0.0,0,0.0,11,3.4,5,1.5
254359,2017-18,S,1,61200,133397,Alameda,Livermore Valley Joint Unified,Granada High,All,All,...,0,0.0,1,0.8,1,0.8,2,1.6,1,0.8
362572,2017-18,S,38,68478,3830429,San Francisco,San Francisco Unified,Life Learning Academy Charter,Yes,Yes,...,*,*,*,*,*,*,*,*,*,*


### Save combined dataset as pickle file

In [40]:
output_path = ca_doe / "combined_acgr.pkl"

# Write the pickle only when it is not on disk yet. An existing file is left
# untouched, so it is not refreshed if combined_df has changed since it was saved.
if not output_path.exists():
    combined_df.to_pickle(output_path)
    print(f"File saved successfully: {output_path}")
else:
    print(f"File already exists: {output_path}")

File already exists: ../data/ca_doe/combined_acgr.pkl


In [41]:
# Print the jcds summary report for the combined frame (the output below covers
# shape, duplicate rows, and a dtype breakdown). The source files were read with
# dtype=str, which is why all 34 columns are reported as object.
jrep.data_info(combined_df)


SHAPE:
There are 1106018 rows and 34 columns (2162.36 MB).

DUPLICATES:
There are 0 duplicated rows.

COLUMNS/VARIABLES:
Column dType Summary:
 * object: 34
There are 0 numerical (int/float/bool) variables.
There are 34 categorical (nominal/ordinal) variables.

DATETIME COLUMNS:
There are 0 datetime variables and 0 possible datetime variables.

OTHER COLUMN/VARIABLE INFO:
ID Like Columns (threshold = 95.0%): 0
Columns with mixed datatypes: 4
