# Data Preparation

In [1]:
# import libraries
import importlib
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess
import sys

from pathlib import Path

In [42]:
# import other libraries
# jcds is installed straight from GitHub when it is not already available
# in the kernel's environment.
import importlib.util  # `import importlib` alone does not guarantee the `util` submodule is loaded

package_name = "jcds"

if importlib.util.find_spec(package_name) is None:
    print(f" '{package_name}' not found. Installing from Github... ")
    subprocess.check_call(
        [
            # sys.executable targets the interpreter running this kernel,
            # so the package lands in the same environment.
            sys.executable,
            "-m",
            "pip",
            "install",
            # The "git+" prefix tells pip to clone the repository; a bare
            # https URL is treated as an archive download and fails.
            "git+https://github.com/junclemente/jcds.git",
        ]
    )
    # Make the freshly installed package visible to the import system
    # without restarting the kernel.
    importlib.invalidate_caches()
else:
    print(f" '{package_name}' is already installed.")

from jcds import eda as jeda
from jcds import reports as jrep

 'jcds' is already installed.


In [43]:
# Root of the project data, relative to the notebook's working directory.
data_folder = Path("../data")

# One raw-data sub-folder per source.
ca_doe, ca_schls, civil_rights = (
    data_folder / "raw" / source
    for source in ("ca_doe", "ca_schls", "civil_rights_data")
)

## CA Dept of Education

**Adjusted Cohort Graduation Rate and Outcome Data**
Four-year Adjusted Cohort Graduation Rate (ACGR) and Outcome data reported by race/ethnicity, student group, and gender.  
Source: [https://www.cde.ca.gov/ds/ad/filesacgr.asp](https://www.cde.ca.gov/ds/ad/filesacgr.asp)

**Note:** To protect student privacy, data are suppressed (*) on the data file if the cell size within a selected student population (cohort students) is 10 or less. Additionally, the “Not Reported” race/ethnicity is suppressed, regardless of actual cell size, if the student population for one or more other race/ethnicity groups is suppressed.

In [44]:
# Tab-separated ACGR files acgr17.txt .. acgr21.txt, one per file.
datasets = [f"acgr{yy}.txt" for yy in range(17, 22)]

# Frames keyed by file name without the ".txt" extension.
dfs = {}

for dataset in datasets:
    # dtype=str keeps every column as text, so the "*" suppression marker
    # and the code columns are read without numeric coercion.
    df = pd.read_csv(ca_doe.joinpath(dataset), sep="\t", dtype=str)
    dfs[Path(dataset).stem] = df

In [45]:
# Short aliases for the five loaded frames, in file order (acgr17 .. acgr21).
df1, df2, df3, df4, df5 = (
    dfs[key] for key in ("acgr17", "acgr18", "acgr19", "acgr20", "acgr21")
)

### Compare columns

In [46]:
# Column labels of each frame; df1 is the baseline the others are compared to.
col1 = list(df1.columns)
col2 = list(df2.columns)
col3 = list(df3.columns)
col4 = list(df4.columns)
col5 = list(df5.columns)

clist = [col2, col3, col4, col5]
names = ["df2", "df3", "df4", "df5"]
for name, c in zip(names, clist):
    # Set comparison: checks which labels are present, not their order.
    # Sorted so the report is deterministic between runs.
    missing = sorted(set(col1) - set(c))
    extra = sorted(set(c) - set(col1))

    print(f"\nComparing {name} to df1:")

    if not missing and not extra:
        print("✅ Columns match exactly.")
    elif not missing:
        print("⚠️ No missing columns, but there are extras.")
    elif not extra:
        print("⚠️ No extra columns, but some are missing.")
    else:
        # Previously this case printed nothing, hiding the worst mismatch.
        print("⚠️ Some columns are missing and there are extras.")

    # Name the offending columns so a mismatch can be acted on.
    if missing:
        print(f"Missing in {name}: {missing}")
    if extra:
        print(f"Extra in {name}: {extra}")


Comparing df2 to df1:
✅ Columns match exactly.

Comparing df3 to df1:
✅ Columns match exactly.

Comparing df4 to df1:
✅ Columns match exactly.

Comparing df5 to df1:
✅ Columns match exactly.


### Combine to one dataset

In [47]:
# Stack the yearly frames into one frame with a fresh 0..n-1 index.
yearly_frames = [dfs[key] for key in dfs]
combined_df = pd.concat(yearly_frames, ignore_index=True)

# Preview ten random rows (no seed, so the rows differ on each run).
combined_df.sample(n=10)

Unnamed: 0,AcademicYear,AggregateLevel,CountyCode,DistrictCode,SchoolCode,CountyName,DistrictName,SchoolName,CharterSchool,DASS,...,SPED Certificate (Count),SPED Certificate (Rate),GED Completer (Count),GED Completer (Rate),Other Transfer (Count),Other Transfer (Rate),Dropout (Count),Dropout (Rate),Still Enrolled (Count),Still Enrolled (Rate)
186623,2016-17,S,51,71373.0,5132758.0,Sutter,East Nicolaus Joint Union High,East Nicolaus High,All,No,...,0,0.0,0,0.0,0,0.0,2,7.7,0,0.0
205023,2017-18,D,1,61259.0,0.0,Alameda,Oakland Unified,District Office,No,Yes,...,*,*,*,*,*,*,*,*,*,*
799356,2019-20,S,37,68346.0,3737384.0,San Diego,San Dieguito Union High,Sunset High (Continuation),No,Yes,...,*,*,*,*,*,*,*,*,*,*
721655,2019-20,S,19,64733.0,1931526.0,Los Angeles,Los Angeles Unified,Carson Senior High,No,All,...,4,1.2,0,0.0,3,0.9,15,4.5,3,0.9
37800,2016-17,D,36,73890.0,0.0,San Bernardino,Silver Valley Unified,District Office,All,Yes,...,*,*,*,*,*,*,*,*,*,*
197989,2017-18,C,17,,,Lake,,,No,All,...,3,0.5,0,0.0,66,11.2,54,9.2,20,3.4
104270,2016-17,S,19,64808.0,1935998.0,Los Angeles,Montebello Unified,Montebello High,All,No,...,2,0.3,0,0.0,52,7.9,34,5.2,26,4.0
268934,2017-18,S,10,73809.0,1030121.0,Fresno,Firebaugh-Las Deltas Unified,Firebaugh High,No,No,...,*,*,*,*,*,*,*,*,*,*
649077,2019-20,D,43,69427.0,0.0,Santa Clara,East Side Union High,District Office,No,Yes,...,0,0.0,0,0.0,0,0.0,1,4.8,9,42.9
218164,2017-18,D,19,64725.0,0.0,Los Angeles,Long Beach Unified,District Office,Yes,Yes,...,*,*,*,*,*,*,*,*,*,*


### Save combined dataset as pickle file

In [48]:
# Destination of the pickled combined frame, next to the raw ACGR files.
output_path = ca_doe / "combined_acgr.pkl"

# Write only when no pickle is present; an existing file is left untouched.
if not output_path.exists():
    combined_df.to_pickle(output_path)
    print(f"File saved successfully: {output_path}")
else:
    print(f"File already exists: {output_path}")

File already exists: ../data/raw/ca_doe/combined_acgr.pkl


In [49]:
# Print the jcds summary report for the combined frame. The recorded output
# below covers shape, duplicates, dtype counts and datetime columns.
jrep.data_info(combined_df)


SHAPE:
There are 1106018 rows and 34 columns (2162.36 MB).

DUPLICATES:
There are 0 duplicated rows.

COLUMNS/VARIABLES:
Column dType Summary:
 * object: 34
There are 0 numerical (int/float/bool) variables.
There are 34 categorical (nominal/ordinal) variables.

DATETIME COLUMNS:
There are 0 datetime variables and 0 possible datetime variables.

OTHER COLUMN/VARIABLE INFO:
ID Like Columns (threshold = 95.0%): 0
Columns with mixed datatypes: 4
