# Data Exploration

## Library imports

In [3]:
import importlib
import importlib.util
import subprocess
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [4]:
# Make sure the `jcds` helper library is importable; if it is missing, install
# it from GitHub into the environment of the running kernel.
package_name = "jcds"

if importlib.util.find_spec(package_name) is None:
    print(f" '{package_name}' not found. Installing from Github... ")
    # sys.executable is the kernel's interpreter, so pip installs into the
    # environment this notebook is actually running in.
    subprocess.check_call(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            # The "git+" prefix tells pip to clone the repository; a bare
            # https URL is handled as a direct archive/file download instead.
            "git+https://github.com/junclemente/jcds.git",
        ]
    )
    # Refresh the import system's caches so the freshly installed package can
    # be imported below without restarting the kernel.
    importlib.invalidate_caches()
else:
    print(f" '{package_name}' is already installed.")

from jcds import eda as jeda
from jcds import reports as jrep

 'jcds' is already installed.


## Import dataset

In [6]:
# Data directory, resolved relative to the current working directory.
data_folder = Path("../data")

# Load the pickled dataset. Unpickling can execute arbitrary code, so only
# load files from a trusted source.
df = pd.read_pickle(data_folder.joinpath("ews_dataset.pkl"))

# Preview the first rows of the frame.
df.head()

Unnamed: 0,cdscode,county,charter,eilcode,virtual,magnet,yearroundyn,latitude,longitude,multilingual,...,pct_hs_enrollment,pct_senior_cohort,pct_unsafe_gr11,pct_safe_gr11,pct_neutral_gr11,avg_safety_score,high_conn,low_conn,conn_ratio,school_climate_index
0,1611190130229,Alameda,N,HS,N,N,N,37.764958,-122.24593,N,...,1.0,0.498894,0.08,0.568,0.351,0.03423,0.333333,0.333333,0.999997,0.517115
1,1611270130450,Alameda,N,HS,N,N,N,37.896661,-122.29257,N,...,1.0,0.501264,0.08,0.568,0.351,0.03423,0.333333,0.333333,0.999997,0.517115
2,1611430131177,Alameda,N,HS,N,N,N,37.868913,-122.2712,Y,...,1.0,0.496276,0.08,0.568,0.351,0.03423,0.333333,0.333333,0.999997,0.517115
3,1611500132225,Alameda,N,HS,N,N,N,37.705184,-122.07847,N,...,1.0,0.470174,0.08,0.568,0.351,0.03423,0.333333,0.333333,0.999997,0.517115
4,1611500133876,Alameda,N,HS,V,N,N,37.713501,-122.09222,N,...,0.39881,0.196429,0.08,0.568,0.351,0.03423,0.333333,0.333333,0.999997,0.517115


## Basic dataset info

In [15]:
# Print the jcds overview report for df: shape, duplicate rows, and a dtype
# summary; show_columns=True also lists the column names in each group.
jrep.data_info(df, show_columns=True)


SHAPE:
There are 1067 rows and 50 columns (1.63 MB).

DUPLICATES:
There are 0 duplicated rows.

COLUMNS/VARIABLES:
Column dType Summary:
 * object: 25
 * float: 25
There are 25 numerical (int/float/bool) variables.
 * Columns: ['percent__eligible_free_k12', 'frpm_count_k12', 'pct_associate', 'pct_bachelors', 'pct_bachelors_plus', 'pct_master', 'pct_master_plus', 'pct_doctorate', 'pct_juris_doctor', 'pct_no_degree', 'pct_experienced', 'pct_inexperienced', 'pct_first_year', 'pct_second_year', 'grade_retention_ratio', 'pct_hs_enrollment', 'pct_senior_cohort', 'pct_unsafe_gr11', 'pct_safe_gr11', 'pct_neutral_gr11', 'avg_safety_score', 'high_conn', 'low_conn', 'conn_ratio', 'school_climate_index']
There are 25 categorical (nominal/ordinal) variables.
 * Columns: ['cdscode', 'county', 'charter', 'eilcode', 'virtual', 'magnet', 'yearroundyn', 'latitude', 'longitude', 'multilingual', 'cohortstudents', 'regular_hs_diploma_graduates_rate', 'met_uccsu_grad_reqs_rate', 'seal_of_biliteracy_rate', 

In [16]:
# Print the jcds cardinality report for df: binary, constant/near-constant,
# and low/high-cardinality columns, with the affected column names listed.
jrep.data_cardinality(df, show_columns=True)

CARDINALITY REPORT

Total columns analyzed: 50

[BINARY COLUMNS]
There are 1 binary columns.
 * Columns: ['yearroundyn']
There are 0 binary with nan.

[CONSTANT/NEAR CONSTANT COLUMNS]
There are 5 constant columns.
 * Columns: ['charter', 'eilcode', 'high_conn', 'low_conn', 'conn_ratio']
There are 11 near-constant columns with >= 95% of values being the same.
 * Columns: ['charter', 'eilcode', 'yearroundyn', 'calpads_fall_1_certification_status', 'school_grade_span', 'stu_tch_ratio', 'stu_adm_ratio', 'stu_psv_ratio', 'high_conn', 'low_conn', 'conn_ratio']

[LOW CARDINALITY CATEGORICAL COLUMNS]
 * There are 8 low cardinality columns with <= 10 unique values.
Columns:
 * charter: 1 unique values
 * eilcode: 1 unique values
 * virtual: 3 unique values
 * magnet: 3 unique values
 * yearroundyn: 2 unique values
 * multilingual: 3 unique values
 * calpads_fall_1_certification_status: 1 unique values
 * school_grade_span: 1 unique values

[HIGH CARDINALITY CATEGORICAL COLUMNS]
 * There are 3 h

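The following features can be removed from the dataset because each contains only one unique value:
- charter
- eilcode
- calpads_fall_1_certification_status
- school_grade_span

The cardinality report above also lists `high_conn`, `low_conn`, and `conn_ratio` as constant columns, so they are candidates for removal as well.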


In [13]:
# Print the jcds data-quality report for df: missing values overall, per row
# and per column, plus the duplicate-row count.
jrep.data_quality(df, show_columns=True) 

DATA QUALITY REPORT

 * Total entries (rows * cols): 53350
 * Memory usage: 1.63 MB
 * Rows: 1067
 * Columns: 50

MISSING DATA:
 * Total entries: 6752 missing (12.7%)

ROWS:
----------
 * Rows missing any: 1067
 * Rows missing all: 0

DUPLICATES: 0

COLUMNS:
----------------
Columns missing any: 37
	'school_grade_span': 1024 missing (96.0%)
	'stu_tch_ratio': 1024 missing (96.0%)
	'stu_adm_ratio': 1024 missing (96.0%)
	'stu_psv_ratio': 1024 missing (96.0%)
	'pct_unsafe_gr11': 172 missing (16.1%)
	'pct_safe_gr11': 172 missing (16.1%)
	'pct_neutral_gr11': 172 missing (16.1%)
	'avg_safety_score': 123 missing (11.5%)
	'school_climate_index': 123 missing (11.5%)
	'pct_associate': 96 missing (9.0%)
	'pct_bachelors': 96 missing (9.0%)
	'pct_bachelors_plus': 96 missing (9.0%)
	'pct_master': 96 missing (9.0%)
	'pct_master_plus': 96 missing (9.0%)
	'pct_doctorate': 96 missing (9.0%)
	'pct_juris_doctor': 96 missing (9.0%)
	'pct_no_degree': 96 missing (9.0%)
	'pct_experienced': 96 missing (9.0%)
	'