In [None]:
!pip install geopandas matplotlib

import matplotlib as mpl
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import numpy as np
from scipy import stats

%matplotlib inline

In [None]:
# Directory that holds both the district boundary file and the per-district CSV.
DATA_DIR = 'drive/My Drive/Miscellaneous/Projects/ohio-inequity/'

# Load the district geometries, discard the first feature and renumber the rows.
# NOTE(review): why the first feature is dropped is not visible here - confirm.
df_schools = gpd.read_file(DATA_DIR + 'State_Wide_School_Districts.geojson')
df_schools = df_schools.iloc[1:].reset_index(drop=True)
# Keep only the identifier and geometry columns. The explicit copy makes this an
# independent frame, so the dtype assignment below is not a write into a slice
# of the frame that was read from disk.
df_schools = df_schools[['ODE_IRN', 'TAXID', 'ID', 'Shape_Leng', 'geometry']].copy()
# Cast the join key to int64 before matching it against the CSV's IRN column.
df_schools['ODE_IRN'] = df_schools['ODE_IRN'].astype('int64')

# Attach the per-district figures; the inner join keeps only districts whose
# ODE_IRN appears as an IRN in the CSV.
data_schools = pd.read_csv(DATA_DIR + 'dataset.csv')
data = df_schools.join(data_schools.set_index('IRN'), on='ODE_IRN', how="inner")

# Share of each student group in the district total. Despite the "Percent"
# prefix these are plain ratios (not multiplied by 100).
for group in ['Black', 'White', 'Disability', 'Latino', 'Asian',
              'NativeAm', 'NatHaw', 'Multi', 'EconDis', 'EL']:
  data['Percent ' + group] = data['Students ' + group]/data['Total Students']
# Non-white share is derived from the total rather than from a stored count.
data['Percent NonWhite'] = (data['Total Students'] - data['Students White'])/data['Total Students']
data = data.reset_index(drop=True)

def remove_outliers(df, column, threshold=3):
  """Return the rows of ``df`` whose ``column`` value is not an outlier.

  A row is kept when the absolute z-score of its ``column`` value, computed
  with ``scipy.stats.zscore`` over the whole column, is strictly below
  ``threshold``.

  Args:
    df: DataFrame to filter. It is not modified.
    column: Name of the numeric column the z-scores are computed on.
    threshold: Absolute z-score at or above which a row is discarded.
      Defaults to 3, the cutoff that used to be hard-coded here.

  Returns:
    A new DataFrame with the kept rows, re-indexed from 0.
  """
  z_scores = np.asarray(stats.zscore(df[column]))
  keep = np.abs(z_scores) < threshold
  return df[keep].reset_index(drop=True)

def impute_outliers(df, column, threshold=3):
  """Clamp the outliers of ``df[column]`` in place.

  Values whose z-score is above ``threshold`` are replaced with the largest
  non-outlier value; values whose z-score is below ``-threshold`` are replaced
  with the smallest non-outlier value. A non-outlier is a value whose absolute
  z-score is strictly below ``threshold``.

  Args:
    df: DataFrame to modify. The change is made directly on this object.
    column: Name of the numeric column to clamp.
    threshold: Z-score cutoff. Defaults to 3, the value that used to be
      hard-coded here.

  Returns:
    None. ``df`` is mutated.
  """
  # Compute the z-scores once, from the untouched column. Recomputing them
  # after the high outliers were clamped would shift the mean and standard
  # deviation, so values that were never outliers could be flagged as low
  # outliers and overwritten.
  z_scores = np.asarray(stats.zscore(df[column]))
  inliers = df.loc[np.abs(z_scores) < threshold, column]
  too_high = z_scores > threshold
  too_low = z_scores < -threshold

  # Only assign when there is something to replace.
  if too_high.any():
    df.loc[too_high, column] = inliers.max()
  if too_low.any():
    df.loc[too_low, column] = inliers.min()

def plot_data(df, column, reverse):
  """Plot ``df`` coloured by ``column`` and save the figure as a PNG.

  Args:
    df: Frame whose ``plot`` method accepts the ``column``, ``cmap`` and
      ``legend`` keywords and returns matplotlib axes.
    column: Name of the column that drives the colouring. It is also used
      to build the output file name.
    reverse: If true, use the reversed "jet" colormap ("jet_r"); otherwise
      use "jet".

  The image is written to 'Ohio Schools: <column>.png' at 300 dpi. The file
  name depends only on ``column``, so plotting the same column again
  overwrites the earlier image.
  """
  # Pass the colormap by name. mpl.cm.get_cmap was deprecated in Matplotlib 3.7
  # and removed in 3.9; the plotting call resolves colormap names itself.
  cmap_name = "jet_r" if reverse else "jet"
  ax = df.plot(column=column, cmap=cmap_name, legend=True)
  # Save the figure that owns these axes rather than pyplot's current figure.
  ax.figure.savefig('Ohio Schools: ' + column + '.png', dpi=300)

# Columns that get mapped, in plotting order.
graphable_columns = [
    # raw student counts
    'Students Disability',
    'Students EL',
    'Students EconDis',
    'Students White',
    'Students Black',
    'Students Latino',
    'Students Asian',
    'Students NativeAm',
    'Students NatHaw',
    'Students Multi',
    'Total Students',
    # other per-district measures taken from the dataset
    'PI',
    'Property Tax per Student 2018',
    'Property Tax per Student Mean',
    # derived shares of the student total
    'Percent Black',
    'Percent White',
    'Percent Disability',
    'Percent Latino',
    'Percent Asian',
    'Percent NativeAm',
    'Percent NatHaw',
    'Percent Multi',
    'Percent EconDis',
    'Percent EL',
    'Percent NonWhite',
]

# Plot All Columns With Outliers

In [None]:
# Map every graphable column with the regular colormap, leaving the values
# of `data` untouched.
for column in graphable_columns:
  plot_data(data, column, reverse=False)

# Plot All Columns Without Outliers

In [None]:
# Clamp each column's outliers, then map it with the regular colormap.
# NOTE: impute_outliers rewrites `data` in place, so the clamped values stay
# in `data` for later cells, and re-running this cell imputes the
# already-imputed values again.
for column in graphable_columns:
  impute_outliers(data, column)
  plot_data(data, column, reverse=False)

# Plot All Columns Without Outliers, Reversed

In [None]:
# `data` was already imputed in place by the previous cell's loop. Imputing it
# a second time would recompute the z-scores on the clamped values and clamp
# further, so the reversed maps would no longer show the same data as the
# non-reversed ones. Only the colormap changes here.
for column in graphable_columns:
  plot_data(data, column, True)