In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import math

In [2]:
numbers = pd.read_csv('/Users/flatironschool/Absenteeism_Project/data/processed/combo_cleaned.csv')


In [3]:
numbers.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,STNAM,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,LEA_STATE,LEA_STATE_NAME,...,sports_part,sports_rate,total_suspension_days,suspensed_day_rate,harassed,harassed_rate,activities_funds_rate,non_cert_rate,counselor_rate,absent_teacher_rate
21862,21862,21862,WYOMING,Washakie County School District #2,560582000393,Ten Sleep K-12,10,GE50,WY,WYOMING,...,31.0,0.281818,0.0,0.0,0.0,0.0,4407.512636,0.0,0.009091,0.0
21863,21863,21863,WYOMING,Teton County School District #1,560583000335,Jackson Hole High School,127,GE95,WY,WYOMING,...,355.0,0.554688,26.0,0.040625,0.0,0.0,4153.518984,0.0,0.004687,0.118939
21864,21864,21864,WYOMING,Teton County School District #1,560583000512,Summit High School,18,GE80,WY,WYOMING,...,,,10.0,0.185185,0.0,0.0,5469.872593,0.0,0.0,0.091491
21865,21865,21865,WYOMING,Weston County School District #7,560609000401,Upton High School,21,GE80,WY,WYOMING,...,85.0,0.85,0.0,0.0,0.0,0.0,6875.37,0.0,0.0,0.162162
21866,21866,21866,WYOMING,Washakie County School District #1,560624000343,Worland High School,105,75-79,WY,WYOMING,...,,,10.0,0.025,0.0,0.0,3015.755325,0.0,0.004275,0.331544


## Clean up graduation rates and add grad rate bins

In [4]:
#need to keep original reported grade and need column to modify and clean data
numbers['grad_slice'] = numbers['ALL_RATE_1516']

In [5]:
#remove "GE" and "LE" from ranges
numbers['grad_slice'].replace(['GE99'], '+100', inplace=True) # need to replace with 
numbers['grad_slice'].replace(['GE95'], '95', inplace=True)
numbers['grad_slice'].replace(['GE90'], '90', inplace=True)
numbers['grad_slice'].replace(['LE10'], '10', inplace=True)
numbers['grad_slice'].replace(['LE1'], '1', inplace=True)
numbers['grad_slice'].replace(['LE5'], '05', inplace=True)


In [6]:
#smallest range needs to be dealt with, has one digit before '-'
numbers['grad_slice'].replace(['6-9'], '6', inplace=True)

In [7]:
#take first two digits of rates to get rid of ranges
numbers['grad_slice'] = numbers['grad_slice'].str[:2]

In [8]:
#fix 100
numbers['grad_slice'].replace(['+1'], '100', inplace=True)

In [9]:
numbers.reset_index(inplace=True)

In [10]:
#get rid of very small schools
grad_num = numbers[numbers['ALL_COHORT_1516'] >= 31]

In [11]:
#create the binned categories
grad_num['grad_rate_bin'] = pd.cut(grad_num['grad_slice'].astype(int), [0, 59, 79, 89, 99, 100],
      labels = ['0-59%', '60-79%', '80-89%', '90-99%', '100%'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
grad_num.tail()

Unnamed: 0.2,index,Unnamed: 0,Unnamed: 0.1,STNAM,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,LEA_STATE,...,total_suspension_days,suspensed_day_rate,harassed,harassed_rate,activities_funds_rate,non_cert_rate,counselor_rate,absent_teacher_rate,grad_slice,grad_rate_bin
21854,21854,21854,21854,WYOMING,Sheridan County School District #1,560569000311,Big Horn High School,37,GE90,WY,...,2.0,0.013245,4.0,0.02649,1738.913907,0.0,0.006623,0.846154,90,90-99%
21858,21858,21858,21858,WYOMING,Sheridan County School District #2,560569500360,Sheridan High School,236,89,WY,...,46.0,0.046796,0.0,0.0,1694.559176,0.0,0.005086,0.198779,89,80-89%
21861,21861,21861,21861,WYOMING,Sweetwater County School District #2,560576200324,Green River High School,176,85-89,WY,...,110.0,0.14157,1.0,0.001287,1840.87749,0.0,0.003861,0.2,85,80-89%
21863,21863,21863,21863,WYOMING,Teton County School District #1,560583000335,Jackson Hole High School,127,GE95,WY,...,26.0,0.040625,0.0,0.0,4153.518984,0.0,0.004687,0.118939,95,90-99%
21866,21866,21866,21866,WYOMING,Washakie County School District #1,560624000343,Worland High School,105,75-79,WY,...,10.0,0.025,0.0,0.0,3015.755325,0.0,0.004275,0.331544,75,60-79%


In [13]:
grad_num['grad_rate_bin'].value_counts()

90-99%    8484
80-89%    4257
60-79%    1993
0-59%     1611
100%       228
Name: grad_rate_bin, dtype: int64

## Create Level Up Bins 

In [14]:
#level up bins
#create the binned categories
grad_num['level_up_bins'] = pd.cut(grad_num['grad_slice'].astype(int), 
    [0, 59, 79, 89, 99, 100], labels = ['60-79% Level Up Rate', '80-89% Level Up Rate', '90-99% Level Up Rate', '100% Level Up Rate', '100% Top Rate'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [15]:
grad_num.head()

Unnamed: 0.2,index,Unnamed: 0,Unnamed: 0.1,STNAM,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,LEA_STATE,...,suspensed_day_rate,harassed,harassed_rate,activities_funds_rate,non_cert_rate,counselor_rate,absent_teacher_rate,grad_slice,grad_rate_bin,level_up_bins
0,0,0,0,ALABAMA,Albertville City,10000500871,Albertville High Sch,296,92,AL,...,0.060232,0.0,0.0,2811.937359,0.0,0.003475,0.378788,92,90-99%,100% Level Up Rate
1,1,1,1,ALABAMA,Marshall County,10000600872,Asbury Sch,67,GE95,AL,...,0.018553,2.0,0.003711,4825.189777,0.0,0.002783,0.1,95,90-99%,100% Level Up Rate
2,2,2,2,ALABAMA,Marshall County,10000600878,Douglas High Sch,153,85-89,AL,...,0.030303,5.0,0.008418,5317.932795,0.0,0.001684,0.105263,85,80-89%,90-99% Level Up Rate
3,3,3,3,ALABAMA,Marshall County,10000600883,Kate D Smith DAR High Sch,120,80-84,AL,...,0.021786,0.0,0.0,5909.375686,0.0,0.002179,0.068966,80,80-89%,90-99% Level Up Rate
4,4,4,4,ALABAMA,Marshall County,10000601585,Brindlee Mt High Sch,94,85-89,AL,...,0.022039,0.0,0.0,3962.305785,0.0,0.002755,0.10101,85,80-89%,90-99% Level Up Rate


## Calculate Quantiles and Add to Data Frame

In [16]:
quantile_df_25 = grad_num.groupby('grad_rate_bin')['non_cert_rate', 'sports_rate', 'chronic_absent_rate', 'suspensed_day_rate'].quantile(.25).reset_index() 

In [17]:
quantile_df_75 = grad_num.groupby('grad_rate_bin')['non_cert_rate', 'sports_rate', 'chronic_absent_rate', 'suspensed_day_rate'].quantile(.75).reset_index() 

In [18]:
quantile_df_25.head()

0.25,grad_rate_bin,non_cert_rate,sports_rate,chronic_absent_rate,suspensed_day_rate
0,0-59%,0.0,0.124414,0.085595,0.0
1,60-79%,0.0,0.220779,0.16008,0.060664
2,80-89%,0.0,0.275554,0.126976,0.052518
3,90-99%,0.0,0.319483,0.084154,0.016064
4,100%,0.0,0.364777,0.059735,0.008866


In [19]:
quantile_df_75.head()

0.75,grad_rate_bin,non_cert_rate,sports_rate,chronic_absent_rate,suspensed_day_rate
0,0-59%,0.04202,0.406508,0.790489,0.503487
1,60-79%,0.052632,0.45195,0.4375,0.672138
2,80-89%,0.023318,0.536686,0.302113,0.351397
3,90-99%,0.0,0.649428,0.223554,0.148475
4,100%,0.0125,0.651462,0.143481,0.061694


In [21]:
grad_num = grad_num.merge(quantile_df_25, on='grad_rate_bin', suffixes=('_x', '_25th') )

In [24]:
grad_num = grad_num.merge(quantile_df_75, on='grad_rate_bin', suffixes=('_x', '_75th') )

In [25]:
grad_num.head()

Unnamed: 0.2,index,Unnamed: 0,Unnamed: 0.1,STNAM,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,LEA_STATE,...,chronic_absent_rate_25th,suspensed_day_rate_25th,non_cert_rate_x,sports_rate_x,chronic_absent_rate_x,suspensed_day_rate_x,non_cert_rate_75th,sports_rate_75th,chronic_absent_rate_75th,suspensed_day_rate_75th
0,0,0,0,ALABAMA,Albertville City,10000500871,Albertville High Sch,296,92,AL,...,0.084154,0.016064,0.0,0.649428,0.223554,0.148475,0.0,0.649428,0.223554,0.148475
1,1,1,1,ALABAMA,Marshall County,10000600872,Asbury Sch,67,GE95,AL,...,0.084154,0.016064,0.0,0.649428,0.223554,0.148475,0.0,0.649428,0.223554,0.148475
2,5,5,5,ALABAMA,Hoover City,10000700251,Hoover High Sch,714,92,AL,...,0.084154,0.016064,0.0,0.649428,0.223554,0.148475,0.0,0.649428,0.223554,0.148475
3,7,7,7,ALABAMA,Hoover City,10000701456,Spain Park High Sch,412,94,AL,...,0.084154,0.016064,0.0,0.649428,0.223554,0.148475,0.0,0.649428,0.223554,0.148475
4,8,8,8,ALABAMA,Madison City,10000800831,Bob Jones High Sch,451,97,AL,...,0.084154,0.016064,0.0,0.649428,0.223554,0.148475,0.0,0.649428,0.223554,0.148475


## Calculate Middle 50% Range for App

In [36]:
grad_num['Chronic_Absenteeism_25%'] = round(grad_num['total_enrollment'] * grad_num['chronic_absent_rate_25th'],0)
grad_num['chronic_absent_rate_75th'] = round(grad_num['total_enrollment'] * grad_num['chronic_absent_rate_75th'],0)
grad_num['sports_rate_25th'] = round(grad_num['total_enrollment'] * grad_num['sports_rate_25th'],0)
grad_num['sports_rate_75th'] = round(grad_num['total_enrollment'] * grad_num['sports_rate_75th'],0)
grad_num['suspensed_day_rate_25th'] = round(grad_num['total_enrollment'] * grad_num['suspensed_day_rate_25th'],0)
grad_num['suspensed_day_rate_75th'] = round(grad_num['total_enrollment'] * grad_num['suspensed_day_rate_75th'],0)
grad_num['non_cert_rate_25th'] = round(grad_num['total_enrollment'] * grad_num['non_cert_rate_25th'],0)
grad_num['non_cert_rate_75th'] = round(grad_num['total_enrollment'] * grad_num['non_cert_rate_75th'],0)


In [37]:
grad_num.head()

Unnamed: 0.2,index,Unnamed: 0,Unnamed: 0.1,STNAM,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,LEA_STATE,...,suspensed_day_rate_25th,non_cert_rate_x,sports_rate_x,chronic_absent_rate_x,suspensed_day_rate_x,non_cert_rate_75th,sports_rate_75th,chronic_absent_rate_75th,suspensed_day_rate_75th,Chronic_Absenteeism_25%
0,0,0,0,ALABAMA,Albertville City,10000500871,Albertville High Sch,296,92,AL,...,21.0,0.0,0.649428,0.223554,0.148475,0.0,841.0,290.0,192.0,109.0
1,1,1,1,ALABAMA,Marshall County,10000600872,Asbury Sch,67,GE95,AL,...,9.0,0.0,0.649428,0.223554,0.148475,0.0,350.0,120.0,80.0,45.0
2,5,5,5,ALABAMA,Hoover City,10000700251,Hoover High Sch,714,92,AL,...,48.0,0.0,0.649428,0.223554,0.148475,0.0,1927.0,663.0,441.0,250.0
3,7,7,7,ALABAMA,Hoover City,10000701456,Spain Park High Sch,412,94,AL,...,27.0,0.0,0.649428,0.223554,0.148475,0.0,1089.0,375.0,249.0,141.0
4,8,8,8,ALABAMA,Madison City,10000800831,Bob Jones High Sch,451,97,AL,...,28.0,0.0,0.649428,0.223554,0.148475,0.0,1140.0,393.0,261.0,148.0


## Clean up final data frame and save to csv

In [44]:
columns = grad_num.columns
for col in columns:
    print(col)

index
STNAM
LEANM
NCESSCH
SCHNAM
ALL_COHORT_1516
ALL_RATE_1516
LEA_STATE
LEAID_y
LEA_NAME
SCHID
COMBOKEY
JJ
SCH_STATUS_SPED
SCH_STATUS_MAGNET
SCH_STATUS_CHARTER
SCH_STATUS_ALT
TOT_ENR_M
TOT_ENR_F
TOT_DUAL_M
TOT_DUAL_F
TOT_DAYSMISSED_M
TOT_DAYSMISSED_F
TOT_ABSENT_M
TOT_ABSENT_F
SCH_SSPART_M
SCH_SSPART_F
TOT_SSPART
SCH_FTETEACH_TOT
SCH_FTETEACH_NOTCERT
districtID
IDSCH
total_enrollment
total_chronic_absent
chronic_absent_rate_x
sports_part
sports_rate_x
total_suspension_days
suspensed_day_rate_x
non_cert_rate_x
grad_slice
grad_rate_bin
level_up_bins
non_cert_rate_25th
sports_rate_25th
chronic_absent_rate_25th
suspensed_day_rate_25th
non_cert_rate_x
sports_rate_x
chronic_absent_rate_x
suspensed_day_rate_x
non_cert_rate_75th
sports_rate_75th
chronic_absent_rate_75th
suspensed_day_rate_75th
Chronic_Absenteeism_25%


In [42]:
grad_num.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1, inplace=True)
grad_num.drop(['LEA_STATE_NAME', 'SCH_NAME', 'SCH_MAGNETDETAIL','SCH_ALTFOCUS', 'TOT_GTENR_M', 'TOT_GTENR_F'], axis=1, inplace=True)
grad_num.drop(grad_num.columns.to_series()['TOT_ALGENR_GS0910_M':'TOT_SATACT_F'], axis=1, inplace=True)
grad_num.drop(grad_num.columns.to_series()['SCH_HBALLEGATIONS_SEX':'SCH_HBALLEGATIONS_REL'], axis=1, inplace=True)
grad_num.drop(['SCH_NPE_WOFED', 'SCH_NPE_WFED', 'SCH_FTECOUNSELORS', 'SCH_FTETEACH_ABSENT'], axis=1, inplace=True)
grad_num.drop(grad_num.columns.to_series()['total_ap_ib_de':'calc_rate'], axis=1, inplace=True)
grad_num.drop(grad_num.columns.to_series()['harassed':'activities_funds_rate'], axis=1, inplace=True)
grad_num.drop(['counselor_rate', 'absent_teacher_rate'], axis=1, inplace=True)




In [43]:
grad_num.head()

Unnamed: 0,index,STNAM,LEANM,NCESSCH,SCHNAM,ALL_COHORT_1516,ALL_RATE_1516,LEA_STATE,LEAID_y,LEA_NAME,...,suspensed_day_rate_25th,non_cert_rate_x,sports_rate_x,chronic_absent_rate_x,suspensed_day_rate_x,non_cert_rate_75th,sports_rate_75th,chronic_absent_rate_75th,suspensed_day_rate_75th,Chronic_Absenteeism_25%
0,0,ALABAMA,Albertville City,10000500871,Albertville High Sch,296,92,AL,100005,Albertville City,...,21.0,0.0,0.649428,0.223554,0.148475,0.0,841.0,290.0,192.0,109.0
1,1,ALABAMA,Marshall County,10000600872,Asbury Sch,67,GE95,AL,100006,Marshall County,...,9.0,0.0,0.649428,0.223554,0.148475,0.0,350.0,120.0,80.0,45.0
2,5,ALABAMA,Hoover City,10000700251,Hoover High Sch,714,92,AL,100007,Hoover City,...,48.0,0.0,0.649428,0.223554,0.148475,0.0,1927.0,663.0,441.0,250.0
3,7,ALABAMA,Hoover City,10000701456,Spain Park High Sch,412,94,AL,100007,Hoover City,...,27.0,0.0,0.649428,0.223554,0.148475,0.0,1089.0,375.0,249.0,141.0
4,8,ALABAMA,Madison City,10000800831,Bob Jones High Sch,451,97,AL,100008,Madison City,...,28.0,0.0,0.649428,0.223554,0.148475,0.0,1140.0,393.0,261.0,148.0


In [45]:
grad_num.to_csv('grad_num.csv')