# Import modules

In [213]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import pickle
import re
from statsmodels.formula.api import ols
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

---

# Schools Data

https://infohub.nyced.org/reports/school-quality/school-quality-reports-and-resources

### Read in data

In [214]:
# Load the School Quality Report workbook; the sheet's second column
# (position 1) is used as the row index.
df = pd.read_excel(io='201819_ems_sqr_results.xlsx', index_col=1)

### Remove rows to set headers 

In [215]:
# The first row of the frame is promoted to the column labels several times in
# a row (presumably because the workbook has banner rows above the real
# header -- TODO confirm against the spreadsheet). This used to be the same
# two statements copy-pasted into three cells; the repeat count is now an
# explicit, named value.
HEADER_PROMOTIONS = 3

for promotion in range(HEADER_PROMOTIONS):
    # Use the current first row as the column labels ...
    headers = df.iloc[0]
    # ... and rebuild the frame from the remaining rows. Rebuilding from
    # ``df.values`` also discards the previous row index.
    df = pd.DataFrame(df.values[1:], columns=headers)

# One more leading row remains after the last promotion; drop it by label.
df = df.drop(df.index[0])

### Reset index and drop old index

In [219]:
# Renumber the rows from 0; the previous labels are kept as an 'index' column.
df = df.reset_index(drop=False)

### Drop NaN columns

In [220]:
# Discard every column that contains nothing but missing values.
df = df.dropna(axis='columns', how='all')

### Drop old index column

In [221]:
# Remove the 'index' column left behind by the earlier reset_index() call.
df = df.drop(columns=['index'])

### Function to check NaN values

In [222]:
def nan_values(col_name, frame=None):
    """Print, per column, how many values are missing and how many are present.

    Parameters
    ----------
    col_name : iterable
        Column labels to report on.
    frame : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used, so
        existing calls such as ``nan_values(list(df.columns))`` are unchanged.

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    # Resolve the default at call time so the current ``df`` is always used.
    target = df if frame is None else frame
    for col in col_name:
        # Column labels promoted from spreadsheet rows are not guaranteed to be
        # strings; str() avoids a TypeError from ``str + non-str``.
        print('column name: ' + str(col))
        print(target[col].isna().value_counts())

In [223]:
# Report missing-value counts for every column currently in the frame.
nan_values([*df.columns])

column name: DBN
False    1293
Name: DBN, dtype: int64
column name: School Name
False    1293
Name: School Name, dtype: int64
column name: School Type
False    1293
Name: School Type, dtype: int64
column name: Enrollment
False    1293
Name: Enrollment, dtype: int64
column name: Rigorous Instruction Rating
False    1243
True       50
Name: Rigorous Instruction Rating, dtype: int64
column name: Collaborative Teachers Rating
False    1243
True       50
Name: Collaborative Teachers Rating, dtype: int64
column name: Supportive Environment Rating
False    1225
True       68
Name: Supportive Environment Rating, dtype: int64
column name: Effective School Leadership Rating
False    1243
True       50
Name: Effective School Leadership Rating, dtype: int64
column name: Strong Family-Community Ties Rating
False    1238
True       55
Name: Strong Family-Community Ties Rating, dtype: int64
column name: Trust Rating
False    1239
True       54
Name: Trust Rating, dtype: int64
column name: Student Ach

### The "." values in the student attendance rate column appear to belong to charter schools, which can choose whether or not to report this information.

In [224]:
# Count true NaN entries in the attendance column. "." placeholder strings are
# not NaN, so they are not counted as missing here.
attendance_rate = df['Student Attendance Rate']
attendance_rate.isna().value_counts()

False    1293
Name: Student Attendance Rate, dtype: int64

### Drop values with "."

In [225]:
# Keep only rows whose attendance rate is not the "." placeholder.
has_attendance = df['Student Attendance Rate'] != '.'
df = df.loc[has_attendance]

In [226]:
# Keep only rows whose teacher-experience value is not the "." placeholder.
has_teacher_experience = df['Percent of teachers with 3 or more years of experience'] != '.'
df = df.loc[has_teacher_experience]

In [227]:
# Row/column count after removing the rows that held "." placeholders.
df.shape

(1108, 45)

### Drop Rows Containing Charter Schools

In [228]:
# Remove the final 143 rows by label. Per the section heading these are
# presumably the charter schools, which relies on them being listed last in
# the frame -- TODO confirm rather than trusting the position.
trailing_row_count = 143
trailing_labels = df.tail(trailing_row_count).index
df = df.drop(index=trailing_labels)

### Inspect the data after dropping charter schools (the CSV export happens at the end of the notebook)

In [229]:
# Display the frame as it stands after the trailing rows were dropped.
df

Unnamed: 0,DBN,School Name,School Type,Enrollment,Rigorous Instruction Rating,Collaborative Teachers Rating,Supportive Environment Rating,Effective School Leadership Rating,Strong Family-Community Ties Rating,Trust Rating,...,Percent HRA Eligible,Percent Asian,Percent Black,Percent Hispanic,Percent White,Years of principal experience at this school,Percent of teachers with 3 or more years of experience,Student Attendance Rate,Percent of Students Chronically Absent,Teacher Attendance Rate
0,01M015,P.S. 015 Roberto Clemente,Elementary,161,Exceeding Target,Exceeding Target,Exceeding Target,Exceeding Target,Exceeding Target,Meeting Target,...,0.77,0.124,0.28,0.553,0.037,8.9,0.64,0.928,0.227,0.974
1,01M019,P.S. 019 Asher Levy,Elementary,239,Exceeding Target,Exceeding Target,Meeting Target,Exceeding Target,Exceeding Target,Meeting Target,...,0.548,0.059,0.218,0.623,0.067,9.9,0.75,0.909,0.343,0.966
2,01M020,P.S. 020 Anna Silver,Elementary,439,Not Meeting Target,Approaching Target,Approaching Target,Approaching Target,Meeting Target,Meeting Target,...,0.647,0.289,0.13,0.506,0.036,2.9,0.667,0.925,0.296,0.966
3,01M034,P.S. 034 Franklin D. Roosevelt,K-8,288,Approaching Target,Approaching Target,Approaching Target,Approaching Target,Meeting Target,Approaching Target,...,0.875,0.028,0.33,0.601,0.028,2.7,0.545,0.885,0.455,0.968
4,01M063,The STAR Academy - P.S.63,Elementary,207,Exceeding Target,Exceeding Target,Meeting Target,Meeting Target,Exceeding Target,Meeting Target,...,0.623,0.019,0.213,0.652,0.082,11.9,0.591,0.908,0.347,0.967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,29Q038,P.S. 038 Rosedale,Elementary,283,Meeting Target,Approaching Target,Meeting Target,Meeting Target,Exceeding Target,Exceeding Target,...,0.466,0.025,0.869,0.078,0.021,4.9,0.905,0.93,0.234,0.967
961,29Q052,P.S. 052 Queens,Elementary,380,Approaching Target,Approaching Target,Meeting Target,Meeting Target,Meeting Target,Meeting Target,...,0.726,0.05,0.763,0.145,0.016,0.9,0.882,0.903,0.374,0.959
962,29Q059,I.S. 059 Springfield Gardens,Middle,599,Meeting Target,Meeting Target,Approaching Target,Meeting Target,Exceeding Target,Meeting Target,...,0.543,0.02,0.868,0.058,0.01,1,0.976,0.903,0.306,0.98
963,29Q095,P.S. 095 Eastwood,Elementary,1441,Exceeding Target,Exceeding Target,Meeting Target,Exceeding Target,Exceeding Target,Exceeding Target,...,0.668,0.429,0.099,0.397,0.014,7.7,0.921,0.901,0.362,0.971


In [230]:
# Row/column count after the trailing rows were dropped.
df.shape

(965, 45)

### Remove columns

In [231]:
# Columns removed from the frame, grouped by kind for readability.
incoming_and_principal_columns = [
    'Average Incoming ELA Proficiency (Based on 5th Grade)',
    'Average Incoming Math Proficiency (Based on 5th Grade)',
    'Years of principal experience at this school',
]

quality_review_columns = [
    'Quality Review - How interesting and challenging is the curriculum',
    'Quality Review - How effective is the teaching and learning',
    'Quality Review - How well does the school assess what students are learning',
    'Quality Review - How clearly are high expectations communicated to students and staff',
    'Quality Review - How well do teachers work with each other',
    'Quality Review - How safe and inclusive is the school while supporting social-emotional growth',
    'Quality Review - How well does the school allocate and manage resources',
    'Quality Review - How well does the school identify, track, and meet its goals',
    'Quality Review - How thoughtful is the school s approach to teacher development and evaluation',
    'Quality Review - How well are school decisions evaluated and adjusted',
    'Quality Review - Dates of Review',
]

# Categorical rating labels (e.g. "Meeting Target").
rating_columns = [
    'Student Achievement Rating',
    'Trust Rating',
    'Strong Family-Community Ties Rating',
    'Effective School Leadership Rating',
    'Supportive Environment Rating',
    'Collaborative Teachers Rating',
    'Rigorous Instruction Rating',
]

columns_to_drop = incoming_and_principal_columns + quality_review_columns + rating_columns
df = df.drop(columns=columns_to_drop)

In [232]:
# Show the row at position 50 as a Series to inspect the remaining columns.
df.iloc[50]

0
DBN                                                                               02M217
School Name                                               P.S./I.S. 217 Roosevelt Island
School Type                                                                          K-8
Enrollment                                                                           586
Rigorous Instruction - Percent Positive                                             0.86
Collaborative Teachers - Percent Positive                                           0.91
Supportive Environment - Percent Positive                                           0.74
Effective School Leadership - Percent Positive                                      0.89
Strong Family-Community Ties - Percent Positive                                     0.96
Trust - Percent Positive                                                             0.9
Percent English Language Learners                                                  0.148
Percent Students wi

In [236]:
# Mapping from the report's original column labels to short snake_case names.
column_renames = {
    'DBN': 'dbn',
    'School Name': 'school_name',
    'School Type': 'school_type',
    'Enrollment': 'enrollment',
    'Rigorous Instruction - Percent Positive': 'rigorous_instruction',
    'Collaborative Teachers - Percent Positive': 'collab_teachers',
    'Supportive Environment - Percent Positive': 'support_environ',
    'Effective School Leadership - Percent Positive': 'effective_school_leadership',
    'Strong Family-Community Ties - Percent Positive': 'strong_fam_community_ties',
    'Trust - Percent Positive': 'trust',
    'Percent English Language Learners': 'english_language_learners',
    'Percent Students with Disabilities': 'students_disabilities',
    'Percent Self-Contained': 'self_contained',
    'Economic Need Index': 'economic_need_index',
    'Percent in Temp Housing': 'temp_housing',
    'Percent HRA Eligible': 'hra_eligible',
    'Percent Asian': 'asian',
    'Percent Black': 'black',
    'Percent Hispanic': 'hispanic',
    'Percent White': 'white',
    'Percent of teachers with 3 or more years of experience': 'teachers_3_or_more_yrs_exp',
    'Student Attendance Rate': 'student_attendance_rate',
    'Percent of Students Chronically Absent': 'chronically_absent',
    'Teacher Attendance Rate': 'teacher_attendance_rate',
}

# Labels that are not keys of the mapping are left unchanged by rename().
df = df.rename(columns=column_renames)

In [237]:
# Display the frame to check the renamed columns.
df

Unnamed: 0,dbn,school_name,school_type,enrollment,rigorous_instruction,collab_teachers,support_environ,effective_school_leadership,strong_fam_community_ties,trust,...,temp_housing,hra_eligible,asian,black,hispanic,white,teachers_3_or_more_yrs_exp,student_attendance_rate,chronically_absent,teacher_attendance_rate
0,01M015,P.S. 015 Roberto Clemente,Elementary,161,0.85,0.9,0.83,0.96,0.97,0.94,...,0.398,0.77,0.124,0.28,0.553,0.037,0.64,0.928,0.227,0.974
1,01M019,P.S. 019 Asher Levy,Elementary,239,0.87,0.92,0.85,0.9,0.98,0.91,...,0.155,0.548,0.059,0.218,0.623,0.067,0.75,0.909,0.343,0.966
2,01M020,P.S. 020 Anna Silver,Elementary,439,0.7,0.77,0.74,0.8,0.95,0.86,...,0.164,0.647,0.289,0.13,0.506,0.036,0.667,0.925,0.296,0.966
3,01M034,P.S. 034 Franklin D. Roosevelt,K-8,288,0.74,0.66,0.64,0.66,0.94,0.76,...,0.295,0.875,0.028,0.33,0.601,0.028,0.545,0.885,0.455,0.968
4,01M063,The STAR Academy - P.S.63,Elementary,207,0.8,0.86,0.87,0.9,0.96,0.92,...,0.261,0.623,0.019,0.213,0.652,0.082,0.591,0.908,0.347,0.967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,29Q038,P.S. 038 Rosedale,Elementary,283,0.9,0.91,0.86,0.95,0.98,0.96,...,0.067,0.466,0.025,0.869,0.078,0.021,0.905,0.93,0.234,0.967
961,29Q052,P.S. 052 Queens,Elementary,380,0.73,0.87,0.78,0.88,0.95,0.93,...,0.311,0.726,0.05,0.763,0.145,0.016,0.882,0.903,0.374,0.959
962,29Q059,I.S. 059 Springfield Gardens,Middle,599,0.81,0.86,0.65,0.91,0.94,0.86,...,0.095,0.543,0.02,0.868,0.058,0.01,0.976,0.903,0.306,0.98
963,29Q095,P.S. 095 Eastwood,Elementary,1441,0.86,0.93,0.88,0.95,0.99,0.96,...,0.13,0.668,0.429,0.099,0.397,0.014,0.921,0.901,0.362,0.971


In [238]:
# List every column label currently in the frame.
[column for column in df.columns]

['dbn',
 'school_name',
 'school_type',
 'enrollment',
 'rigorous_instruction',
 'collab_teachers',
 'support_environ',
 'effective_school_leadership',
 'strong_fam_community_ties',
 'trust',
 'english_language_learners',
 'students_disabilities',
 'self_contained',
 'economic_need_index',
 'temp_housing',
 'hra_eligible',
 'asian',
 'black',
 'hispanic',
 'white',
 'teachers_3_or_more_yrs_exp',
 'student_attendance_rate',
 'chronically_absent',
 'teacher_attendance_rate']

In [239]:
# Write the cleaned frame to disk without the row index.
df.to_csv(path_or_buf='sqr.csv', index=False)