In [1]:
import re

import numpy as np
import pandas as pd

import folium
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

In [2]:
df = pd.read_csv('../../../input/2016 School Explorer.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1272 entries, 0 to 1271
Columns: 161 entries, Adjusted Grade to Grade 8 Math 4s - Economically Disadvantaged
dtypes: float64(5), int64(123), object(33)
memory usage: 1.6+ MB


In [4]:
df.describe()

Unnamed: 0,SED Code,District,Latitude,Longitude,Zip,Economic Need Index,Average ELA Proficiency,Average Math Proficiency,Grade 3 ELA - All Students Tested,Grade 3 ELA 4s - All Students,...,Grade 8 Math - All Students Tested,Grade 8 Math 4s - All Students,Grade 8 Math 4s - American Indian or Alaska Native,Grade 8 Math 4s - Black or African American,Grade 8 Math 4s - Hispanic or Latino,Grade 8 Math 4s - Asian or Pacific Islander,Grade 8 Math 4s - White,Grade 8 Math 4s - Multiracial,Grade 8 Math 4s - Limited English Proficient,Grade 8 Math 4s - Economically Disadvantaged
count,1272.0,1272.0,1272.0,1272.0,1272.0,1247.0,1217.0,1217.0,1272.0,1272.0,...,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0,1272.0
mean,328669800000.0,16.13522,40.734537,-73.918347,10815.720912,0.672281,2.534215,2.668956,60.569182,4.95283,...,43.841195,4.91195,0.003145,0.610063,0.947327,1.984277,0.970912,0.002358,0.159591,2.992138
std,12220220000.0,9.24527,0.086602,0.080576,529.588875,0.210959,0.363589,0.47047,57.872496,8.300568,...,82.87878,20.792371,0.068635,3.966083,4.056007,12.841333,6.880223,0.084116,1.321195,12.694124
min,307500000000.0,1.0,40.507803,-74.244025,10001.0,0.049,1.81,1.83,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,320875200000.0,9.0,40.669499,-73.957057,10452.0,0.55,2.25,2.3,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,331500400000.0,15.0,40.722995,-73.920571,11203.0,0.731,2.45,2.58,54.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,342400000000.0,24.0,40.815632,-73.879264,11232.0,0.841,2.76,2.98,94.0,7.0,...,59.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,353100900000.0,32.0,40.903455,-73.70892,11694.0,0.957,3.93,4.2,356.0,55.0,...,652.0,312.0,2.0,107.0,71.0,246.0,126.0,3.0,33.0,196.0


In [5]:
df_registration = pd.read_csv('../../../input/2017-2018 SHSAT Admissions Test Offers By Sending School.csv')

In [6]:
df_merged = df.merge(df_registration[['School DBN', 
                                      'Borough', 
                                      'School Category', 
                                      'Number of students who took test']], 
                     left_on='Location Code', 
                     right_on='School DBN', 
                     how='inner')

In [7]:
df_merged['IsCommunitySchool'] = (df_merged['Community School?'] == 'Yes').astype('float')

percent_pattern = r'Percent\s.*'
percent_regex = re.compile(percent_pattern, re.IGNORECASE)

for col in df_merged.columns.values:
    if percent_regex.search(col):
        print col
        df_merged[col] = df_merged[col].astype(np.object).str.replace('%', '').astype(float)
df_merged['School Income Estimate'] = df_merged['School Income Estimate'].astype(np.object).str.replace('$', '').str.replace(',', '').str.replace('.', '').astype(float)
df_merged['Student Attendance Rate'] = df_merged['Student Attendance Rate'].astype(np.object).str.replace('%', '').astype(float)
df_merged['Percent of Students Chronically Absent'] = df_merged['Percent of Students Chronically Absent'].astype(np.object).str.replace('%', '').astype(float)

df_merged['Grade High'] = df_merged['Grade High'].astype(np.object_)
df_merged['Grade High'] = df_merged['Grade High'].str.replace('0K', '00')
df_merged['Grade High'] = df_merged['Grade High'].str.replace('PK', '00')

SPI_columns = ['Rigorous Instruction %', 'Collaborative Teachers %', 'Supportive Environment %', 
               'Effective School Leadership %', 'Strong Family-Community Ties %', 'Trust %']
for col in SPI_columns:
    df_merged[col] = df_merged[col].astype(np.object).str.replace('%', '').astype(float)

Percent ELL
Percent Asian
Percent Black
Percent Hispanic
Percent Black / Hispanic
Percent White
Percent of Students Chronically Absent


In [8]:
drop_columns = ['SED Code', 'Location Code', 'Community School?', 'School Name', 'Grades', 'Grade Low', 
                'School DBN', 'Zip', 'Address (Full)', 'School Category']
df_merged = df_merged.drop(columns=drop_columns)

In [9]:
label_encode_columns = ['City', 'District', 'Borough']

label_encoders = []
for col in label_encode_columns:
    label_encoder = LabelEncoder()
    label_encoder.fit(df_merged[col])
    df_merged[col] = label_encoder.transform(df_merged[col].values)
    
    label_encoders.append(label_encoder)

In [10]:
nan_percentage = df_merged.isnull().mean().sort_values(ascending=False)*100
columns_to_keep = list(nan_percentage[nan_percentage <= 0].index.values)
columns_to_keep.append('Number of students who took test')

df_merged = df_merged[columns_to_keep]

In [11]:
df_merged = df_merged.astype('float')
df_merged = df_merged.dropna()

In [12]:
feature_list = list(columns_to_keep)
feature_list.remove('Number of students who took test')

In [13]:
X_train = df_merged[feature_list]
y_train = df_merged['Number of students who took test']

y_train = (y_train > y_train.median()).astype('float')

In [14]:
model = LogisticRegressionCV(cv=5, n_jobs=-1, tol=1e-10, max_iter=1**10)
model.fit(X_train, y_train)

LogisticRegressionCV(Cs=10, class_weight=None, cv=5, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=1,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=1e-10, verbose=0)

In [15]:
preds = model.predict(X_train)
print classification_report(y_train, preds)

             precision    recall  f1-score   support

        0.0       0.76      0.92      0.83       275
        1.0       0.89      0.69      0.78       261

avg / total       0.82      0.81      0.80       536



In [16]:
temp = df_merged[preds==1]
temp['Longitude'].describe()

count    202.000000
mean     -73.924089
std        0.093608
min      -74.243221
25%      -73.978458
50%      -73.925924
75%      -73.863707
max      -73.713022
Name: Longitude, dtype: float64

In [17]:
feature_importance = model.coef_[0]

feature_importance_min = feature_importance.min()
feature_importance_max = feature_importance.max()
feature_importance = (feature_importance - feature_importance_min) / (feature_importance_max - feature_importance_min)

In [18]:
feature_importance_dict = dict(zip(feature_list, feature_importance))
feature_importance_dict = sorted(feature_importance_dict.iteritems(), key=lambda (k, v): (v, k), reverse=True)
feature_importance_dict

[('Longitude', 1.0),
 ('Grade 6 ELA - All Students Tested', 0.8913836438233415),
 ('Grade 6 Math 4s - All Students', 0.8911186886585374),
 ('Grade 6 Math - All Students Tested', 0.8878174663349611),
 ('Grade 7 Math 4s - All Students', 0.8005617659344898),
 ('Grade 6 ELA 4s - All Students', 0.7996703725060211),
 ('Grade 6 Math 4s - Economically Disadvantaged', 0.7713901734909349),
 ('Grade 8 ELA 4s - All Students', 0.7581365814352017),
 ('Grade 7 ELA 4s - All Students', 0.7560733233122792),
 ('Grade 7 Math - All Students Tested', 0.7394261880445615),
 ('Grade 7 ELA - All Students Tested', 0.7299442974250293),
 ('Grade 5 Math - All Students Tested', 0.725397443185781),
 ('Grade 4 Math 4s - All Students', 0.7247703798838062),
 ('Grade 5 ELA - All Students Tested', 0.7219534711051477),
 ('Grade 7 Math 4s - Economically Disadvantaged', 0.7175214116982919),
 ('Grade 6 Math 4s - Asian or Pacific Islander', 0.7123035669532564),
 ('Grade 6 ELA 4s - Economically Disadvantaged', 0.712157516515436