# Analyzing NYC High School Data 
In this project we will analyze how the average SAT score of various public high schools in New York City is impacted by their respective demographics. 

The database we have consists of 7 files, provided by New York City itself. A brief description of each file is given below:

- SAT scores by school - SAT scores for each high school in New York City
- School attendance - Attendance information for each school in New York City
- Class size - Information on class size for each school
- AP test results - Advanced Placement (AP) exam results for each high school (passing an optional AP exam in a particular subject can earn a student college credit in that subject)
- Graduation outcomes - The percentage of students who graduated, and other outcome information
- Demographics - Demographic information for each school
- School survey - Surveys of parents, teachers, and students at each school


# Read in the data

In [1]:
# As always, we start by importing the required libraries:
import pandas as pd
import numpy
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.basemap import Basemap
import re 
# CSV files (located under schools/) that are loaded directly into DataFrames.
data_files = [
    "ap_2010.csv",
    "class_size.csv",
    "demographics.csv",
    "graduation.csv",
    "hs_directory.csv",
    "sat_results.csv",
]

# Key each DataFrame by its file name with the ".csv" suffix removed.
data = {
    file_name.replace(".csv", ""): pd.read_csv("schools/{0}".format(file_name))
    for file_name in data_files
}

ModuleNotFoundError: No module named 'mpl_toolkits.basemap'

# Read in the surveys

In [None]:
# Both survey files are tab-delimited and encoded as windows-1252.
survey_read_options = {"delimiter": "\t", "encoding": "windows-1252"}
all_survey = pd.read_csv("schools/survey_all.txt", **survey_read_options)
d75_survey = pd.read_csv("schools/survey_d75.txt", **survey_read_options)

# Stack the two survey tables row-wise into a single frame.
survey = pd.concat([all_survey, d75_survey], axis=0)

# Expose the lowercase "dbn" column under the "DBN" name used as the merge key.
survey["DBN"] = survey["dbn"]

# Columns retained from the survey; everything else is dropped below.
survey_fields = [
    "DBN",
    "rr_s", "rr_t", "rr_p",
    "N_s", "N_t", "N_p",
    "saf_p_11", "com_p_11", "eng_p_11", "aca_p_11",
    "saf_t_11", "com_t_11", "eng_t_11", "aca_t_11",
    "saf_s_11", "com_s_11", "eng_s_11", "aca_s_11",
    "saf_tot_11", "com_tot_11", "eng_tot_11", "aca_tot_11",
]

survey = survey.loc[:, survey_fields]
data["survey"] = survey

# Add DBN columns

In [None]:
# The directory table names its id column "dbn"; copy it to "DBN" as well.
hs_directory = data["hs_directory"]
hs_directory["DBN"] = hs_directory["dbn"]

def pad_csd(num):
    """Return ``str(num)``, prefixed with "0" when it is at most one character long."""
    text = str(num)
    return text if len(text) > 1 else "0" + text
    
# Build the class-size DBN by joining the zero-padded CSD with the school code.
class_size_raw = data["class_size"]
class_size_raw["padded_csd"] = class_size_raw["CSD"].apply(pad_csd)
class_size_raw["DBN"] = class_size_raw["padded_csd"] + class_size_raw["SCHOOL CODE"]

# Convert columns to numeric

In [None]:
# Coerce the three SAT section columns to numeric; unparseable entries become NaN.
cols = [
    'SAT Math Avg. Score',
    'SAT Critical Reading Avg. Score',
    'SAT Writing Avg. Score',
]
sat_results = data["sat_results"]
for score_col in cols:
    sat_results[score_col] = pd.to_numeric(sat_results[score_col], errors="coerce")

# Total score is the plain sum of the three sections (NaN if any section is NaN).
math_col, reading_col, writing_col = cols
sat_results['sat_score'] = (
    sat_results[math_col] + sat_results[reading_col] + sat_results[writing_col]
)

def find_lat(loc):
    """Extract the latitude portion of a "(lat, lon)" pair embedded in a string.

    Args:
        loc: Text containing a parenthesised "(<lat>, <lon>)" pair on one line.

    Returns:
        The text between "(" and the first comma of the first match, as a string.

    Raises:
        IndexError: If ``loc`` contains no "(..., ...)" group.
    """
    # Raw string so that "\(" reaches the regex engine instead of being an
    # invalid string escape (which newer Python versions warn about).
    coords = re.findall(r"\(.+, .+\)", loc)
    lat = coords[0].split(",")[0].replace("(", "")
    return lat

def find_lon(loc):
    """Extract the longitude portion of a "(lat, lon)" pair embedded in a string.

    Args:
        loc: Text containing a parenthesised "(<lat>, <lon>)" pair on one line.

    Returns:
        The text after the first comma of the first match, with the closing
        parenthesis and surrounding whitespace removed, as a string.

    Raises:
        IndexError: If ``loc`` contains no "(..., ...)" group.
    """
    # Raw string so that "\(" reaches the regex engine instead of being an
    # invalid string escape (which newer Python versions warn about).
    coords = re.findall(r"\(.+, .+\)", loc)
    lon = coords[0].split(",")[1].replace(")", "").strip()
    return lon

# Extract latitude/longitude text from "Location 1", then coerce both columns
# to numeric (values that cannot be parsed become NaN).
hs_dir = data["hs_directory"]
for coord_name, extractor in (("lat", find_lat), ("lon", find_lon)):
    hs_dir[coord_name] = hs_dir["Location 1"].apply(extractor)

for coord_name in ("lat", "lon"):
    hs_dir[coord_name] = pd.to_numeric(hs_dir[coord_name], errors="coerce")

# Condense datasets

In [None]:
# Keep only the "09-12" grade rows of the "GEN ED" program type.
# Note the trailing space in the "GRADE " column name.
class_size = data["class_size"]
class_size = class_size[class_size["GRADE "] == "09-12"]
class_size = class_size[class_size["PROGRAM TYPE"] == "GEN ED"]

# Collapse to one row per DBN by averaging the numeric columns.
# numeric_only=True is required because the frame also holds text columns,
# which recent pandas versions refuse to average instead of silently dropping.
class_size = class_size.groupby("DBN").mean(numeric_only=True)
class_size.reset_index(inplace=True)
data["class_size"] = class_size

# Keep a single school year of demographics.
data["demographics"] = data["demographics"][data["demographics"]["schoolyear"] == 20112012]

# Keep the 2006 cohort, and within it only the "Total Cohort" demographic rows.
data["graduation"] = data["graduation"][data["graduation"]["Cohort"] == "2006"]
data["graduation"] = data["graduation"][data["graduation"]["Demographic"] == "Total Cohort"]

# Convert AP scores to numeric

In [None]:
# AP columns to coerce to numeric; unparseable entries become NaN.
# Note the trailing space in 'AP Test Takers '.
cols = ['AP Test Takers ', 'Total Exams Taken', 'Number of Exams with scores 3 4 or 5']

ap_2010 = data["ap_2010"]
ap_2010[cols] = ap_2010[cols].apply(pd.to_numeric, errors="coerce")

# Combine the datasets

In [None]:
# Start from the SAT results and attach every other dataset on the DBN key.
combined = data["sat_results"]

# Left joins: keep every SAT row even when there is no matching AP or
# graduation row.
combined = combined.merge(data["ap_2010"], on="DBN", how="left")
combined = combined.merge(data["graduation"], on="DBN", how="left")

to_merge = ["class_size", "demographics", "survey", "hs_directory"]

# Inner joins: only rows whose DBN is present in all of these datasets survive.
for m in to_merge:
    combined = combined.merge(data[m], on="DBN", how="inner")

# Fill missing numeric values with the column mean. numeric_only=True is
# needed because the frame contains text columns, for which recent pandas
# versions raise instead of skipping them.
combined = combined.fillna(combined.mean(numeric_only=True))
# Anything still missing (e.g. text columns, all-NaN columns) becomes 0.
combined = combined.fillna(0)

# Add a school district column for mapping

In [None]:
def get_first_two_chars(dbn):
    """Return the leading two characters of ``dbn`` (fewer if it is shorter)."""
    return dbn[:2]

# The first two characters of each DBN are used as the school district code.
dbn_values = combined["DBN"]
combined["school_dist"] = dbn_values.map(get_first_two_chars)

# Find correlations

In [None]:
# Pairwise correlations between the numeric columns. The text columns are
# excluded explicitly because recent pandas versions raise on them instead of
# ignoring them.
correlations = combined.select_dtypes(include=["number", "bool"]).corr()
# Keep only each column's correlation with the SAT score.
correlations = correlations["sat_score"]
print(correlations)

# Plotting survey correlations

In [None]:
# Drop the "DBN" and "N_*" entries so they are not plotted below.
# Filtering into a new list (instead of list.remove) keeps this cell safe to
# re-run: a second run simply finds nothing left to drop rather than raising
# ValueError.
remove_list = ["DBN", 'N_s', 'N_t', 'N_p']
survey_fields = [field for field in survey_fields if field not in remove_list]
len(survey_fields)

In [None]:
# Bar chart of each survey field's correlation with sat_score.
bar_heights = [correlations[field] for field in survey_fields]

# One bar per survey field; the count is derived from the list rather than
# hard-coded so the chart stays correct if survey_fields changes.
bar_width = .75
bar_position = [index + .75 for index in range(len(survey_fields))]

ax_1 = plt.subplot()
# align="edge": bar_position holds the left edges. The tick positions below
# (left edge + half the width) only land on the bar centres with edge
# alignment, which is not the default in matplotlib >= 2.0.
ax_1.bar(bar_position, bar_heights, bar_width, align="edge")

x_ticks = [left_edge + .375 for left_edge in bar_position]
ax_1.set_xticks(x_ticks)
ax_1.set_xticklabels(survey_fields, rotation=90)
ax_1.set_xlabel("Survey_Type")
ax_1.set_ylabel("Correlation with SAT Score")
plt.show()

In [None]:
# Scatter plot of the saf_s_11 survey column against the SAT score.
ax_2 = combined.plot.scatter(x="saf_s_11", y="sat_score", title="saf_s_11 vs Sat_score")
 

In [None]:
# Average every numeric column per school district. numeric_only=True is
# required because the frame also holds text columns, which recent pandas
# versions refuse to average instead of silently dropping.
districts_avg = combined.groupby("school_dist").mean(numeric_only=True)
type(districts_avg)

In [None]:
# Mercator map bounded by the lower-left / upper-right corner coordinates.
map_corners = {
    "llcrnrlat": 40.496044,
    "urcrnrlat": 40.915256,
    "llcrnrlon": -74.255735,
    "urcrnrlon": -73.700272,
}
m = Basemap(projection='merc', resolution='i', **map_corners)

# Background fill, then coastlines and rivers drawn in one shared line style.
outline_style = {"color": '#6D5F47', "linewidth": .4}
m.drawmapboundary(fill_color='#85A6D9')
m.drawcoastlines(**outline_style)
m.drawrivers(**outline_style)

# One point per district, coloured by the district's average saf_s_11 value.
longitude = districts_avg["lon"].tolist()
latitude = districts_avg["lat"].tolist()
m.scatter(
    longitude,
    latitude,
    s=50,
    zorder=2,
    latlon=True,
    c=districts_avg["saf_s_11"],
    cmap="summer",
)

In [None]:
# Let's look into the race factors: bar chart of each race-percentage column's
# correlation with sat_score.
race_per = ["white_per", "asian_per", "black_per", "hispanic_per"]
ax_3 = plt.subplot()

bar_height = [correlations[column] for column in race_per]

# Bars are spaced 0.5 apart; positions are derived from race_per so the chart
# follows the list if it changes.
bar_width = .3
bar_position = [index / 2 + 0.1 for index in range(len(race_per))]
# Tick positions are the left edges plus half the bar width.
x_ticks = [index / 2 + .25 for index in range(len(race_per))]

# align="edge": bar_position holds left edges, so the ticks above only sit on
# the bar centres with edge alignment (not the default in matplotlib >= 2.0).
ax_3.bar(bar_position, bar_height, bar_width, align="edge")
ax_3.set_xticks(x_ticks)
ax_3.set_xticklabels(race_per, rotation=90)
ax_3.set_xlabel("Race")
ax_3.set_ylabel("Correlation with SAT SCORE")
ax_3.set_ylim(-0.5, 0.8)

In [None]:
# Scatter plot of hispanic_per against the SAT score.
combined.plot.scatter(x="hispanic_per", y="sat_score", title="sat_score vs hispanic_per")

In [None]:
# Rows where hispanic_per is above 95.
hispanic_bool = combined["hispanic_per"].gt(95)
shown_columns = ["SCHOOL NAME", "hispanic_per", "sat_score"]
combined.loc[hispanic_bool, shown_columns]

In [None]:
# Rows where hispanic_per is below 10 and the SAT score is above 1800.
hispanic10_bool = combined["hispanic_per"].lt(10)
satscore1800_bool = combined["sat_score"].gt(1800)
shown_columns = ["SCHOOL NAME", "hispanic_per", "sat_score"]
combined.loc[hispanic10_bool & satscore1800_bool, shown_columns]

In [None]:
# Bar chart of the male_per / female_per correlations with sat_score.
gender_columns = ["male_per", "female_per"]
bar_position = [0.5, 1]
bar_height = [correlations[column] for column in gender_columns]
ax_4 = plt.subplot()
# align="edge": bar_position holds left edges. The ticks at left edge + 0.15
# (half the 0.3 width) only sit on the bar centres with edge alignment, which
# is not the default in matplotlib >= 2.0.
ax_4.bar(bar_position, bar_height, .3, align="edge")
ax_4.set_xticks([.65, 1.15])
ax_4.set_xticklabels(gender_columns, rotation=90)
ax_4.set_xlabel("Gender")
ax_4.set_ylabel("Correlation with SAT SCORE")

In [None]:
# Scatter plot of female_per against the SAT score.
combined.plot.scatter(x="female_per", y="sat_score", title="female_per vs sat_score")

In [None]:
# Rows where female_per is above 60 and the SAT score is above 1700.
female_bool = combined["female_per"].gt(60)
satscore1700_bool = combined["sat_score"].gt(1700)
shown_columns = ["SCHOOL NAME", "female_per", "sat_score"]
combined.loc[female_bool & satscore1700_bool, shown_columns]

In [None]:
# AP test takers as a fraction of total enrollment, per row.
ap_takers = combined["AP Test Takers "]
combined["ap_per"] = ap_takers.div(combined["total_enrollment"])

In [None]:
# Scatter plot of the AP-taker fraction against the SAT score.
combined.plot.scatter(x="ap_per", y="sat_score")