# PyCity Schools Analysis

* As a whole, schools with higher budgets, did not yield better test results. By contrast, schools with higher spending per student actually (\$645-675) underperformed compared to schools with smaller budgets (<\$585 per student).

* As a whole, smaller and medium sized schools dramatically out-performed large sized schools on passing math performances (89-91% passing vs 67%).

* As a whole, charter schools out-performed the public district schools across all metrics. However, more analysis will be required to glean if the effect is due to school practices or the fact that charter schools tend to serve smaller student populations per school. 
---

In [66]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# File to Load (Remember to Change These)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas Data Frames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset (consider using a left join)
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
school_data_complete


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
7,7,Nicole Baker,F,12th,Huang High School,96,69,0,District,2917,1910635
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635
9,9,Matthew Greene,M,10th,Huang High School,96,84,0,District,2917,1910635


## District Summary


In [67]:

#Create a high level snapshot (in table form) of the district's key metrics, including:

#Total Schools
total_schools = school_data['school_name'].count()
total_schools


15

In [68]:
#Total Students

total_students = student_data['Student ID'].count()
total_students

39170

In [69]:
 # Total Budget
    
total_budget = school_data['budget'].sum()
total_budget


24649428

In [70]:
 # Average Math Score
    
avg_math_score =  np.around(student_data['math_score'].sum()/student_data['math_score'].count(),decimals=6)
avg_math_score   
        

78.985371

In [71]:
# Average Reading Score

avg_reading_score =  np.around(student_data['reading_score'].sum()/student_data['reading_score'].count(),decimals=6)
avg_reading_score



81.87784

In [72]:

# % Passing Math (The percentage of students that passed math.)
    
passing_math_perc = np.around((student_data['math_score']>=70).sum() /student_data['math_score'].count()*100,decimals=6)
passing_math_perc

# % Passing Reading (The percentage of students that passed reading.)

passing_reading_perc = np.around((student_data['reading_score']>=70).sum() /student_data['reading_score'].count()*100,decimals=6)
passing_reading_perc


85.805463

In [74]:
  # % Overall Passing (The percentage of students that passed math **and** reading.)
    
overall_passing_rate = np.around((avg_math_score+avg_reading_score)/2,decimals=6)
overall_passing_rate
    
    
district_summary_df = pd.DataFrame({'Total Schools': total_schools,
                            'Total Students': total_students,
                            'Total Budget' : total_budget,
                            'Average Math Score' : avg_math_score,
                            'Average Reading Score' : avg_reading_score ,
                            '% Passing Math' : passing_math_perc,
                            '% Passing Reading' : passing_reading_perc,
                            '% Overall Passing Rate' : overall_passing_rate
                           }, index=[0])
district_summary_df['Total Budget'] = district_summary_df['Total Budget'].map("${:,.2f}".format)
district_summary_df['Total Students'] = district_summary_df['Total Students'].map("{:,}".format)




district_summary_df

    

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,80.431606


## School Summary

In [77]:

school_data_complete["passing_math"] = school_data_complete["math_score"] >= 70
school_data_complete["passing_reading"] = school_data_complete["reading_score"] >= 70

school_data_complete

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget,passing_math,passing_reading
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635,True,False
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635,False,True
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635,False,True
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635,False,False
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635,True,True
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635,True,True
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635,True,True
7,7,Nicole Baker,F,12th,Huang High School,96,69,0,District,2917,1910635,False,True
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635,True,True
9,9,Matthew Greene,M,10th,Huang High School,96,84,0,District,2917,1910635,True,True


In [78]:

school_group = school_data_complete.groupby(["school_name"]).mean()
school_group["Per Student Budget"] = school_group["budget"]/school_group["size"]
school_group["% Passing Math"] = round(school_group["passing_math"]*100,2)
school_group["% Passing Reading"] = round(school_group["passing_reading"]*100,2)
school_group["% Overall Passing Rate"] = round(((school_group["passing_math"] + school_group["passing_reading"])/2)*100,3)

#Merge with school_data to collect information about the type, size and budget
school_data_summary = pd.merge(school_group, school_data, how="left", on=["school_name", "school_name"])
del school_data_summary['size_y']
del school_data_summary['budget_y']
del school_data_summary['Student ID']
del school_data_summary['School ID_x']


school_summary_dataframe = pd.DataFrame({"School Name":  school_data_summary["school_name"],
                                "School Type": school_data_summary["type"],
                               "Total Students":school_data_summary["size_x"],
                               "Total School Budget": school_data_summary["budget_x"],
                               "Per Student Budget":school_data_summary["Per Student Budget"], 
                               "Average Math Score":round(school_data_summary["math_score"],2),
                               "Average Reading Score":round(school_data_summary["reading_score"],2), 
                               "% Passing Math": school_data_summary["% Passing Math"],
                               "% Passing Reading": school_data_summary["% Passing Reading"],
                               "% Overall Passing Rate": school_data_summary["% Overall Passing Rate"]}) 

school_summary_dataframe["Total Students"] = school_summary_dataframe["Total Students"].map("{:,.0f}".format)
school_summary_dataframe["Total School Budget"] = school_summary_dataframe["Total School Budget"].map("${:,.2f}".format)
school_summary_dataframe["Per Student Budget"] = school_summary_dataframe["Per Student Budget"].map("${:,.2f}".format)

school_summary_dataframe

Unnamed: 0,School Name,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,Bailey High School,District,4976,"$3,124,928.00",$628.00,77.05,81.03,66.68,81.93,74.307
1,Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.06,83.98,94.13,97.04,95.587
2,Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.71,81.16,65.99,80.74,73.364
3,Ford High School,District,2739,"$1,763,916.00",$644.00,77.1,80.75,68.31,79.3,73.804
4,Griffin High School,Charter,1468,"$917,500.00",$625.00,83.35,83.82,93.39,97.14,95.266
5,Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.29,80.93,66.75,80.86,73.808
6,Holden High School,Charter,427,"$248,087.00",$581.00,83.8,83.81,92.51,96.25,94.379
7,Huang High School,District,2917,"$1,910,635.00",$655.00,76.63,81.18,65.68,81.32,73.5
8,Johnson High School,District,4761,"$3,094,650.00",$650.00,77.07,80.97,66.06,81.22,73.64
9,Pena High School,Charter,962,"$585,858.00",$609.00,83.84,84.04,94.59,95.95,95.27


## Top Performing Schools (By Passing Rate)

In [81]:
# Sort and show top five schools

school_student_df_school_sum = school_student_df_school_sum.sort_values(by=['% Overall Passing Rate'], ascending=False)
school_student_top_df = school_student_df_school_sum.head(5)
school_student_top_df




Unnamed: 0_level_0,School Type,Total School Budget,Total Students,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Per Student Budget,% Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,"$1,081,356.00",1858,83.061895,83.97578,94.133477,97.039828,$582.00,95.586652
Thomas High School,Charter,"$1,043,130.00",1635,83.418349,83.84893,93.272171,97.308869,$638.00,95.29052
Pena High School,Charter,"$585,858.00",962,83.839917,84.044699,94.594595,95.945946,$609.00,95.270271
Griffin High School,Charter,"$917,500.00",1468,83.351499,83.816757,93.392371,97.138965,$625.00,95.265668
Wilson High School,Charter,"$1,319,574.00",2283,83.274201,83.989488,93.867718,96.539641,$578.00,95.203679


## Bottom Performing Schools (By Passing Rate)

In [82]:
# Sort and show bottom five schools

school_student_worst_df = school_student_df_school_sum.tail(5)
school_student_worst_df = school_student_worst_df.sort_values(by=['% Overall Passing Rate'])
school_student_worst_df



Unnamed: 0_level_0,School Type,Total School Budget,Total Students,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,Per Student Budget,% Overall Passing Rate
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,"$2,547,363.00",3999,76.842711,80.744686,66.366592,80.220055,$637.00,73.293323
Figueroa High School,District,"$1,884,411.00",2949,76.711767,81.15802,65.988471,80.739234,$639.00,73.363853
Huang High School,District,"$1,910,635.00",2917,76.629414,81.182722,65.683922,81.316421,$655.00,73.500171
Johnson High School,District,"$3,094,650.00",4761,77.072464,80.966394,66.057551,81.222432,$650.00,73.639992
Ford High School,District,"$1,763,916.00",2739,77.102592,80.746258,68.309602,79.299014,$644.00,73.804308


## Math Scores by Grade

In [83]:
# Create data series of scores by grade levels using conditionals

# Group each by school name

school_avgmath_df_group = pd.DataFrame({'9th' : ((school_student_df[school_student_df['grade']=="9th"].groupby(['school_name'])['math_score'].sum()
                                                  /school_student_df[school_student_df['grade']=="9th"].groupby(['school_name'])['Student ID'].count())),
                                        '10th' : ((school_student_df[school_student_df['grade']=="10th"].groupby(['school_name'])['math_score'].sum()
                                                  /school_student_df[school_student_df['grade']=="10th"].groupby(['school_name'])['Student ID'].count())),
                                        '11th' : ((school_student_df[school_student_df['grade']=="11th"].groupby(['school_name'])['math_score'].sum()
                                                  /school_student_df[school_student_df['grade']=="11th"].groupby(['school_name'])['Student ID'].count())),
                                        '12th' : ((school_student_df[school_student_df['grade']=="12th"].groupby(['school_name'])['math_score'].sum()
                                                  /school_student_df[school_student_df['grade']=="12th"].groupby(['school_name'])['Student ID'].count())),
                                       })
school_avgmath_df_group.index.name = " "
school_avgmath_df_group



Unnamed: 0,9th,10th,11th,12th
,,,,
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248


## Reading Score by Grade 

In [84]:
# Create data series of scores by grade levels using conditionals

# Group each by school name

school_avgread_df_group = pd.DataFrame({'9th' : ((school_student_df[school_student_df['grade']=="9th"].groupby(['school_name'])['reading_score'].sum()
                                                  /school_student_df[school_student_df['grade']=="9th"].groupby(['school_name'])['Student ID'].count())),
                                        '10th' : ((school_student_df[school_student_df['grade']=="10th"].groupby(['school_name'])['reading_score'].sum()
                                                  /school_student_df[school_student_df['grade']=="10th"].groupby(['school_name'])['Student ID'].count())),
                                        '11th' : ((school_student_df[school_student_df['grade']=="11th"].groupby(['school_name'])['reading_score'].sum()
                                                  /school_student_df[school_student_df['grade']=="11th"].groupby(['school_name'])['Student ID'].count())),
                                        '12th' : ((school_student_df[school_student_df['grade']=="12th"].groupby(['school_name'])['reading_score'].sum()
                                                  /school_student_df[school_student_df['grade']=="12th"].groupby(['school_name'])['Student ID'].count())),
                                       })
school_avgread_df_group.index.name = " "
school_avgread_df_group


Unnamed: 0,9th,10th,11th,12th
,,,,
Bailey High School,81.303155,80.907183,80.945643,80.912451
Cabrera High School,83.676136,84.253219,83.788382,84.287958
Figueroa High School,81.198598,81.408912,80.640339,81.384863
Ford High School,80.632653,81.262712,80.403642,80.662338
Griffin High School,83.369193,83.706897,84.288089,84.013699
Hernandez High School,80.86686,80.660147,81.39614,80.857143
Holden High School,83.677165,83.324561,83.815534,84.698795
Huang High School,81.290284,81.512386,81.417476,80.305983
Johnson High School,81.260714,80.773431,80.616027,81.227564


## Scores by School Spending

In [85]:
# Establish the bins -- choose any set of bins you would like, but see below for testing bins
# to test, set your bins as follows: [0, 585, 615, 645, 675]
# ALSO -- Note that the values for `% Passing Math`, `% Passing Reading` and `% Overall Passing Rate`
# were computed using averages of averages -- your results may vary if you use weighted averages 

spending_bins = [0, 585, 610, 635, 670]
group_names = ["<$585", "$585-610", "$610-635", "$635-670"]


school_spend_df =school_student_df_school_sum



school_spend_df["Spending Ranges (Per Student)"] = pd.cut(
    school_spend_df["Per Student Budget"].str.strip("$").astype(float),spending_bins
    ,labels= group_names)


school_spend_scores_df = pd.DataFrame(
                               {'Average Math Score': school_spend_df.groupby("Spending Ranges (Per Student)")['Average Math Score'].mean(),
                                'Average Reading Score': school_spend_df.groupby("Spending Ranges (Per Student)")['Average Reading Score'].mean()                              
                               })
school_spend_scores_df['% Passing Math'] = pd.to_numeric(school_spend_df['% Passing Math']).groupby(school_spend_df["Spending Ranges (Per Student)"]).mean()
school_spend_scores_df['% Passing Reading'] = pd.to_numeric(school_spend_df['% Passing Reading']).groupby(school_spend_df["Spending Ranges (Per Student)"]).mean()
school_spend_scores_df['% Overall Passing Rate'] = school_spend_df.groupby("Spending Ranges (Per Student)")['% Overall Passing Rate'].mean()


school_spend_scores_df


Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$585,83.455399,83.933814,93.460096,96.610877,95.035486
$585-610,83.599686,83.885211,94.230858,95.900287,95.065573
$610-635,80.199966,82.42536,80.036217,89.536123,84.78617
$635-670,77.866721,81.368774,70.347325,82.995575,76.67145


## Scores by School Size

In [88]:
# Establish the bins 

size_bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

# Use 4 reasonable bins to group school size. 
school_data_summary["School Size"] = pd.cut(school_data_summary["size_x"], size_bins, labels=group_names)
school_data_summary

#group by size_x
school_size_grouped = school_data_summary.groupby("School Size").mean() 
school_size_grouped

#Remove the unwanted columns as per the sample provided
#del school_size_grouped['size_x']
del school_size_grouped['budget_x']
del school_size_grouped['Per Student Budget']
del school_size_grouped['School ID_y']
del school_size_grouped['passing_math']
del school_size_grouped['passing_reading']

#Display
school_size_grouped




Unnamed: 0_level_0,reading_score,math_score,size_x,% Passing Math,% Passing Reading,% Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Small (<1000),83.929843,83.821598,694.5,93.55,96.1,94.8245
Medium (1000-2000),83.864438,83.374684,1704.4,93.598,96.79,95.1954
Large (2000-5000),81.344493,77.746417,3657.375,69.96375,82.76625,76.365


## Scores by School Type

In [87]:
# Type | Average Math Score | Average Reading Score | % Passing Math | % Passing Reading | % Overall Passing Rate

school_type_grouped = school_data_summary.groupby("type").mean()

del school_type_grouped['size_x']
del school_type_grouped['budget_x']
del school_type_grouped['Per Student Budget']
del school_type_grouped['School ID_y']
del school_type_grouped['passing_math']
del school_type_grouped['passing_reading']

school_type_grouped


Unnamed: 0_level_0,reading_score,math_score,% Passing Math,% Passing Reading,% Overall Passing Rate
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.896421,83.473852,93.62,96.58625,95.10375
District,80.966636,76.956733,66.548571,80.798571,73.673714
