In [1]:
import pandas as pd
import os

In [2]:
# Relative locations of the two raw CSV inputs, both inside "raw_data".
school_path, stu_path = (
    os.path.join("raw_data", csv_name)
    for csv_name in ("schools_complete.csv", "students_complete.csv")
)

In [36]:
# Load the raw school and student tables from the CSV paths built above.
schools_df = pd.read_csv(school_path)
students_df = pd.read_csv(stu_path)

# Only the final expression of a cell is rendered automatically, so the
# schools preview must be displayed explicitly or it is silently discarded.
display(schools_df.head())
students_df.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [194]:
#Function to format floats as money
def money(n):
    """Format a number as a dollar string, e.g. 1234.5 -> '$1,234.50'.

    The result has a leading '$', comma thousands separators and exactly
    two decimal places.
    """
    return "$" + format(n, ",.2f")
def mfloat(n):
    """Convert a money-formatted string such as '$1,234.50' to a float.

    Every '$' and ',' character is removed from ``n`` and the remainder is
    passed to float().
    """
    digits = n
    for symbol in ("$", ","):
        digits = digits.replace(symbol, "")
    return float(digits)

### District Summary

In [13]:
#Total Schools
num_schools = len(schools_df)
#Total Students
num_students = len(students_df)
#Total Budget
# Reduce only the budget column: summing the whole frame would also reduce
# (concatenate) every text column just to read back one value.
total_budget = schools_df["budget"].sum()

In [14]:
# Average each score column directly. Calling mean() on the whole frame
# tries to average the text columns too, which raises TypeError on
# pandas >= 2.0 (numeric_only no longer defaults to dropping them).
#Average Math Score
avg_math = students_df["math_score"].mean()
#Average Reading Score
avg_reading = students_df["reading_score"].mean()

In [20]:
#Passing threshold set at 70 (C-)
passing = 70

# A student passes a subject when the score is at or above the threshold.
# Each count is expressed as a percentage of all students in the district.
#% Passing Math
num_pass_math = int((students_df["math_score"] >= passing).sum())
perc_pass_math = num_pass_math / num_students * 100
#% Passing Reading
num_pass_reading = int((students_df["reading_score"] >= passing).sum())
perc_pass_reading = num_pass_reading / num_students * 100
#Overall Passing Rate (Average of the above two)
perc_pass_total = (perc_pass_math + perc_pass_reading) / 2

In [238]:
# One-row district summary. The (label, value) pairs fix the column order;
# the budget is stored as a money-formatted string, the rest as numbers.
summary_fields = [
    ("Total Schools", num_schools),
    ("Total Students", num_students),
    ("Total Budget", money(total_budget)),
    ("Avg Math Score", avg_math),
    ("Avg Reading Score", avg_reading),
    ("% Passing Math", perc_pass_math),
    ("% Passing Reading", perc_pass_reading),
    ("% Overall Passing Rate", perc_pass_total),
]
summary_df = pd.DataFrame(
    {label: [value] for label, value in summary_fields},
    columns=[label for label, _value in summary_fields],
)

summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,80.393158


### School Summary

In [248]:
#Gather groupby's
# numeric_only=True keeps mean() from trying to average the text columns,
# which raises TypeError on pandas >= 2.0.
stu_school_group = students_df.groupby("school")
stu_school_count = stu_school_group.count()
stu_school_avg = stu_school_group.mean(numeric_only=True)
stu_school_sum = stu_school_group.sum()

#School Name
#School Type
#Total Students
#Total School Budget
# Select and rename in one step instead of assigning several new columns
# into an empty frame.
schools_summary = schools_df[["name", "type", "size", "budget"]].rename(columns={
    "name": "School Name",
    "type": "School Type",
    "size": "Total Students",
    "budget": "Total School Budget",
})

#Per Student Budget
# Both budget columns are stored as money-formatted strings; the per-student
# figure is computed first, while the total is still numeric.
per_student_budget = schools_summary["Total School Budget"] / schools_summary["Total Students"]
schools_summary["Per Student Budget"] = per_student_budget.map(money)
schools_summary["Total School Budget"] = schools_summary["Total School Budget"].map(money)

#Average Math Score
#Average Reading Score
school_avg_scores = stu_school_avg[["math_score", "reading_score"]].rename(
    columns={"math_score": "Avg Math Score", "reading_score": "Avg Reading Score"})
schools_summary = schools_summary.merge(school_avg_scores, left_on="School Name", right_index=True)

#% Passing Math
#% Passing Reading
# Count passers from boolean flags grouped over ALL students, so a school
# where nobody passes a subject gets 0% instead of disappearing from the
# table (filtering first and then grouping drops such schools entirely).
pass_flags = students_df[["math_score", "reading_score"]] >= passing
pass_counts = pass_flags.groupby(students_df["school"]).sum()
school_math_pass = pass_counts[["math_score"]]
school_reading_pass = pass_counts[["reading_score"]]
# Divide by the number of student rows per school (size), not by the count
# of non-null names, so a missing name cannot inflate a percentage.
school_sizes = stu_school_group.size()
stu_school_pass = (pass_counts.div(school_sizes, axis=0) * 100).rename(
    columns={"math_score": "% Passing Math", "reading_score": "% Passing Reading"})

#Overall Passing Rate
# Defined as the plain average of the two subject pass rates.
stu_school_pass["% Overall Passing Rate"] = (stu_school_pass["% Passing Reading"] + stu_school_pass["% Passing Math"]) / 2
schools_summary = schools_summary.merge(stu_school_pass, left_on="School Name", right_index=True)

schools_summary = schools_summary.set_index("School Name")
schools_summary

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922,81.316421,73.500171
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,73.363852
Shelton High School,Charter,1761,"$1,056,600.00",$600.00,83.359455,83.725724,93.867121,95.854628,94.860875
Hernandez High School,District,4635,"$3,022,020.00",$652.00,77.289752,80.934412,66.752967,80.862999,73.807983
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,95.265668
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274201,83.989488,93.867718,96.539641,95.203679
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,95.586652
Bailey High School,District,4976,"$3,124,928.00",$628.00,77.048432,81.033963,66.680064,81.93328,74.306672
Holden High School,Charter,427,"$248,087.00",$581.00,83.803279,83.814988,92.505855,96.252927,94.379391
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595,95.945946,95.27027


### Top Performing Schools (By Passing Rate)

In [240]:
# Rank all schools from highest to lowest overall passing rate, then show
# the first five rows of that ranking.
schools_sort_perf_top = schools_summary.sort_values(
    by="% Overall Passing Rate",
    ascending=False,
)
schools_sort_perf_top.iloc[:5]

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Cabrera High School,Charter,1858,"$1,081,356.00",$582.00,83.061895,83.97578,94.133477,97.039828,95.586652
Thomas High School,Charter,1635,"$1,043,130.00",$638.00,83.418349,83.84893,93.272171,97.308869,95.29052
Pena High School,Charter,962,"$585,858.00",$609.00,83.839917,84.044699,94.594595,95.945946,95.27027
Griffin High School,Charter,1468,"$917,500.00",$625.00,83.351499,83.816757,93.392371,97.138965,95.265668
Wilson High School,Charter,2283,"$1,319,574.00",$578.00,83.274201,83.989488,93.867718,96.539641,95.203679


### Bottom Performing Schools (By Passing Rate)

In [241]:
# Rank all schools from lowest to highest overall passing rate (ascending
# is the sort_values default), then show the first five rows.
schools_sort_perf_bot = schools_summary.sort_values(by="% Overall Passing Rate")
schools_sort_perf_bot.iloc[:5]

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Rodriguez High School,District,3999,"$2,547,363.00",$637.00,76.842711,80.744686,66.366592,80.220055,73.293323
Figueroa High School,District,2949,"$1,884,411.00",$639.00,76.711767,81.15802,65.988471,80.739234,73.363852
Huang High School,District,2917,"$1,910,635.00",$655.00,76.629414,81.182722,65.683922,81.316421,73.500171
Johnson High School,District,4761,"$3,094,650.00",$650.00,77.072464,80.966394,66.057551,81.222432,73.639992
Ford High School,District,2739,"$1,763,916.00",$644.00,77.102592,80.746258,68.309602,79.299014,73.804308


### Math Scores by Grade

In [242]:
# Mean math score per (school, grade), pivoted so each grade is a column.
stu_grades_group = students_df.groupby(["school","grade"])
# Average only the math column: mean() over every column raises TypeError
# on the text columns under pandas >= 2.0.
stu_grades_math = stu_grades_group["math_score"].mean().unstack()[["9th","10th","11th","12th"]]
# Clear the axis labels by assignment; `del` on Index.name fails on pandas
# versions where name is a property.
stu_grades_math.columns.name = None
stu_grades_math.index.name = None
stu_grades_math

Unnamed: 0,9th,10th,11th,12th
Bailey High School,77.083676,76.996772,77.515588,76.492218
Cabrera High School,83.094697,83.154506,82.76556,83.277487
Figueroa High School,76.403037,76.539974,76.884344,77.151369
Ford High School,77.361345,77.672316,76.918058,76.179963
Griffin High School,82.04401,84.229064,83.842105,83.356164
Hernandez High School,77.438495,77.337408,77.136029,77.186567
Holden High School,83.787402,83.429825,85.0,82.855422
Huang High School,77.027251,75.908735,76.446602,77.225641
Johnson High School,77.187857,76.691117,77.491653,76.863248
Pena High School,83.625455,83.372,84.328125,84.121547


### Reading Scores by Grade

In [243]:
# Mean reading score per (school, grade), pivoted so each grade is a column.
# Average only the reading column: mean() over every column raises TypeError
# on the text columns under pandas >= 2.0.
stu_grades_reading = stu_grades_group["reading_score"].mean().unstack()[["9th","10th","11th","12th"]]
# Clear the axis labels by assignment; `del` on Index.name fails on pandas
# versions where name is a property.
stu_grades_reading.columns.name = None
stu_grades_reading.index.name = None
stu_grades_reading

Unnamed: 0,9th,10th,11th,12th
Bailey High School,81.303155,80.907183,80.945643,80.912451
Cabrera High School,83.676136,84.253219,83.788382,84.287958
Figueroa High School,81.198598,81.408912,80.640339,81.384863
Ford High School,80.632653,81.262712,80.403642,80.662338
Griffin High School,83.369193,83.706897,84.288089,84.013699
Hernandez High School,80.86686,80.660147,81.39614,80.857143
Holden High School,83.677165,83.324561,83.815534,84.698795
Huang High School,81.290284,81.512386,81.417476,80.305983
Johnson High School,81.260714,80.773431,80.616027,81.227564
Pena High School,83.807273,83.612,84.335938,84.59116


### Scores by School Spending

In [253]:
# The outer edges are open-ended so the "<$600" and "$650+" labels mean what
# they say. With fixed outer edges (575 and 700) a school spending less than
# $575 or $700 and more would fall outside every bin and silently drop out.
# right=False makes each bin include its lower edge: [600, 625), [625, 650), ...
spend_bins = [0, 600, 625, 650, float("inf")]
spend_labels = ["<$600","$600-624","$625-649","$650+"]

# Score columns are selected by name so the result does not depend on the
# column order of schools_summary.
spend_score_columns = ["Avg Math Score", "Avg Reading Score", "% Passing Math",
                       "% Passing Reading", "% Overall Passing Rate"]

school_scores_spend = schools_summary.copy()
# "Per Student Budget" holds money-formatted strings, so parse it back to
# floats before binning.
school_scores_spend["Spending Ranges (Per Student)"] = pd.cut(
    schools_summary["Per Student Budget"].map(mfloat),
    spend_bins, labels=spend_labels, right=False)
school_scores_spend.groupby("Spending Ranges (Per Student)")[spend_score_columns].mean()

Unnamed: 0_level_0,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
Spending Ranges (Per Student),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
<$600,83.455399,83.933814,93.460096,96.610877,95.035486
$600-624,83.599686,83.885211,94.230858,95.900287,95.065572
$625-649,79.079225,81.891436,75.668212,86.106569,80.887391
$650+,76.99721,81.027843,66.164813,81.133951,73.649382


### Scores by School Size

In [278]:
# Enrollment bin edges and their labels; right=False below makes each bin
# include its lower edge and exclude its upper edge.
size_labels = ["Small (<1500)","Medium (1500-2500)","Large (2500-5000)"]
size_bins = [0,1500,2500,5000]

school_scores_size = schools_summary.copy()
school_scores_size["School Size"] = pd.cut(
    school_scores_size["Total Students"],
    bins=size_bins,
    labels=size_labels,
    right=False,
)
# Keep the columns from position 4 onward (the score metrics plus the new
# "School Size" column) and average them within each size bin.
size_scores = school_scores_size[school_scores_size.columns[4:]]
size_scores.groupby("School Size").mean()

Unnamed: 0_level_0,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1500),83.664898,83.892148,93.497607,96.445946,94.971776
Medium (1500-2500),83.359224,83.898984,93.694764,96.670815,95.18279
Large (2500-5000),76.956733,80.966636,66.548453,80.799062,73.673757


### Scores by School Type

In [276]:
# Keep the column at position 0 ("School Type") and the five score metrics
# at positions 4-8, then average the metrics within each school type.
type_columns = schools_summary.columns[[0, 4, 5, 6, 7, 8]]
schools_summary[type_columns].groupby("School Type").mean()

Unnamed: 0_level_0,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Charter,83.473852,83.896421,93.62083,96.586489,95.10366
District,76.956733,80.966636,66.548453,80.799062,73.673757


### Observable Trends:

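1) Smaller schools, charter schools, and schools with lower spending per student all correlate with higher academic performance. These factors overlap, however: the "Large" row in Scores by School Size is identical to the "District" row in Scores by School Type, so the effects of size and type cannot be separated in this data, and none of these correlations shows causation by itself.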

2) Academic scores are generally consistent between grade levels within each school.

3) Reading performance is almost universally higher than math performance.