In [2]:
#import dependencies (1)
import pandas as pd
from pathlib import Path

In [33]:
#creating initial dataframes, reading in and then converting csv data into a pandas df (1)
schools_df = pd.read_csv(Path("Resources/schools_complete.csv"))
students_df = pd.read_csv(Path("Resources/students_complete.csv"))
#sorting schools_df by school name, alphabetically (6)(8). The index is reset.
schools_df = schools_df.sort_values(by='school_name').reset_index()

In [4]:
#calculate passing rates for math, reading and total. using 60/100 as benchmark for 'passing'. (2)

math_passing = (students_df["math_score"].loc[students_df["math_score"]>60].count() / students_df["math_score"].count()) * 100

reading_passing = (students_df["reading_score"].loc[students_df["reading_score"]>60].count() / students_df["reading_score"].count()) * 100

#idenitying rows where reading score and math score are both > 60, and calculating a percentage (3)
total_passing = (((students_df["reading_score"]>60) & (students_df["math_score"]>60)).sum() / students_df["reading_score"].count()) * 100



In [5]:
#storing all target values and categories in a dictionary.
district_summary_data = {
    "Category": ["Total Schools", 
                 "Total Students", 
                 "Total Budget ($)",
                 "Average Math Score",
                 "Average Reading Score",
                 "Math Passing %",
                 "Reading Passing %",
                 "Total Passing %"
                ],
    "Value": [schools_df["school_name"].count(),
              students_df["student_name"].count(),
              schools_df["budget"].sum(),
              round(students_df["math_score"].mean(),2),
              round(students_df["reading_score"].mean(),2),
              round(math_passing, 2),
              round(reading_passing, 2),
              round(total_passing, 2)
            ]
    }

In [6]:
#converting dictionary object into a dataframe. (4)
district_summary_df = pd.DataFrame(district_summary_data)
#setting the category values as index values (5)
district_summary_df = district_summary_df.set_index('Category', drop=True)
print("Distric Summary: ")
district_summary_df.head(20)

Distric Summary: 


Unnamed: 0_level_0,Value
Category,Unnamed: 1_level_1
Total Schools,15.0
Total Students,39170.0
Total Budget ($),24649428.0
Average Math Score,78.99
Average Reading Score,81.88
Math Passing %,90.91
Reading Passing %,100.0
Total Passing %,90.91


In [7]:
#creating dataframes that contain average scores by school, using .groupby() and .mean() (6). 
# resetting the index of each dataframe with reset_index() (7), to match the formatting of schools_df.
mathAVG = (pd.DataFrame(students_df.groupby(["school_name"]).mean()).reset_index())[["school_name","math_score"]]
readAVG = (pd.DataFrame(students_df.groupby(["school_name"]).mean()).reset_index())[["school_name","reading_score"]]

In [8]:
#creating a dictionary of desired values to pass into pd.DataFrame
school_summary_data = {
    "school_name": schools_df["school_name"],
    "School Type": schools_df["type"],
    "Student Count": schools_df["size"],
    "Total Budget ($)": schools_df["budget"],
    "Per-Student Budget ($)": (schools_df["budget"]/schools_df["size"]),
    "Average Math Score": mathAVG["math_score"],
    "Average Reading Score": readAVG["reading_score"]  
    }

In [9]:
#creating the first iteration of the school summary dataframe
school_summary_df = pd.DataFrame(school_summary_data)


In [10]:
#code block to determing the percentage of students passing math, by school
#adding this data to school summary data.

#find the number of students passing math
passing_math = students_df.loc[(students_df['math_score']>60)]
#grouping loc output and creating a new dataframe. counting the size of each group with size() (11).
passing_math = pd.DataFrame(passing_math.groupby("school_name").size()).reset_index()
#rename columns (9)
passing_math = passing_math.rename(columns = {0:"Number of Students Passing Math"})

#find the total number of students per school, creating new dataframe
school_pop = pd.DataFrame(students_df.groupby("school_name").size()).reset_index()
school_pop = school_pop.rename(columns = {0:"Total Students"})

#merging two dataframe on shared series 'school_name' (10)
math_merge = pd.merge(passing_math,school_pop, on="school_name")
#creating a new series
math_merge["Passing Math %"] = (math_merge["Number of Students Passing Math"] / \
                                math_merge["Total Students"]) * 100
#simplifying dataframe to target columns
math_merge = math_merge[["school_name", "Passing Math %"]]


#merging math data into summary dataframe
school_summary_df1 = pd.merge(school_summary_df, math_merge, on="school_name")


In [11]:
#code block to determing the percentage of students passing reading, by school. this code mimics the math block above.
#adding this data to school summary data.

#find the number of students passing reading
passing_read = students_df.loc[(students_df['reading_score']>60)]
#grouping loc output and creating a new dataframe. counting the size of each group with size() (11).
passing_read = pd.DataFrame(passing_read.groupby("school_name").size()).reset_index()
passing_read = passing_read.rename(columns = {0:"Number of Students Passing Reading"}) #(9)

#merging two dataframe on shared series 'school_name' (10)
#using school_pop dataframe calculated earlier, which containts total population per school
read_merge = pd.merge(passing_read,school_pop, on="school_name")
#creating a new series
read_merge["Passing Reading %"] = (read_merge["Number of Students Passing Reading"] / \
                                read_merge["Total Students"]) * 100
#simplifying dataframe to target columns
read_merge = read_merge[["school_name", "Passing Reading %"]]


#merging math data into summary dataframe
school_summary_df2 = pd.merge(school_summary_df1, read_merge, on="school_name")


In [21]:
#code block to determing the percentage of students passing reading AND math, by school. this code mimics the math block above.
#adding this data to school summary data.

#find the number of students passing reading AND passing math
passing_total = students_df.loc[((students_df['reading_score']>60) & (students_df['math_score']>60)),:]
#grouping loc output and creating a new dataframe. counting the size of each group with size() (11).
passing_total = pd.DataFrame(passing_total.groupby("school_name").size()).reset_index()
passing_total = passing_total.rename(columns = {0:"Number of Students Passing"}) #(9)

#merging two dataframe on shared series 'school_name' (10)
#using school_pop dataframe calculated earlier, which containts total population per school
total_merge = pd.merge(passing_total,school_pop, on="school_name")
total_merge.head(20)


#creating a new series
total_merge["Total Passing %"] = (total_merge["Number of Students Passing"] / \
                                total_merge["Total Students"]) * 100
#simplifying dataframe to target columns
total_merge = total_merge[["school_name", "Total Passing %"]]


#merging math data into summary dataframe
school_summary_df3 = pd.merge(school_summary_df2, total_merge, on="school_name")



In [36]:
#cleaning and formatting the school summary dataframe
#creating final dataframe
school_summary_final = school_summary_df3[["school_name", "School Type", "Student Count", 
                                            "Total Budget ($)","Per-Student Budget ($)", 
                                           "Average Math Score", "Average Reading Score",
                                           "Passing Math %","Passing Reading %", "Total Passing %"]]
#modifying column names
school_summary_final = school_summary_final.rename(columns={"school_name": "School Name",
                                                           "Passing Math %": "Passing Math (%)",
                                                           "Passing Reading %": "Passing Reading (%)",
                                                           "Total Passing %": "Total Passing (%)"})
#updating index to be School Name values
school_summary_final= school_summary_final.set_index("School Name")

#using mapping to udpate the formatting of series in dataframe (12)
school_summary_final["Student Count"] = school_summary_final["Student Count"].map("{:,.0f}".format)
school_summary_final["Total Budget ($)"] = school_summary_final["Total Budget ($)"].map("${:,.0f}".format)
school_summary_final["Per-Student Budget ($)"] = school_summary_final["Per-Student Budget ($)"].map("${:,.0f}".format)
school_summary_final["Average Math Score"] = school_summary_final["Average Math Score"].map("{:.1f}".format)
school_summary_final["Average Reading Score"] = school_summary_final["Average Reading Score"].map("{:.1f}".format)
school_summary_final["Passing Math (%)"] = school_summary_final["Passing Math (%)"].map("{:.2f}%".format)
school_summary_final["Passing Reading (%)"] = school_summary_final["Passing Reading (%)"].map("{:.2f}%".format)
school_summary_final["Total Passing (%)"] = school_summary_final["Total Passing (%)"].map("{:.2f}%".format)


school_summary_final

Unnamed: 0_level_0,School Type,Student Count,Total Budget ($),Per-Student Budget ($),Average Math Score,Average Reading Score,Passing Math (%),Passing Reading (%),Total Passing (%)
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Bailey High School,District,4976,"$3,124,928",$628,77.0,81.0,87.44%,100.00%,87.44%
Cabrera High School,Charter,1858,"$1,081,356",$582,83.1,84.0,100.00%,100.00%,100.00%
Figueroa High School,District,2949,"$1,884,411",$639,76.7,81.2,86.44%,100.00%,86.44%
Ford High School,District,2739,"$1,763,916",$644,77.1,80.7,87.22%,100.00%,87.22%
Griffin High School,Charter,1468,"$917,500",$625,83.4,83.8,100.00%,100.00%,100.00%
Hernandez High School,District,4635,"$3,022,020",$652,77.3,80.9,86.45%,100.00%,86.45%
Holden High School,Charter,427,"$248,087",$581,83.8,83.8,100.00%,100.00%,100.00%
Huang High School,District,2917,"$1,910,635",$655,76.6,81.2,86.84%,100.00%,86.84%
Johnson High School,District,4761,"$3,094,650",$650,77.1,81.0,86.70%,100.00%,86.70%
Pena High School,Charter,962,"$585,858",$609,83.8,84.0,100.00%,100.00%,100.00%
