# PyCity School Analysis
 
 

In [7]:
#import dependencies (1)
import pandas as pd
from pathlib import Path

In [8]:
#load the two source CSV files into pandas dataframes (1)
schools_df = pd.read_csv(Path("Resources/schools_complete.csv"))
students_df = pd.read_csv(Path("Resources/students_complete.csv"))
#sort schools_df alphabetically by school name (6)(8) and renumber the rows 0..n-1.
#drop=True discards the old row labels instead of keeping them as an extra "index" column.
schools_df = schools_df.sort_values(by='school_name').reset_index(drop=True)

# District Summary

In [9]:
#district-wide passing rates for math, reading and both combined; a score of 70/100 or more counts as 'passing'. (2)

#boolean masks flagging each student who meets the benchmark in a subject
math_pass_mask = students_df["math_score"] >= 70
reading_pass_mask = students_df["reading_score"] >= 70

#share of passing students = number of True flags / number of recorded scores, as a percentage
math_passing = (math_pass_mask.sum() / students_df["math_score"].count()) * 100

reading_passing = (reading_pass_mask.sum() / students_df["reading_score"].count()) * 100

#students whose reading score and math score are both >= 70, as a percentage (3)
total_passing = ((reading_pass_mask & math_pass_mask).sum() / students_df["reading_score"].count()) * 100



In [13]:
#pairing each summary category with its value, then splitting the pairs into
#the two parallel lists that make up the dictionary. (16)
district_rows = [
    ("Total Schools", schools_df["school_name"].count()),
    ("Total Students", students_df["student_name"].count()),
    ("Total Budget", schools_df["budget"].sum()),
    ("Average Math Score", round(students_df["math_score"].mean(),2)),
    ("Average Reading Score", round(students_df["reading_score"].mean(),2)),
    ("Math Passing %", round(math_passing, 2)),
    ("Reading Passing %", round(reading_passing, 2)),
    ("Total Passing %", round(total_passing, 2)),
]

district_summary_data = {
    "Category": [row[0] for row in district_rows],
    "Value": [row[1] for row in district_rows],
    }

In [33]:
#converting dictionary object into a dataframe. (4)
district_summary = pd.DataFrame(district_summary_data)
#setting the category values as index values (5)
#district_summary = district_summary.set_index('Category', drop=True)

#the "Value" column is built from numbers only, so pandas gives it a numeric dtype.
#cast it to object before storing formatted strings: writing a string into a numeric
#column triggers an incompatible-dtype warning on recent pandas and is slated to become an error.
district_summary["Value"] = district_summary["Value"].astype(object)

# Formatting (17): rows 0-2 are Total Schools, Total Students and Total Budget
district_summary.iloc[0,1] = "{:,.0f}".format(district_summary.iloc[0,1])
district_summary.iloc[1,1] = "{:,.0f}".format(district_summary.iloc[1,1])
district_summary.iloc[2,1] = "${:,.2f}".format(district_summary.iloc[2,1])

district_summary.head(20)

Unnamed: 0,Category,Value
0,Total Schools,15
1,Total Students,39170
2,Total Budget,"$24,649,428.00"
3,Average Math Score,78.99
4,Average Reading Score,81.88
5,Math Passing %,74.98
6,Reading Passing %,85.81
7,Total Passing %,65.17


# School Summary

In [None]:
#creating dataframes that contain average scores by school, using .groupby() and .mean() (6).
#only the two score columns are averaged: calling .mean() on the whole frame also tries to
#average the text columns, which raises a TypeError on pandas 2.x.
#resetting the index with reset_index() (7), to match the formatting of schools_df.
school_score_means = students_df.groupby("school_name")[["math_score", "reading_score"]].mean().reset_index()
mathAVG = school_score_means[["school_name","math_score"]]
readAVG = school_score_means[["school_name","reading_score"]]

In [None]:
#creating a dictionary of desired values to pass into pd.DataFrame.
#the per-school averages are looked up by school name rather than by row position,
#so each average lands on the right school even if schools_df and the averages
#dataframes are not sorted the same way.
math_avg_by_school = mathAVG.set_index("school_name")["math_score"]
read_avg_by_school = readAVG.set_index("school_name")["reading_score"]

school_summary_data = {
    "school_name": schools_df["school_name"],
    "School Type": schools_df["type"],
    "Student Count": schools_df["size"],
    "Total Budget ($)": schools_df["budget"],
    "Per-Student Budget ($)": (schools_df["budget"]/schools_df["size"]),
    "Average Math Score": schools_df["school_name"].map(math_avg_by_school),
    "Average Reading Score": schools_df["school_name"].map(read_avg_by_school)
    }

In [None]:
#first pass at the school summary: one column per entry of school_summary_data
school_summary_df = pd.DataFrame(data=school_summary_data)


In [None]:
#code block to determine the percentage of students passing math, by school,
#and add this data to the school summary data.

#find the total number of students per school, creating new dataframe.
#counting the size of each group with size() (11). school_pop is reused by later cells.
school_pop = pd.DataFrame(students_df.groupby("school_name").size()).reset_index()
#rename columns (9)
school_pop = school_pop.rename(columns = {0:"Total Students"})

#find the number of students passing math (score >= 70), per school
passing_math = students_df.loc[(students_df['math_score']>=70)]
passing_math = pd.DataFrame(passing_math.groupby("school_name").size()).reset_index()
passing_math = passing_math.rename(columns = {0:"Number of Students Passing Math"})

#merging two dataframes on shared series 'school_name' (10).
#a left merge from the full school list keeps a school that has no passing students
#(its count becomes 0) instead of silently dropping it from the summary.
math_merge = pd.merge(school_pop, passing_math, on="school_name", how="left")
math_merge["Number of Students Passing Math"] = math_merge["Number of Students Passing Math"].fillna(0)
#creating a new series
math_merge["Passing Math %"] = (math_merge["Number of Students Passing Math"] / \
                                math_merge["Total Students"]) * 100
#simplifying dataframe to target columns
math_merge = math_merge[["school_name", "Passing Math %"]]


#merging math data into summary dataframe
school_summary_df1 = pd.merge(school_summary_df, math_merge, on="school_name")



In [None]:
#code block to determine the percentage of students passing reading, by school. this code mimics the math block above.
#adding this data to school summary data.

#find the number of students passing reading (score >= 70), per school
passing_read = students_df.loc[(students_df['reading_score']>=70)]
#grouping loc output and creating a new dataframe. counting the size of each group with size() (11).
passing_read = pd.DataFrame(passing_read.groupby("school_name").size()).reset_index()
passing_read = passing_read.rename(columns = {0:"Number of Students Passing Reading"}) #(9)

#merging two dataframes on shared series 'school_name' (10)
#using school_pop dataframe calculated in cell above, which contains total population per school.
#a left merge from the full school list keeps a school that has no passing students
#(its count becomes 0) instead of silently dropping it from the summary.
read_merge = pd.merge(school_pop, passing_read, on="school_name", how="left")
read_merge["Number of Students Passing Reading"] = read_merge["Number of Students Passing Reading"].fillna(0)
#creating a new series
read_merge["Passing Reading %"] = (read_merge["Number of Students Passing Reading"] / \
                                read_merge["Total Students"]) * 100
#simplifying dataframe to target columns
read_merge = read_merge[["school_name", "Passing Reading %"]]


#merging reading data into summary dataframe
school_summary_df2 = pd.merge(school_summary_df1, read_merge, on="school_name")



In [None]:
#code block to determine the percentage of students passing reading AND math, by school. this code mimics the math block above.
#adding this data to school summary data.

#find the number of students passing reading AND passing math (both scores >= 70), per school
passing_total = students_df.loc[((students_df['reading_score']>=70) & (students_df['math_score']>=70)),:]
#grouping loc output and creating a new dataframe. counting the size of each group with size() (11).
passing_total = pd.DataFrame(passing_total.groupby("school_name").size()).reset_index()
passing_total = passing_total.rename(columns = {0:"Number of Students Passing"}) #(9)

#merging two dataframes on shared series 'school_name' (10)
#using school_pop dataframe calculated in cell above, which contains total population per school.
#a left merge from the full school list keeps a school that has no passing students
#(its count becomes 0) instead of silently dropping it from the summary.
total_merge = pd.merge(school_pop, passing_total, on="school_name", how="left")
total_merge["Number of Students Passing"] = total_merge["Number of Students Passing"].fillna(0)

#creating a new series
total_merge["Total Passing %"] = (total_merge["Number of Students Passing"] / \
                                total_merge["Total Students"]) * 100
#simplifying dataframe to target columns
total_merge = total_merge[["school_name", "Total Passing %"]]


#merging overall passing data into summary dataframe
school_summary_df3 = pd.merge(school_summary_df2, total_merge, on="school_name")




In [None]:
#building the final per-school summary: pick the report columns in display order,
#then give the raw column names their presentation names.
summary_columns = [
    "school_name", "School Type", "Student Count",
    "Total Budget ($)", "Per-Student Budget ($)",
    "Average Math Score", "Average Reading Score",
    "Passing Math %", "Passing Reading %", "Total Passing %",
]
per_school_summary = school_summary_df3[summary_columns]

#mapping of raw column name -> display name
display_names = {
    "school_name": "School Name",
    "Passing Math %": "Passing Math (%)",
    "Passing Reading %": "Passing Reading (%)",
    "Total Passing %": "Total Passing (%)",
}
per_school_summary = per_school_summary.rename(columns=display_names)

"""Formatting block that was written, but not needed. left in comment format to save and show work.
#updating index to be School Name values
school_summary_final= school_summary_final.set_index("School Name")

#using mapping to udpate the formatting of series in dataframe (12)
school_summary_final["Student Count"] = school_summary_final["Student Count"].map("{:,.0f}".format)
school_summary_final["Total Budget ($)"] = school_summary_final["Total Budget ($)"].map("${:,.0f}".format)
school_summary_final["Per-Student Budget ($)"] = school_summary_final["Per-Student Budget ($)"].map("${:,.0f}".format)
school_summary_final["Average Math Score"] = school_summary_final["Average Math Score"].map("{:.1f}".format)
school_summary_final["Average Reading Score"] = school_summary_final["Average Reading Score"].map("{:.1f}".format)
school_summary_final["Passing Math (%)"] = school_summary_final["Passing Math (%)"].map("{:.2f}%".format)
school_summary_final["Passing Reading (%)"] = school_summary_final["Passing Reading (%)"].map("{:.2f}%".format)
school_summary_final["Total Passing (%)"] = school_summary_final["Total Passing (%)"].map("{:.2f}%".format)
"""

#independent copy of the summary; later cells add binning columns to the copy (14)
#so per_school_summary itself stays unchanged.
school_summary_final_copy = per_school_summary.copy()
per_school_summary.head(15)

## Highest-Performing Schools by Percentage of Overall Passing

In [None]:
#ranking every school by 'Total Passing (%)', best first, and showing the five highest.
top_schools = school_summary_final_copy.sort_values("Total Passing (%)", ascending=False)
top_schools.head()

## Lowest-Performing Schools by Percentage of Overall Passing

In [None]:
#ranking every school by 'Total Passing (%)', worst first (ascending is the default), and showing the five lowest.
bottom_schools = school_summary_final_copy.sort_values("Total Passing (%)")
bottom_schools.head()

## Math Scores by Grade

In [None]:
#average math scores, per grade level
#grouping the "grade" and "math_score" series, and calculating the mean of each group.

math_by_grade = students_df[["grade", "math_score"]].groupby("grade").mean()
math_by_grade = math_by_grade.reset_index()
#sort grades by the number inside the label rather than as plain text, because a plain
#text sort would place a label such as "9th" after "12th".
#NOTE(review): assumes grade labels contain a number (e.g. "9th") - confirm against the data.
#labels without any digits keep their existing alphabetical order.
math_by_grade = math_by_grade.sort_values(
    by="grade",
    ascending=True,
    key=lambda grade_labels: pd.to_numeric(
        grade_labels.astype(str).str.extract(r"(\d+)", expand=False), errors="coerce"
    ),
)
math_by_grade = math_by_grade.rename(columns={"grade":"Grade","math_score": "Math Score"})
math_scores_by_grade = math_by_grade.set_index('Grade')

math_scores_by_grade


## Reading Scores by Grade

In [None]:
#average reading scores, per grade level
#grouping the "grade" and "reading_score" series, and calculating the mean of each group.
read_by_grade = students_df[["grade", "reading_score"]].groupby("grade").mean()
read_by_grade = read_by_grade.reset_index()
#sort grades by the number inside the label rather than as plain text, because a plain
#text sort would place a label such as "9th" after "12th".
#NOTE(review): assumes grade labels contain a number (e.g. "9th") - confirm against the data.
#labels without any digits keep their existing alphabetical order.
read_by_grade = read_by_grade.sort_values(
    by="grade",
    ascending=True,
    key=lambda grade_labels: pd.to_numeric(
        grade_labels.astype(str).str.extract(r"(\d+)", expand=False), errors="coerce"
    ),
)
read_by_grade = read_by_grade.rename(columns={"grade":"Grade", "reading_score":"Reading Score"})
reading_scores_by_grade = read_by_grade.set_index('Grade')

reading_scores_by_grade

## Scores by School Spending

In [None]:
#Scores by school spending (per student)

#creating bins and labels. code provided in challenge documentation
spending_bins = [0, 585, 630, 645, 680]
labels = ["<$585", "$585-630", "$630-645", "$645-680"]

#adding a new column to the overall summary dataframe, populating with bins (13).
#a per-student budget above the last bin edge (680) gets no bin and is left out of the averages below.
school_summary_final_copy["Spending Ranges (Per Student)"] = pd.cut(school_summary_final_copy["Per-Student Budget ($)"],
                                             bins=spending_bins,labels=labels,include_lowest=True)

#grouping once by spending range. observed=False is stated explicitly so every bin
#appears in the result (even an empty one) regardless of the pandas version's default.
spending_groups = school_summary_final_copy.groupby("Spending Ranges (Per Student)", observed=False)

#calculating the mean of each metric within each spending range
spending_math_scores = spending_groups["Average Math Score"].mean()
spending_reading_scores = spending_groups["Average Reading Score"].mean()
spending_passing_math = spending_groups["Passing Math (%)"].mean()
spending_passing_reading = spending_groups["Passing Reading (%)"].mean()
overall_passing_spending = spending_groups["Total Passing (%)"].mean()


#combining the five series (all indexed by spending range) into a single dataframe
spending_summary = pd.concat([spending_math_scores, spending_reading_scores,
                              spending_passing_math, spending_passing_reading,
                              overall_passing_spending], axis=1)

spending_summary

## Scores by School Size

In [None]:
#Scores by School Size
#creating bins and labels. code provided in challenge documentation
size_bins = [0, 1000, 2000, 5000]
labels = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]

#adding a new column to the overall summary dataframe, populating with bins (13).
#pd.cut bins are closed on the right, so a school of exactly 1000 students falls in the "Small" bin.
school_summary_final_copy["School Size"] = pd.cut(school_summary_final_copy["Student Count"],
                                             bins=size_bins,labels=labels,include_lowest=True)

#grouping once by school size. observed=False is stated explicitly so every bin
#appears in the result (even an empty one) regardless of the pandas version's default.
size_groups = school_summary_final_copy.groupby("School Size", observed=False)

#calculating the mean of each metric within each size bin
size_math_scores = size_groups["Average Math Score"].mean()
size_reading_scores = size_groups["Average Reading Score"].mean()
size_passing_math = size_groups["Passing Math (%)"].mean()
size_passing_reading = size_groups["Passing Reading (%)"].mean()
overall_passing_size = size_groups["Total Passing (%)"].mean()

#combining the five series (all indexed by school size) into a single dataframe
size_summary = pd.concat([size_math_scores, size_reading_scores,
                          size_passing_math, size_passing_reading,
                          overall_passing_size], axis=1)


size_summary

## Scores by School Type

In [None]:
#code block structure follows the module challenge starter-code, per instructions.
#group the per-school summary by "School Type" once, then average each metric within a type.
schools_by_type = per_school_summary.groupby("School Type")

average_math_score_by_type = schools_by_type["Average Math Score"].mean()
average_reading_score_by_type = schools_by_type["Average Reading Score"].mean()
average_percent_passing_math_by_type = schools_by_type["Passing Math (%)"].mean()
average_percent_passing_reading_by_type = schools_by_type["Passing Reading (%)"].mean()
average_percent_overall_passing_by_type = schools_by_type["Total Passing (%)"].mean()


#place the five per-type averages side by side in a single DataFrame called `type_summary` (15)
type_metric_series = [
    average_math_score_by_type,
    average_reading_score_by_type,
    average_percent_passing_math_by_type,
    average_percent_passing_reading_by_type,
    average_percent_overall_passing_by_type,
]
type_summary = pd.DataFrame(pd.concat(type_metric_series, axis=1))

type_summary
