In [602]:
import pandas as pd

In [603]:
# Relative paths to the two source CSV files
student_data = "Resources/students_complete.csv"
school_data = "Resources/schools_complete.csv"

In [604]:
# Read each CSV into its own DataFrame (schools first, then students)
school_df, student_df = (
    pd.read_csv(csv_path) for csv_path in (school_data, student_data)
)

In [605]:
# Preview the first two rows of the school data
school_df.head(n=2)

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411


In [606]:
# Preview the first two rows of the student data
student_df.head(n=2)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61


In [607]:
# Combine the student and school dataframes into a single dataframe.
# A left join keyed on the shared "school_name" column keeps every student row
# and appends that student's school columns to it.
# validate="many_to_one" makes pandas raise a MergeError if a school name
# appears more than once in school_df, which would otherwise silently
# duplicate student rows and inflate every count computed downstream.
school_complete_df = pd.merge(
    student_df,
    school_df,
    how="left",
    on="school_name",
    validate="many_to_one",
)
school_complete_df.head(3)


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635


In [608]:
# Group the merged student rows by school name; reused for the per-school statistics
schools = school_complete_df.groupby(by="school_name")

In [609]:
# Total number of schools = number of groups in the per-school grouping
total_schools = schools.ngroups

In [610]:
# Total budget: the merged frame repeats a school's budget on each of its
# student rows, so reduce to one row per school before summing.
budget_df = schools.first()
total_budget = budget_df.loc[:, "budget"].sum()

In [611]:
# Minimum score that counts as passing ("70 or greater")
PASSING_SCORE = 70

# Total number of students (non-null student IDs)
total_students = school_complete_df["Student ID"].count()

# District-wide average scores
avg_math_score = school_complete_df["math_score"].mean()
avg_reading_score = school_complete_df["reading_score"].mean()

# Boolean masks flagging the rows that meet the passing threshold in each subject
read_filter = school_complete_df["reading_score"] >= PASSING_SCORE
math_filter = school_complete_df["math_score"] >= PASSING_SCORE

# Passing rate for reading, kept as a fraction (0-1); formatting happens at display time
num_passed_reading = int(read_filter.sum())
percent_passing_reading = num_passed_reading / total_students

# Passing rate for math, same convention
num_passed_math = int(math_filter.sum())
percent_passing_math = num_passed_math / total_students

# Passing rate for both subjects. Note: despite its name, overall_filter holds
# the COUNT of students passing both subjects, not a boolean mask.
overall_filter = int((read_filter & math_filter).sum())
percent_passing_overall = overall_filter / total_students


## District Summary

* Calculate the total number of schools

* Calculate the total number of students

* Calculate the total budget

* Calculate the average math score 

* Calculate the average reading score

* Calculate the percentage of students with a passing math score (70 or greater)

* Calculate the percentage of students with a passing reading score (70 or greater)

* Calculate the percentage of students who passed math **and** reading (% Overall Passing)

* Create a dataframe to hold the above results

* Optional: give the displayed data cleaner formatting

In [612]:
# One-row district summary. Every value except the school count is
# pre-formatted as a string for display:
#   - thousands separators for the student count and the budget
#   - two decimals for the average scores
#   - percentages for the passing rates (stored as 0-1 fractions)
# The average scores are point values, not rates, so they are shown as plain
# numbers rather than being divided by 100 and rendered with a "%" sign.
district_summary_df = pd.DataFrame(
    {"Total Schools": [total_schools],
     "Total Students": [f'{total_students:,}'],
     "Total Budget": [f'${total_budget:,}'],
     "Avg Math Score": [f'{avg_math_score:.2f}'],
     "Avg Reading Score": [f'{avg_reading_score:.2f}'],
     "% Passing Math": [f'{percent_passing_math:.2%}'],
     "% Passing Reading": [f'{percent_passing_reading:.2%}'],
     "% Overall Passing": [f'{percent_passing_overall:.2%}']
     }
)

district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Avg Math Score,Avg Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428",78.99%,81.88%,74.98%,85.81%,65.17%


In [613]:
# Re-display the merged frame as a column reference for the per-school work below
school_complete_df.head(n=3)

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635


In [614]:
# School type and budget per school, taken as the group-wise maximum of each column
type_budget_columns = ["type", "budget"]
type_budget_df = schools[type_budget_columns].max()
type_budget_df.head(n=3)

Unnamed: 0_level_0,type,budget
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bailey High School,District,3124928
Cabrera High School,Charter,1081356
Figueroa High School,District,1884411


In [615]:
# school_summary = school_complete_df.groupby(["school_name", "Student ID"])

In [616]:
# Per-school summary built in a single chain (no in-place mutation):
#   1. aggregate one value per school: first type/budget seen, the number of
#      student IDs, and the mean math/reading score
#   2. rename the raw column names to their presentation labels
school_summary1_df = (
    schools
    .agg({"type": "first",
          "Student ID": "count",
          "budget": "first",
          "math_score": "mean",
          "reading_score": "mean"})
    .rename(columns={"type": "School Type",
                     "Student ID": "Total Students",
                     "budget": "Total School Budget",
                     "math_score": "Average Math Score",
                     "reading_score": "Average Reading Score"})
)

school_summary1_df.head(3)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Average Math Score,Average Reading Score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bailey High School,District,4976,3124928,77.048432,81.033963
Cabrera High School,Charter,1858,1081356,83.061895,83.97578
Figueroa High School,District,2949,1884411,76.711767,81.15802


In [617]:
# Budget per student = each school's total budget divided by its student count
budget_student = school_summary1_df["Total School Budget"] / school_summary1_df["Total Students"]

# Convert the Series into a one-column DataFrame.
# Series.to_frame(name=...) labels the column explicitly. The previous
# pd.DataFrame(series, columns=[...]) form depended on the Series being unnamed:
# for a named Series, `columns` acts as a selection and the values would not
# land under the requested label.
budget_student_df = budget_student.to_frame(name="Per Student Budget")
budget_student_df.head(3)

Unnamed: 0_level_0,Per Student Budget
school_name,Unnamed: 1_level_1
Bailey High School,628.0
Cabrera High School,582.0
Figueroa High School,639.0


In [618]:
# Group the merged rows by the (school name, student ID) pair
stats_per_school = school_complete_df.groupby(by=["school_name", "Student ID"])

In [619]:
# Mean math and reading score for every (school, student) group, stored as a new dataframe
score_columns = ["math_score", "reading_score"]
avg_passing_math = stats_per_school[score_columns].mean()
passing_by_school_df = pd.DataFrame(avg_passing_math, columns=score_columns)

In [620]:
# First math and reading score found in every (school, student) group
student_scores_per_school = stats_per_school[["math_score", "reading_score"]].first()
scores_by_school_df = pd.DataFrame(data=student_scores_per_school)
scores_by_school_df

Unnamed: 0_level_0,Unnamed: 1_level_0,math_score,reading_score
school_name,Student ID,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,17871,59,75
Bailey High School,17872,58,84
Bailey High School,17873,86,79
Bailey High School,17874,89,71
Bailey High School,17875,61,90
...,...,...,...
Wright High School,26031,82,90
Wright High School,26032,95,94
Wright High School,26033,99,69
Wright High School,26034,91,73


In [621]:
# Boolean Series indexed by (school, student): True where the math score is at least 70
math1 = scores_by_school_df["math_score"].ge(70)
math1

school_name         Student ID
Bailey High School  17871         False
                    17872         False
                    17873          True
                    17874          True
                    17875         False
                                  ...  
Wright High School  26031          True
                    26032          True
                    26033          True
                    26034          True
                    26035          True
Name: math_score, Length: 39170, dtype: bool

In [535]:
# Append the "Per Student Budget" column to the school summary by joining the
# two frames on their index
new_school_summary_df = school_summary1_df.join(other=budget_student_df)
new_school_summary_df.head(n=3)


Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Average Math Score,Average Reading Score,Per Student Budget
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bailey High School,District,4976,3124928,77.048432,81.033963,628.0
Cabrera High School,Charter,1858,1081356,83.061895,83.97578,582.0
Figueroa High School,District,2949,1884411,76.711767,81.15802,639.0


In [539]:
# Desired left-to-right column layout: budget figures first, then the score averages
column_order = [
    "School Type",
    "Total Students",
    "Total School Budget",
    "Per Student Budget",
    "Average Math Score",
    "Average Reading Score",
]

# Reorder the columns into a new dataframe; the joined frame itself is left unchanged
school_summary_with_budget_df = new_school_summary_df.reindex(column_order, axis="columns")
school_summary_with_budget_df.head(n=3)

Unnamed: 0_level_0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bailey High School,District,4976,3124928,628.0,77.048432,81.033963
Cabrera High School,Charter,1858,1081356,582.0,83.061895,83.97578
Figueroa High School,District,2949,1884411,639.0,76.711767,81.15802
