In [58]:
import pandas as pd

In [59]:
# File to Load
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas DataFrames
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset.  
school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])

In [60]:
school_data.head()

Unnamed: 0,School ID,school_name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [61]:
student_data.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [62]:
school_data_complete.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [63]:
# Calculate the total number of schools
total_schools = school_data["School ID"].count()

#Calculate the total number of students
total_students = student_data["Student ID"].count()

# Calculate sum of budgets
total_budget = school_data["budget"].sum()

#Calculate average math scores
math_average = student_data["math_score"].mean()

#Calculate average reading scores
read_average = student_data["reading_score"].mean()

# math_pass = number of students that have a math score >= 70
math_pass_df = school_data_complete.loc[(school_data_complete['math_score'] >= 70)]
math_pass = math_pass_df["Student ID"].count()

# read_pass = number of students that have a read score >= 70
read_pass_df = school_data_complete.loc[(school_data_complete['reading_score'] >= 70)]
read_pass = read_pass_df["Student ID"].count()

# pass = number of students that have a math AND reading score >= 70
pass_df = school_data_complete.loc[(school_data_complete['math_score'] >= 70) & (school_data_complete['reading_score'] >= 70), :]
pass_grade = pass_df["Student ID"].count()

# percentage of students with passing math grade
per_math_pass = (math_pass/total_students) * 100

# percentage of students with passing reading grade
per_read_pass = (read_pass/total_students) * 100

# percentage of students with both passing reading and math grades
per_pass = (pass_grade/total_students) * 100

total_schools, total_students, total_budget, math_average, read_average, math_pass, read_pass, per_pass

(15,
 39170,
 24649428,
 78.98537145774827,
 81.87784018381414,
 29370,
 33610,
 65.17232575950983)

In [64]:
#create District Summary data frame
district_summary = pd.DataFrame([
    {"Total Schools": total_schools,
    "Total Students": total_students,
    "Total Budget": total_budget,
    "Average Math Score": math_average,
    "Average Reading Schore": read_average,
    "% Passing Math": per_math_pass,
    "% Passing Reading": per_read_pass,
    "% Overall Passing": per_pass,}
])
district_summary

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Schore,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


In [65]:
# Extract the columns with numerical values
school_scores = school_data_complete[["school_name", "reading_score", "math_score", "size"]]

#group by school and calculate the mean
school_group = school_scores.groupby(["school_name"])

avg_scores = school_group.mean()

# Using .rename(columns={}) in order to rename columns
renamed_avg_scores = avg_scores.rename(columns={"reading_score":"Reading Average", "math_score":"Math Average", "size":"Total Students"})
renamed_avg_scores

Unnamed: 0_level_0,Reading Average,Math Average,Total Students
school_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bailey High School,81.033963,77.048432,4976.0
Cabrera High School,83.97578,83.061895,1858.0
Figueroa High School,81.15802,76.711767,2949.0
Ford High School,80.746258,77.102592,2739.0
Griffin High School,83.816757,83.351499,1468.0
Hernandez High School,80.934412,77.289752,4635.0
Holden High School,83.814988,83.803279,427.0
Huang High School,81.182722,76.629414,2917.0
Johnson High School,80.966394,77.072464,4761.0
Pena High School,84.044699,83.839917,962.0


In [66]:
# Filter rows so that only rows with passing math scores are shown
passing_math_scores = school_scores.loc[(school_scores['math_score'] >= 70)]

# Group by school and count rows
passing_math_scores_group = passing_math_scores.groupby(["school_name"])

num_pass_math = passing_math_scores_group.count()

# Drop unnecessary columns
drop_read = num_pass_math.drop('reading_score', 1)
new_num_pass_math = drop_read.drop('size', 1)
new_num_pass_math

Unnamed: 0_level_0,math_score
school_name,Unnamed: 1_level_1
Bailey High School,3318
Cabrera High School,1749
Figueroa High School,1946
Ford High School,1871
Griffin High School,1371
Hernandez High School,3094
Holden High School,395
Huang High School,1916
Johnson High School,3145
Pena High School,910


In [67]:
# Filter rows so that only rows with passing reading scores are shown
passing_read_scores = school_scores.loc[(school_scores['reading_score'] >= 70)]

# Group by school and count rows
passing_read_scores_group = passing_read_scores.groupby(["school_name"])

num_pass_read = passing_read_scores_group.count()

# Drop unnecessary columns
drop_math = num_pass_read.drop('math_score', 1)
new_num_pass_read = drop_math.drop('size', 1)
new_num_pass_read

Unnamed: 0_level_0,reading_score
school_name,Unnamed: 1_level_1
Bailey High School,4077
Cabrera High School,1803
Figueroa High School,2381
Ford High School,2172
Griffin High School,1426
Hernandez High School,3748
Holden High School,411
Huang High School,2372
Johnson High School,3867
Pena High School,923


In [68]:
# Filter rows so that only rows with passing reading and math scores are shown
passing_both_scores = school_scores.loc[(school_scores['reading_score'] >= 70) & (school_scores['math_score'] >= 70)]
passing_both_scores

# Group by school and count rows
passing_both_scores_group = passing_both_scores.groupby(["school_name"])

num_pass_both = passing_both_scores_group.count()

# Drop unnecessary columns
drop_it = num_pass_both.drop('reading_score', 1)
new_num_pass_both = drop_it.drop('math_score', 1)
new_num_pass_both

Unnamed: 0_level_0,size
school_name,Unnamed: 1_level_1
Bailey High School,2719
Cabrera High School,1697
Figueroa High School,1569
Ford High School,1487
Griffin High School,1330
Hernandez High School,2481
Holden High School,381
Huang High School,1561
Johnson High School,2549
Pena High School,871


In [69]:
# Merge dataframes using an outer join
merge1 = pd.merge(renamed_avg_scores, new_num_pass_math, on="school_name", how="outer")
merge2 = pd.merge(merge1, new_num_pass_read, on="school_name", how="outer")
merge3 = pd.merge(merge2, new_num_pass_both, on="school_name", how="outer")
merge4 = pd.merge(merge3, school_data, on="school_name", how="outer")
merge4

Unnamed: 0,school_name,Reading Average,Math Average,Total Students,math_score,reading_score,size_x,School ID,type,size_y,budget
0,Bailey High School,81.033963,77.048432,4976.0,3318,4077,2719,7,District,4976,3124928
1,Cabrera High School,83.97578,83.061895,1858.0,1749,1803,1697,6,Charter,1858,1081356
2,Figueroa High School,81.15802,76.711767,2949.0,1946,2381,1569,1,District,2949,1884411
3,Ford High School,80.746258,77.102592,2739.0,1871,2172,1487,13,District,2739,1763916
4,Griffin High School,83.816757,83.351499,1468.0,1371,1426,1330,4,Charter,1468,917500
5,Hernandez High School,80.934412,77.289752,4635.0,3094,3748,2481,3,District,4635,3022020
6,Holden High School,83.814988,83.803279,427.0,395,411,381,8,Charter,427,248087
7,Huang High School,81.182722,76.629414,2917.0,1916,2372,1561,0,District,2917,1910635
8,Johnson High School,80.966394,77.072464,4761.0,3145,3867,2549,12,District,4761,3094650
9,Pena High School,84.044699,83.839917,962.0,910,923,871,9,Charter,962,585858


In [70]:
#Add Per Student Budget and Percent Passing
merge4["Per Student Budget"] = merge4["budget"]/merge4["Total Students"]
merge4["% Passing Math"] = (merge4["math_score"]/merge4["Total Students"]) * 100
merge4["% Passing Reading"] = (merge4["reading_score"]/merge4["Total Students"]) * 100
merge4["% Passing Overall"] = (merge4["size_x"]/merge4["Total Students"]) * 100
merge4.head()

# Drop unnecessary columns
drop1 = merge4.drop('School ID', 1)
drop2 = drop1.drop('size_y', 1)
drop3 = drop2.drop('size_x', 1)
drop4 = drop3.drop('math_score', 1)
school_summary = drop4.drop('reading_score', 1)
school_summary.head()

# Reorganize Columns
organized_school_summary = school_summary[["school_name","type","Total Students","budget","Per Student Budget","Math Average","Reading Average","% Passing Math","% Passing Reading", "% Passing Overall"]]
renamed_school_summary = organized_school_summary.rename(columns={"school_name":"School Name", "Reading Average":"Average Reading Score", "Math Average":"Average Math Score", "type":"Type", "budget":"Total Budget"})
renamed_school_summary


Unnamed: 0,School Name,Type,Total Students,Total Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Passing Overall
0,Bailey High School,District,4976.0,3124928,628.0,77.048432,81.033963,66.680064,81.93328,54.642283
1,Cabrera High School,Charter,1858.0,1081356,582.0,83.061895,83.97578,94.133477,97.039828,91.334769
2,Figueroa High School,District,2949.0,1884411,639.0,76.711767,81.15802,65.988471,80.739234,53.204476
3,Ford High School,District,2739.0,1763916,644.0,77.102592,80.746258,68.309602,79.299014,54.289887
4,Griffin High School,Charter,1468.0,917500,625.0,83.351499,83.816757,93.392371,97.138965,90.599455
5,Hernandez High School,District,4635.0,3022020,652.0,77.289752,80.934412,66.752967,80.862999,53.527508
6,Holden High School,Charter,427.0,248087,581.0,83.803279,83.814988,92.505855,96.252927,89.227166
7,Huang High School,District,2917.0,1910635,655.0,76.629414,81.182722,65.683922,81.316421,53.513884
8,Johnson High School,District,4761.0,3094650,650.0,77.072464,80.966394,66.057551,81.222432,53.539172
9,Pena High School,Charter,962.0,585858,609.0,83.839917,84.044699,94.594595,95.945946,90.540541
