In [4]:
# import dependacies 
import os 
import csv 
import pandas as pd 
import numpy as np 

In [5]:
# set csv paths 
schools_complete = "raw_data/schools_complete.csv"
students_complete = "raw_data/students_complete.csv"

In [6]:
# read schools csv 
schools_pd = pd.read_csv(schools_complete, low_memory=False)

schools_pd.head()

Unnamed: 0,School ID,name,type,size,budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [7]:
# rename and format columns in schools df 
schools_organized_df = schools_pd.rename(columns={"name":"School Name", "type": "Type", 
                                                   "size":"School Size", 
                                                   "budget": "Total Budget"})
schools_organized_df.head()

Unnamed: 0,School ID,School Name,Type,School Size,Total Budget
0,0,Huang High School,District,2917,1910635
1,1,Figueroa High School,District,2949,1884411
2,2,Shelton High School,Charter,1761,1056600
3,3,Hernandez High School,District,4635,3022020
4,4,Griffin High School,Charter,1468,917500


In [8]:
# total distrcit budget **Actual District Budget**
total_budget = schools_pd["budget"].sum()
total_budget 

24649428

In [9]:
total_student_count = schools_pd["size"].sum()
total_student_count

39170

In [10]:
students_pd = pd.read_csv(students_complete, low_memory=False)
students_pd.head()

Unnamed: 0,Student ID,name,gender,grade,school,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [11]:
# rename and format columns in student df 
students_organized_df = students_pd.rename(columns={"name":"Student Name", "gender": "Gender", "school":"School Name", 
                                                    "grade": "Grade", "reading_score": "Reading Score", 
                                                    "math_score": "Math Score"})
students_organized_df.head()

Unnamed: 0,Student ID,Student Name,Gender,Grade,School Name,Reading Score,Math Score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [12]:
# merge extracts into new data frame using inner join 
ds_combined = pd.merge(schools_organized_df, students_organized_df, on="School Name", how="outer")
ds_combined.head()

Unnamed: 0,School ID,School Name,Type,School Size,Total Budget,Student ID,Student Name,Gender,Grade,Reading Score,Math Score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84


In [13]:
# total school count
total_school_count = len(ds_combined["School Name"].unique())
total_school_count

15

In [14]:
# total student count by school 
sc_per_school = ds_combined["School Name"].value_counts()
sc_per_school

Bailey High School       4976
Johnson High School      4761
Hernandez High School    4635
Rodriguez High School    3999
Figueroa High School     2949
Huang High School        2917
Ford High School         2739
Wilson High School       2283
Cabrera High School      1858
Wright High School       1800
Shelton High School      1761
Thomas High School       1635
Griffin High School      1468
Pena High School          962
Holden High School        427
Name: School Name, dtype: int64

In [15]:
# total student count 
total_student_count = sc_per_school.sum()
total_student_count

39170

In [16]:
tb_per_school = ds_combined.groupby(["School Name"]).mean()["Total Budget"]
tb_per_school.head()

School Name
Bailey High School      3124928.0
Cabrera High School     1081356.0
Figueroa High School    1884411.0
Ford High School        1763916.0
Griffin High School      917500.0
Name: Total Budget, dtype: float64

In [17]:
total_ds_budget = pd.DataFrame({"Total Budget": tb_per_school}).sum()
total_ds_budget

Total Budget    24649428.0
dtype: float64

In [18]:
# calculate average Reading and Math Scores (** this will be used in future tables)
avg_rs = ds_combined["Reading Score"].mean() 
avg_ms = ds_combined["Math Score"].mean() 
print(avg_rs)
print(avg_ms)

81.87784018381414
78.98537145774827


In [19]:
# extract reading scores greater than 70% 

rs_passing = ds_combined.loc[ds_combined["Reading Score"] >= 70,:]
rs_passing.head()

Unnamed: 0,School ID,School Name,Type,School Size,Total Budget,Student ID,Student Name,Gender,Grade,Reading Score,Math Score
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61
2,0,Huang High School,District,2917,1910635,2,Kevin Rodriguez,M,12th,90,60
4,0,Huang High School,District,2917,1910635,4,Bonnie Ray,F,9th,97,84
5,0,Huang High School,District,2917,1910635,5,Bryan Miranda,M,9th,94,94
6,0,Huang High School,District,2917,1910635,6,Sheena Carter,F,11th,82,80


In [71]:
# extract reading scores below 70% 

rs_failing = ds_combined.loc[ds_combined["Reading Score"] < 70,:]
rs_failing.head()

Unnamed: 0,School ID,School Name,Type,School Size,Total Budget,Student ID,Student Name,Gender,Grade,Reading Score,Math Score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
3,0,Huang High School,District,2917,1910635,3,Dr. Richard Scott,M,12th,67,58
12,0,Huang High School,District,2917,1910635,12,Brittney Walker,F,9th,64,79
18,0,Huang High School,District,2917,1910635,18,Kevin Stevens,M,9th,64,69
26,0,Huang High School,District,2917,1910635,26,Melanie Decker,F,9th,63,85


In [85]:
school_types = ds_combined.groupby("Type").count()
school_types

Unnamed: 0_level_0,School ID,School Name,School Size,Total Budget,Student ID,Student Name,Gender,Grade,Reading Score,Math Score
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Charter,12194,12194,12194,12194,12194,12194,12194,12194,12194,12194
District,26976,26976,26976,26976,26976,26976,26976,26976,26976,26976


In [86]:
# rs passing  
rs_passing_total = rs_passing["Reading Score"].mean()
rs_passing_total

84.47325200833086

In [87]:
# rs failing  

rs_failing_total = rs_failing["Reading Score"].mean()
rs_failing_total

66.1886690647482

In [88]:
# rs total passing 

passing_reading_total = (rs_failing_total / rs_passing_total) * 100
passing_reading_total

78.35458857227444

In [89]:
ms_passing = ds_combined.loc[ds_combined["Math Score"] >= 70,:]

In [90]:
ms_failing = ds_combined.loc[ds_combined["Math Score"] < 70,:]

In [91]:
# ms passing  
ms_passing_total = ms_passing["Math Score"].mean()
ms_passing_total

84.4712631937351

In [92]:
# ms failing 
ms_failing_total = ms_failing["Math Score"].mean()
ms_failing_total 

62.544489795918366

In [93]:
# ms passing 
passing_math_total = (ms_failing_total / ms_passing_total) * 100
passing_math_total

74.04232804293738

In [98]:
# overall % passing 
overall_passing = ((ms_passing_total + rs_passing_total)/2)
overall_passing

84.47225760103298

In [97]:
# school budget per student 
budget_per_student = (ds_combined["Total Budget"])/(ds_combined["School Size"])

In [96]:
# School Summary 

# set index to School Name 

summary_df = ds_combined.groupby(["School Name"]).mean()
summary_df.head()

Unnamed: 0_level_0,School ID,School Size,Total Budget,Student ID,Reading Score,Math Score
School Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bailey High School,7.0,4976.0,3124928.0,20358.5,81.033963,77.048432
Cabrera High School,6.0,1858.0,1081356.0,16941.5,83.97578,83.061895
Figueroa High School,1.0,2949.0,1884411.0,4391.0,81.15802,76.711767
Ford High School,13.0,2739.0,1763916.0,36165.0,80.746258,77.102592
Griffin High School,4.0,1468.0,917500.0,12995.5,83.816757,83.351499


In [84]:
schools_final = pd.DataFrame({"School Type": school_types,  
                              "Total Budget": ,
                              "Budget per Student": budget_per_student,
                              "Average Math Score %": avg_ms, 
                              "Average Reading Score %": avg_rs, 
                              "Passing Reading %": passing_reading_total, 
                              "Passing Math %": passing_math_total, 
                              "Overall Passing %": overall_passing})
schools_final

ValueError: array length 1 does not match index length 39170

In [53]:
schools_final = summary_df.DataFrame({"School Type": ["Type"],
                          "Total Budget": ["Total Budget"], 
                          "Average Math Score %": ["Reading Score"], 
                          "Average Reading Score %": ["Math Score"], 
                          "Passing Reading %": passing_reading_total, 
                          "Passing Math %": passing_math_total, 
                          "Overall Passing %": overall_passing})
schools_final

AttributeError: 'DataFrame' object has no attribute 'DataFrame'

In [None]:
# format
summary_df["Total Budget"] = summary_df["Total Budget"].map("${:,}".format)
summary_df["Total Schools"] = summary_df["Total Schools"].map("{:,}".format)

In [None]:
# reorganize 
summary_df = summary_df[["Total Schools","Total Budget","Average Math Score %",
                         "Average Reading Score %", "Passing Reading %", "Passing Math %", "Overall Passing %"]]