In [2]:
# Dependencies and Setup
import pandas as pd
import numpy as np

# File to Load (Remember to Change These)
school_raw = "Resources/schools_complete.csv"
student_raw = "Resources/students_complete.csv"

# Read School and Student Data File and store into Pandas Data Frames
scdf = pd.read_csv(school_raw)
stdf = pd.read_csv(student_raw)

# Combine the data into a single dataset
cdf = pd.merge(scdf, stdf, how="left", on=["school_name", "school_name"])
cdf.head(2)

Unnamed: 0,School ID,school_name,type,size,budget,Student ID,student_name,gender,grade,reading_score,math_score
0,0,Huang High School,District,2917,1910635,0,Paul Bradley,M,9th,66,79
1,0,Huang High School,District,2917,1910635,1,Victor Smith,M,12th,94,61


In [3]:
ccdf = cdf[['size', 'reading_score', 'math_score']]
ccdf = pd.DataFrame(ccdf)
ccdf['bps'] = cdf['budget'] / cdf['size']
ccdf['pm'] = cdf['math_score'] >= 70
ccdf['pr'] = cdf['reading_score'] >= 70
ccdf

Unnamed: 0,size,reading_score,math_score,bps,pm,pr
0,2917,66,79,655.0,True,False
1,2917,94,61,655.0,False,True
2,2917,90,60,655.0,False,True
3,2917,67,58,655.0,False,False
4,2917,97,84,655.0,True,True
...,...,...,...,...,...,...
39165,1635,99,90,638.0,True,True
39166,1635,95,70,638.0,True,True
39167,1635,73,84,638.0,True,True
39168,1635,99,90,638.0,True,True


In [4]:
bins = [0, 1000, 2000, 5000]
group_names = ["Small (<1000)", "Medium (1000-2000)", "Large (2000-5000)"]
ccdf['School Size'] = pd.cut(ccdf["size"], bins, labels=group_names)
ccdf

Unnamed: 0,size,reading_score,math_score,bps,pm,pr,School Size
0,2917,66,79,655.0,True,False,Large (2000-5000)
1,2917,94,61,655.0,False,True,Large (2000-5000)
2,2917,90,60,655.0,False,True,Large (2000-5000)
3,2917,67,58,655.0,False,False,Large (2000-5000)
4,2917,97,84,655.0,True,True,Large (2000-5000)
...,...,...,...,...,...,...,...
39165,1635,99,90,638.0,True,True,Medium (1000-2000)
39166,1635,95,70,638.0,True,True,Medium (1000-2000)
39167,1635,73,84,638.0,True,True,Medium (1000-2000)
39168,1635,99,90,638.0,True,True,Medium (1000-2000)


In [5]:
gbo = ccdf.groupby('School Size')
gbo.count().head()

Unnamed: 0_level_0,size,reading_score,math_score,bps,pm,pr
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Small (<1000),1389,1389,1389,1389,1389,1389
Medium (1000-2000),8522,8522,8522,8522,8522,8522
Large (2000-5000),29259,29259,29259,29259,29259,29259


In [6]:
gdf = gbo.agg({  "reading_score":"mean",
                  "math_score":"mean", 
                  "pm":"sum", 
                  "pr":"sum"
               })
gdf['tc'] = gbo['size'].count()
gdf

Unnamed: 0_level_0,reading_score,math_score,pm,pr,tc
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.974082,83.828654,1305.0,1334.0,1389
Medium (1000-2000),83.867989,83.372682,7978.0,8247.0,8522
Large (2000-5000),81.198674,77.477597,20087.0,24029.0,29259


In [10]:
gdf['% Passing Math'] = gdf.pm / gdf.tc
gdf['% Passing Reading'] = gdf.pr / gdf.tc
gdf['% Overall Passing Rate'] = (gdf['% Passing Reading'] + gdf['% Passing Math']) / 2
gdf

Unnamed: 0_level_0,reading_score,math_score,pm,pr,tc,% Passing Math,% Passing Reading,% Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Small (<1000),83.974082,83.828654,1305.0,1334.0,1389,0.939525,0.960403,0.949964
Medium (1000-2000),83.867989,83.372682,7978.0,8247.0,8522,0.936165,0.967731,0.951948
Large (2000-5000),81.198674,77.477597,20087.0,24029.0,29259,0.686524,0.821252,0.753888


In [13]:
fdf = gdf.rename(columns={'reading_score':'Average Reading Score',
                           'math_score':'Average Math Score'})

fdf = fdf[[ 'Average Math Score','Average Reading Score', '% Passing Math', '% Passing Reading', '% Overall Passing Rate']]

fdf.style

Unnamed: 0_level_0,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing Rate
School Size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Small (<1000),83.8287,83.9741,0.939525,0.960403,0.949964
Medium (1000-2000),83.3727,83.868,0.936165,0.967731,0.951948
Large (2000-5000),77.4776,81.1987,0.686524,0.821252,0.753888


In [19]:
print(gcdf.pm / gcdf.tc)
print(5967/6368)
print((.937+.966)/2)

Spending Ranges(Per Student)
<$585       0.937029
$585-615    0.941241
$615-645    0.714004
$645-675    0.662308
dtype: float64
0.9370288944723618
0.9515
