In [1]:
import pandas as pd

In [5]:
# Locations of the two input CSV files (relative paths)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = "Resources/students_complete.csv"

# Read each file into its own DataFrame, then preview the students data
schools_df, students_df = (
    pd.read_csv(csv_path)
    for csv_path in (school_data_to_load, student_data_to_load)
)
students_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Dr. Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84


In [7]:
# Non-null entries per column of the schools data; a column whose total is
# lower than the others contains missing values.
schools_df.count(axis="index")

School ID      15
school_name    15
type           15
size           15
budget         15
dtype: int64

In [6]:
# Non-null entries per column of the students data; a column whose total is
# lower than the others contains missing values.
students_df.count(axis="index")

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [8]:
# Cell-by-cell missing-value mask for the schools data (True marks a missing entry)
schools_df.isna()

Unnamed: 0,School ID,school_name,type,size,budget
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
5,False,False,False,False,False
6,False,False,False,False,False
7,False,False,False,False,False
8,False,False,False,False,False
9,False,False,False,False,False


In [14]:
# Cell-by-cell missing-value mask for the students data (True marks a missing entry)
students_df.isna()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
39165,False,False,False,False,False,False,False
39166,False,False,False,False,False,False,False
39167,False,False,False,False,False,False,False
39168,False,False,False,False,False,False,False


In [13]:
# Number of missing entries in each column of the students data
# (summing the boolean mask counts the True values)
students_df.isna().sum()

Student ID       0
student_name     0
gender           0
grade            0
school_name      0
reading_score    0
math_score       0
dtype: int64

In [16]:
# Cell-by-cell mask of the schools data showing which entries are present
# (True marks a value that is not missing)
schools_df.notna()

Unnamed: 0,School ID,school_name,type,size,budget
0,True,True,True,True,True
1,True,True,True,True,True
2,True,True,True,True,True
3,True,True,True,True,True
4,True,True,True,True,True
5,True,True,True,True,True
6,True,True,True,True,True
7,True,True,True,True,True
8,True,True,True,True,True
9,True,True,True,True,True


In [18]:
# Number of present (non-missing) entries in each column of the students data
students_df.notna().sum()

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
dtype: int64

In [19]:
# Data type of every column in the schools data
schools_df.dtypes

School ID       int64
school_name    object
type           object
size            int64
budget          int64
dtype: object

In [20]:
# Data type of the budget column, selected with bracket notation
schools_df["budget"].dtype

dtype('int64')

In [21]:
# Data type of the budget column, selected with attribute (dot) notation
schools_df.budget.dtype

dtype('int64')

In [22]:
# Data type of every column in the students data
students_df.dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object

In [24]:
# Professional titles to strip from student names.
# Prefixes carry a trailing space and suffixes a leading space, so removing
# one also removes the separator and cannot touch letters inside a name.
prefixes_suffixes = ['Dr. ', 'Mr. ', 'Mrs. ', 'Miss ', 'Ms. ', ' MD', ' DDS', ' DVM', ' PhD']

In [26]:
# Remove all of the non-familial prefixes and suffixes.
# regex=False makes every entry a literal substring. Interpreted as a regular
# expression, the "." in entries such as "Dr. " would match any character,
# and the default for `regex` differs between pandas versions, so it is
# stated explicitly here.
for word in prefixes_suffixes:
    students_df["student_name"] = students_df["student_name"].str.replace(word, "", regex=False)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90


In [28]:
# Combine the school and student data into a single dataset.
# Each student row picks up the columns of its school, matched on the shared
# "school_name" column. validate="many_to_one" makes pandas raise a
# MergeError if a school name occurs more than once in schools_df, which
# would otherwise silently duplicate student rows and inflate every count
# computed from the merged data.
school_data_complete_df = pd.merge(
    students_df, schools_df, on="school_name", validate="many_to_one"
)
school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [29]:
# Count the non-null entries in every column of the merged data.
# The per-column Series gets its own name so that `student_count` only ever
# holds the scalar total: other cells call float(student_count), which fails
# on a Series if the cells are run out of order.
column_counts = school_data_complete_df.count()
column_counts

Student ID       39170
student_name     39170
gender           39170
grade            39170
school_name      39170
reading_score    39170
math_score       39170
School ID        39170
type             39170
size             39170
budget           39170
dtype: int64

In [33]:
# Counting the whole DataFrame yields one total per column; since those
# totals are all equal, counting a single column is enough to get the
# number of students in the district.
student_count = school_data_complete_df.loc[:, "student_name"].count()
student_count

39170

In [37]:
# Number of schools = number of distinct values in the school_name column
# (dropna=False counts a missing name as its own value, matching unique())
school_count = school_data_complete_df["school_name"].nunique(dropna=False)
school_count

15

In [40]:
# District-wide budget: add up the budget of every school.
# The schools table is used (not the merged one) so each school counts once.
total_budget = schools_df.loc[:, "budget"].sum()
total_budget

24649428

In [41]:
# District-wide average (mean) reading score across all students
mean_reading_score = school_data_complete_df.loc[:, "reading_score"].mean()
mean_reading_score

81.87784018381414

In [43]:
# District-wide average (mean) math score across all students
mean_math_score = school_data_complete_df.loc[:, "math_score"].mean()
mean_math_score

78.98537145774827

In [46]:
# Boolean Series aligned with the merged data: True where the math score
# is 70 or higher, False otherwise.
passing_math = school_data_complete_df["math_score"].ge(70)
passing_math

0         True
1        False
2        False
3        False
4         True
         ...  
39165     True
39166     True
39167     True
39168     True
39169     True
Name: math_score, Length: 39170, dtype: bool

In [47]:
# Boolean-mask filtering: keep only the rows of the merged data whose math
# score is 70 or higher, i.e. the students passing math.
passing_math_df = school_data_complete_df.loc[
    school_data_complete_df["math_score"].ge(70)
]
passing_math_df

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [49]:
# Boolean-mask filtering for reading: keep only the rows of the merged data
# whose reading score is 70 or higher.
passing_reading_df = school_data_complete_df.loc[
    school_data_complete_df["reading_score"].ge(70)
]
passing_reading_df

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [58]:
# Number of students passing each subject (math first, then reading)
passing_math_count, passing_reading_count = (
    passed_df["student_name"].count()
    for passed_df in (passing_math_df, passing_reading_df)
)
# Each count expressed as a percentage of all students in the district
passing_math_pct, passing_reading_pct = (
    passed_count / float(student_count) * 100
    for passed_count in (passing_math_count, passing_reading_count)
)

In [59]:
# One line per subject: "<count> <percentage>", math first and then reading
print(
    f"{passing_math_count} {passing_math_pct}",
    f"{passing_reading_count} {passing_reading_pct}",
    sep="\n",
)

29370 74.9808526933878
33610 85.80546336482001


In [63]:
# Students who passed both subjects: combine the two per-subject masks with
# an element-wise AND and keep only the rows where both are True.
passing_reading_and_math_df = school_data_complete_df.loc[
    school_data_complete_df["reading_score"].ge(70)
    & school_data_complete_df["math_score"].ge(70)
]
passing_reading_and_math_df

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635
5,5,Bryan Miranda,M,9th,Huang High School,94,94,0,District,2917,1910635
6,6,Sheena Carter,F,11th,Huang High School,82,80,0,District,2917,1910635
8,8,Michael Roth,M,10th,Huang High School,95,87,0,District,2917,1910635
9,9,Matthew Greene,M,10th,Huang High School,96,84,0,District,2917,1910635
...,...,...,...,...,...,...,...,...,...,...,...
39165,39165,Donna Howard,F,12th,Thomas High School,99,90,14,Charter,1635,1043130
39166,39166,Dawn Bell,F,10th,Thomas High School,95,70,14,Charter,1635,1043130
39167,39167,Rebecca Tanner,F,9th,Thomas High School,73,84,14,Charter,1635,1043130
39168,39168,Desiree Kidd,F,10th,Thomas High School,99,90,14,Charter,1635,1043130


In [65]:
# Number of students who passed both reading and math
passing_rdg_and_math_count = passing_reading_and_math_df.loc[:, "student_name"].count()
passing_rdg_and_math_count

25528

In [67]:
# Students who passed both subjects, as a percentage of all students
passing_rdg_and_math_pct = passing_rdg_and_math_count / student_count * 100
passing_rdg_and_math_pct

65.17232575950983

In [116]:
# District summary as a one-row DataFrame: a single record (dict) whose keys
# become the column headers.
metrics = [
    {
        "Total Schools": school_count,
        "Total Students": student_count,
        "Total Budget": total_budget,
        "Average Math Score": mean_math_score,
        "Average Reading Score": mean_reading_score,
        "% Passing Math": passing_math_pct,
        "% Passing Reading": passing_reading_pct,
        "% Overall Passing": passing_rdg_and_math_pct,
    }
]
metrics_df = pd.DataFrame(data=metrics)
metrics_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


In [117]:
# Format the summary columns for display using pandas.Series.map. Python's
# format function turns every value into a string, so a formatted column is
# text from then on.
column_formats = {
    "Total Students": "{:,}",
    "Total Budget": "${:,.2f}",
    "Average Math Score": "{:.1f}",
    "Average Reading Score": "{:.1f}",
    "% Passing Math": "{:.0f}",
    "% Passing Reading": "{:.0f}",
    "% Overall Passing": "{:.0f}",
}
for column, template in column_formats.items():
    # Only format columns that are still numeric. Applying a numeric format
    # spec to already-formatted text raises ValueError, so this check lets
    # the cell be run more than once.
    if pd.api.types.is_numeric_dtype(metrics_df[column]):
        metrics_df[column] = metrics_df[column].map(template.format)


In [118]:
# Display the district summary
metrics_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65


In [121]:
# Column-reordering example: selecting with a list of labels returns the
# columns in exactly that sequence. (The columns are already in this order;
# the cell is kept to show the technique.)
new_column_order = [
    "Total Schools",
    "Total Students",
    "Total Budget",
    "Average Math Score",
    "Average Reading Score",
    "% Passing Math",
    "% Passing Reading",
    "% Overall Passing",
]
metrics_df = metrics_df.loc[:, new_column_order]
metrics_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65
