In [22]:
# import pandas library
import pandas as pd

# import os to test relative join
import os

# files to load (first using a direct path, second built with os.path.join for practice)
school_data_to_load = "Resources/schools_complete.csv"
student_data_to_load = os.path.join("Resources", "students_complete.csv")

# read the files and store each one as a pandas DataFrame
school_data_df = pd.read_csv(school_data_to_load)
student_data_df = pd.read_csv(student_data_to_load)

# A notebook cell only renders its final expression, so each intermediate
# check is passed to display() to make sure it appears in the cell output.

# count the missing values in each column of the school data
display(school_data_df.isnull().sum())

# count the missing values in each column of the student data
display(student_data_df.isnull().sum())

# determine data types for the school DataFrame
display(school_data_df.dtypes)

# determine data types for the student DataFrame
student_data_df.dtypes

Student ID        int64
student_name     object
gender           object
grade            object
school_name      object
reading_score     int64
math_score        int64
dtype: object

In [25]:
# titles (with trailing space) and credentials (with leading space) to strip from student names
prefixes_suffixes = ["Dr. ", "Mr. ", "Ms. ", "Mrs. ", "Miss ", " MD", " DDS", " DVM", " PhD"]

# Remove every affix by replacing it with an empty string ("").
# regex=False forces a literal match: pandas versions before 2.0 treat the
# pattern as a regular expression by default, in which the "." of "Dr. " would
# match any character. Being explicit gives the same result on every version.
for word in prefixes_suffixes:
    student_data_df["student_name"] = student_data_df["student_name"].str.replace(word, "", regex=False)

student_data_df.head(10)

  


Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score
0,0,Paul Bradley,M,9th,Huang High School,66,79
1,1,Victor Smith,M,12th,Huang High School,94,61
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60
3,3,Richard Scott,M,12th,Huang High School,67,58
4,4,Bonnie Ray,F,9th,Huang High School,97,84
5,5,Bryan Miranda,M,9th,Huang High School,94,94
6,6,Sheena Carter,F,11th,Huang High School,82,80
7,7,Nicole Baker,F,12th,Huang High School,96,69
8,8,Michael Roth,M,10th,Huang High School,95,87
9,9,Matthew Greene,M,10th,Huang High School,96,84


In [26]:
# Combine the student and school DataFrames into a single dataset, joining each
# student row to its school through the shared "school_name" column.
# validate="many_to_one" makes pandas raise a MergeError if a school name occurs
# more than once in school_data_df, which would otherwise silently duplicate
# student rows and inflate every count computed from the merged frame.
school_data_complete_df = pd.merge(
    student_data_df,
    school_data_df,
    on="school_name",
    validate="many_to_one",
)

school_data_complete_df.head()

Unnamed: 0,Student ID,student_name,gender,grade,school_name,reading_score,math_score,School ID,type,size,budget
0,0,Paul Bradley,M,9th,Huang High School,66,79,0,District,2917,1910635
1,1,Victor Smith,M,12th,Huang High School,94,61,0,District,2917,1910635
2,2,Kevin Rodriguez,M,12th,Huang High School,90,60,0,District,2917,1910635
3,3,Richard Scott,M,12th,Huang High School,67,58,0,District,2917,1910635
4,4,Bonnie Ray,F,9th,Huang High School,97,84,0,District,2917,1910635


In [30]:
# total number of students: the count of non-null entries in the "Student ID" column
student_count = school_data_complete_df.loc[:, "Student ID"].count()
student_count

39170

In [40]:
# get the total number of unique schools from the merged DataFrame
# school_count is the name read when the district summary DataFrame is built,
# so it must be assigned here for the notebook to run top to bottom on a fresh kernel.
school_count = len(school_data_complete_df["school_name"].unique())

# keep the original variable name available as an alias of the same value
school_count_2 = school_count

school_count_2

15

In [41]:
# total budget: the sum of the "budget" column of school_data_df
total_budget = school_data_df.loc[:, "budget"].sum()

total_budget

24649428

In [43]:
# mean reading score and mean math score over every row of the merged DataFrame
average_reading_score, average_math_score = (
    school_data_complete_df[score_column].mean()
    for score_column in ("reading_score", "math_score")
)

# the cell renders the math average
average_math_score

78.98537145774827

In [45]:
# boolean masks marking the rows whose score is 70 or higher in each subject
passing_math = school_data_complete_df["math_score"].ge(70)
passing_reading = school_data_complete_df["reading_score"].ge(70)


In [56]:
# rows of the merged DataFrame with a math score of 70 or higher
passing_math = school_data_complete_df.loc[school_data_complete_df["math_score"].ge(70)]

# rows of the merged DataFrame with a reading score of 70 or higher
passing_reading = school_data_complete_df.loc[school_data_complete_df["reading_score"].ge(70)]


Student ID       29370
student_name     29370
gender           29370
grade            29370
school_name      29370
reading_score    29370
math_score       29370
School ID        29370
type             29370
size             29370
budget           29370
dtype: int64

In [61]:
# number of students passing math (non-null names in the passing-math rows)
passing_math_count = passing_math.loc[:, "student_name"].count()

# number of students passing reading (non-null names in the passing-reading rows)
passing_reading_count = passing_reading.loc[:, "student_name"].count()

# print both counts, one per line
print(passing_math_count, passing_reading_count, sep="\n")

29370
33610


In [66]:
# share of all students who passed math, expressed as a percentage
passing_math_percentage = 100 * (passing_math_count / float(student_count))

# share of all students who passed reading, expressed as a percentage
passing_reading_percentage = 100 * (passing_reading_count / float(student_count))

# print both percentages, one per line
print(passing_math_percentage, passing_reading_percentage, sep="\n")

74.9808526933878
85.80546336482001


In [73]:
# keep only the students who scored 70 or higher in both math and reading
passing_math_reading = school_data_complete_df.loc[
    school_data_complete_df["math_score"].ge(70)
    & school_data_complete_df["reading_score"].ge(70)
]

# number of students who passed both subjects
overall_passing_math_reading_count = passing_math_reading.loc[:, "student_name"].count()

# overall passing rate as a percentage of all students
overall_passing_percentage = 100 * (overall_passing_math_reading_count / float(student_count))

overall_passing_percentage

65.17232575950983

In [106]:
# Build a one-row DataFrame of district-wide metrics from a list holding a
# single dict (keys become the column names).
# "Total Schools" uses school_count_2, the unique-school count computed from
# the merged DataFrame.
district_summary_df = pd.DataFrame(
    [
        {
            "Total Schools": school_count_2,
            "Total Students": student_count,
            "Total Budget": total_budget,
            "Average Math Score": average_math_score,
            "Average Reading Score": average_reading_score,
            "% Passing Math": passing_math_percentage,
            "% Passing Reading": passing_reading_percentage,
            "% Overall Passing": overall_passing_percentage,
        }
    ]
)
district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,24649428,78.985371,81.87784,74.980853,85.805463,65.172326


In [107]:
# render "Total Students" as text with a comma as the thousands separator
district_summary_df["Total Students"] = district_summary_df["Total Students"].map(
    lambda total: f"{total:,}"
)

district_summary_df["Total Students"]

0    39,170
Name: Total Students, dtype: object

In [108]:
# render "Total Budget" as text: leading $, comma thousands separator, two decimals
district_summary_df["Total Budget"] = district_summary_df["Total Budget"].map(
    lambda amount: f"${amount:,.2f}"
)

district_summary_df["Total Budget"]

0    $24,649,428.00
Name: Total Budget, dtype: object

In [109]:
# Format the columns: average scores get one decimal place, percentages get none.
column_formats = {
    "Average Math Score": "{:.1f}",
    "Average Reading Score": "{:.1f}",
    "% Passing Math": "{:.0f}",
    "% Passing Reading": "{:.0f}",
    "% Overall Passing": "{:.0f}",
}

# replace each listed column with its formatted text
for column_name, template in column_formats.items():
    district_summary_df[column_name] = district_summary_df[column_name].map(template.format)

district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65


In [110]:
# desired left-to-right order of the summary columns
new_column_order = [
    "Total Schools",
    "Total Students",
    "Total Budget",
    "Average Math Score",
    "Average Reading Score",
    "% Passing Math",
    "% Passing Reading",
    "% Overall Passing",
]

# select the columns in that order and rebind the summary DataFrame to the result
district_summary_df = district_summary_df.loc[:, new_column_order]

district_summary_df

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",79.0,81.9,75,86,65
