In [1]:
# Dependencies and Setup
import pandas as pd

# Input files, given relative to the notebook's working directory (change these to point elsewhere).
school_data_to_load = "Resources/schools.csv"
student_data_to_load = "Resources/students.csv"

# Read the school and student CSV files into pandas DataFrames.
school_data = pd.read_csv(school_data_to_load)
student_data = pd.read_csv(student_data_to_load)

# Combine the data into a single dataset: one row per student, with that student's
# school columns attached by a left join on the shared "school_name" column.
# The key is named once; listing it twice joined on the same column redundantly.
school_data_complete = pd.merge(student_data, school_data, how="left", on="school_name")

# Side effect: dump the merged frame next to the inputs so it can be inspected by eye.
# Nothing later in the notebook reads this file back.
school_data_complete.to_csv("Resources/MergedData.csv")

In [2]:
# Tip: print(school_data_complete.columns) lists the available column names.

#-----DISTRICT SUMMARY-----#
# Pull the three per-school columns out of school_data as plain Python lists.
TotalSchools = school_data.loc[:, 'school_name'].tolist()  # one entry per school; len() gives the school count
TotalStudents = school_data.loc[:, 'size'].tolist()        # per-school 'size' values
Budget = school_data.loc[:, 'budget'].tolist()             # per-school 'budget' values

# District-wide totals are the sums of the per-school lists.
SumTotalStudents = sum(TotalStudents)
TotalBudget = sum(Budget)

In [3]:
# District-wide mean scores, taken over every row of the merged student table.
AvgMathScore = school_data_complete.loc[:, "math_score"].mean()
AvgReadScore = school_data_complete.loc[:, "reading_score"].mean()

In [4]:
# A score of 70 or greater counts as passing. Build one boolean mask per subject.
PassedMathMask = school_data_complete["math_score"] >= 70
PassedReadMask = school_data_complete["reading_score"] >= 70

# Percentage of students with a passing math score.
# The denominator is the district head-count summed from the per-school 'size' column.
MathOver70 = len(school_data_complete.loc[PassedMathMask, "math_score"])
MathPercentPass = MathOver70 / SumTotalStudents * 100

# Percentage of students with a passing reading score.
ReadOver70 = len(school_data_complete.loc[PassedReadMask, "reading_score"])
ReadPercentPass = ReadOver70 / SumTotalStudents * 100

# Percentage of students who passed math AND reading (% Overall Passing).
# TotalPercentPassed holds the matching rows themselves; PassedBoth is the percentage.
TotalPercentPassed = school_data_complete.loc[PassedReadMask & PassedMathMask, :]
PassedBoth = len(TotalPercentPassed) / SumTotalStudents * 100

In [5]:
# Collect the district-level figures into a one-row DataFrame.
# Student count and budget are pre-formatted as display strings; the rest stay numeric.
DistSummary = {
    'Total Schools': [len(TotalSchools)],
    "Total Students": [f'{SumTotalStudents:,}'],
    "Total Budget": [f'${TotalBudget:,.2f}'],
    "Average Math Score": [AvgMathScore],
    "Average Reading Score": [AvgReadScore],
    "% Passing Math": [MathPercentPass],
    "% Passing Reading": [ReadPercentPass],
    "% Overall Passing": [PassedBoth],
}
DistrictSummaryDF = pd.DataFrame(DistSummary)
DistrictSummaryDF

#Optional: give the displayed data cleaner formatting ---- come back to

Unnamed: 0,Total Schools,Total Students,Total Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
0,15,39170,"$24,649,428.00",78.985371,81.87784,74.980853,85.805463,65.172326


In [6]:
#-----SCHOOL SUMMARY-----#
# Every metric below is a Series indexed by school_name, taken from one shared groupby.
SchoolGroup = school_data_complete.groupby(['school_name'])

# School type. min() is used only to collapse each group to a single value;
# presumably 'type' is the same on every row of a school - TODO confirm.
Type = SchoolGroup['type'].min()

# Total students: number of non-null student_name entries per school.
StudentTotal = SchoolGroup["student_name"].count()

# School budget: mean of the 'budget' column over each school's student rows.
NewBudget = SchoolGroup["budget"].mean()

# Per-student budget: school budget divided by its student count.
PerStudentBudget = NewBudget.div(StudentTotal)

# Average math and reading scores, rounded to two decimals.
MathScore = SchoolGroup['math_score'].mean().round(2)
EngScore = SchoolGroup['reading_score'].mean().round(2)

In [7]:
# Per-school pass rates (70 or greater is a pass), as percentages rounded to two decimals.
#
# Each passing count comes from a groupby over *filtered* rows, so a school in which
# nobody passed would be missing from the count Series, and dividing by StudentTotal
# would give NaN for it. Each count is therefore re-aligned to StudentTotal's index
# with a fill value of 0, so such a school reports 0.0 instead.

#% Passing Math
MathPass = school_data_complete[(school_data_complete['math_score']>=70)] #Rows with a passing math score
SchoolMathGroup = MathPass.groupby(['school_name']) #Passing rows grouped by school
MathPassSum = SchoolMathGroup['math_score'].count().reindex(StudentTotal.index, fill_value=0) #Passers per school
MathPassPercent = round((MathPassSum/StudentTotal) * 100, 2) #Share of the school's students

#% Passing Reading
EngPass = school_data_complete[(school_data_complete['reading_score']>=70)] #Rows with a passing reading score
SchoolEngGroup = EngPass.groupby(['school_name']) #Passing rows grouped by school
EngPassSum = SchoolEngGroup['reading_score'].count().reindex(StudentTotal.index, fill_value=0) #Passers per school
EngPassPercent = round((EngPassSum/StudentTotal) * 100, 2) #Share of the school's students

#% Overall Passing (the percentage of students that passed math AND reading)
MathEngPass = school_data_complete[(school_data_complete['reading_score']>=70)&(school_data_complete['math_score']>=70)]
SchoolCombGroup = MathEngPass.groupby(['school_name'])
CombPassSum = SchoolCombGroup['math_score'].count().reindex(StudentTotal.index, fill_value=0)
CombPassPercent = round((CombPassSum/StudentTotal) * 100, 2)

In [8]:
# Assemble the per-school Series into one DataFrame; they align on the school_name index.
SchoolSummary = {
    'School Type': Type,
    'Total Students': StudentTotal,
    'Total School Budget': NewBudget,
    'Per Student Budget': PerStudentBudget,
    'Average Math Score': MathScore,
    'Average Reading Score': EngScore,
    '% Passing Math': MathPassPercent,
    '% Passing Reading': EngPassPercent,
    '% Overall Passing': CombPassPercent,
}
SchoolSummaryDF = pd.DataFrame(SchoolSummary)

# Only the total budget is turned into a display string. 'Total Students' and
# 'Per Student Budget' are left numeric because later cells bin on them with pd.cut.
AsCurrency = "${:,.2f}".format
SchoolSummaryDF["Total School Budget"] = SchoolSummaryDF["Total School Budget"].map(AsCurrency)

# Drop the 'school_name' index label so the rendered table has no header above the school names.
SchoolSummaryDF.index.name = None
SchoolSummaryDF

Unnamed: 0,School Type,Total Students,Total School Budget,Per Student Budget,Average Math Score,Average Reading Score,% Passing Math,% Passing Reading,% Overall Passing
Bailey High School,District,4976,"$3,124,928.00",628.0,77.05,81.03,66.68,81.93,54.64
Cabrera High School,Charter,1858,"$1,081,356.00",582.0,83.06,83.98,94.13,97.04,91.33
Figueroa High School,District,2949,"$1,884,411.00",639.0,76.71,81.16,65.99,80.74,53.2
Ford High School,District,2739,"$1,763,916.00",644.0,77.1,80.75,68.31,79.3,54.29
Griffin High School,Charter,1468,"$917,500.00",625.0,83.35,83.82,93.39,97.14,90.6
Hernandez High School,District,4635,"$3,022,020.00",652.0,77.29,80.93,66.75,80.86,53.53
Holden High School,Charter,427,"$248,087.00",581.0,83.8,83.81,92.51,96.25,89.23
Huang High School,District,2917,"$1,910,635.00",655.0,76.63,81.18,65.68,81.32,53.51
Johnson High School,District,4761,"$3,094,650.00",650.0,77.07,80.97,66.06,81.22,53.54
Pena High School,Charter,962,"$585,858.00",609.0,83.84,84.04,94.59,95.95,90.54


In [None]:
# Highest-performing schools: order by overall passing rate, best first, and show the first five.
# Top5Overall_df itself keeps every school in sorted order; only the display is limited.
Top5Overall_df = SchoolSummaryDF.sort_values(by="% Overall Passing", ascending=False)
Top5Overall_df.head(5)

In [None]:
# Lowest-performing schools: order by overall passing rate, worst first, and show the first five.
# Worst5Overall_df itself keeps every school in sorted order; only the display is limited.
Worst5Overall_df = SchoolSummaryDF.sort_values(by="% Overall Passing", ascending=True)
Worst5Overall_df.head(5)

In [None]:
#Create a table that lists the average Math Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

# Build one single-column frame per grade: rows are schools, the column is named after the grade.
# Only 'school_name' and 'math_score' are selected. The text column 'grade' must not reach
# groupby().mean(): pandas >= 2.0 raises TypeError on non-numeric columns instead of dropping them.
MathByGrade = {}
for GradeLabel in ['9th', '10th', '11th', '12th']:
    GradeRows = school_data_complete.loc[school_data_complete['grade'] == GradeLabel,
                                         ['school_name', 'math_score']]
    MathByGrade[GradeLabel] = (GradeRows.rename(columns = {'math_score' : GradeLabel})
                                        .groupby(['school_name']).mean())

# Keep the per-grade names available individually.
Math9th_series = MathByGrade['9th']
Math10th_series = MathByGrade['10th']
Math11th_series = MathByGrade['11th']
Math12th_series = MathByGrade['12th']

#Combine the per-grade frames into one table (inner joins on school_name, so only schools
#present in every grade are kept).
MergedMath_df = Math9th_series.merge(Math10th_series,on='school_name').merge(Math11th_series,on='school_name').merge(
                Math12th_series,on='school_name')
MergedMath_df.index.name = None
MergedMath_df

In [None]:
#Create a table that lists the average Reading Score for students of each grade level (9th, 10th, 11th, 12th) at each school.

# Build one single-column frame per grade: rows are schools, the column is named after the grade.
# Only 'school_name' and 'reading_score' are selected. The text column 'grade' must not reach
# groupby().mean(): pandas >= 2.0 raises TypeError on non-numeric columns instead of dropping them.
ReadByGrade = {}
for GradeLabel in ['9th', '10th', '11th', '12th']:
    GradeRows = school_data_complete.loc[school_data_complete['grade'] == GradeLabel,
                                         ['school_name', 'reading_score']]
    ReadByGrade[GradeLabel] = (GradeRows.rename(columns = {'reading_score' : GradeLabel})
                                        .groupby(['school_name']).mean())

# Keep the per-grade names available individually.
Read9th_series = ReadByGrade['9th']
Read10th_series = ReadByGrade['10th']
Read11th_series = ReadByGrade['11th']
Read12th_series = ReadByGrade['12th']

#Combine the per-grade frames into one table (inner joins on school_name, so only schools
#present in every grade are kept).
MergedRead_df = Read9th_series.merge(Read10th_series,on='school_name').merge(Read11th_series,on='school_name').merge(
                Read12th_series,on='school_name')
MergedRead_df.index.name = None
MergedRead_df

In [None]:
#Create a table that breaks down school performances based on average Spending Ranges (Per Student).
#Schools are placed in 4 spending bins; for each bin the table shows the mean over its schools of:
#Average Math Score
#Average Reading Score
#% Passing Math
#% Passing Reading
#% Overall Passing (share of students who passed both math AND reading)
SpendDF = SchoolSummaryDF.reset_index(drop=True)

# The overall range was chosen from the observed min and max of 'Per Student Budget'.
# pd.cut intervals are right-inclusive, so each upper edge sits just below the next whole
# dollar named in the labels: 584.99 -> '<$585', 629.99 -> '$585-629', 644.99 -> '$630-644'.
# (The previous edges were one dollar lower, which put a $644 school under '$645-675'.)
# Values above 675 fall outside every bin and would be dropped by the groupby below.
Bins1 = [0, 584.99, 629.99, 644.99, 675]
SpendingLabels = ['<$585', '$585-629', '$630-644', '$645-675']
SpendDF['Spending Ranges (Per Student)'] = pd.cut(SpendDF['Per Student Budget'], Bins1, labels = SpendingLabels,
                                                include_lowest = True)
SpendDF = SpendDF[['Average Math Score', 'Average Reading Score', '% Passing Math', '% Passing Reading',
       '% Overall Passing', 'Spending Ranges (Per Student)']]

# observed=False is spelled out so every spending bin keeps a row (NaN when empty) and the
# result does not depend on the pandas version's default for categorical group keys.
SpendDF = SpendDF.groupby('Spending Ranges (Per Student)', observed=False).mean()
SpendDF

In [None]:

# Break down school performance by school size: mean of each metric over the schools in a size bin.
SizeDF = SchoolSummaryDF.reset_index(drop=True)

# pd.cut intervals are right-inclusive: [0, 999] Small, (999, 1999] Medium, (1999, 5000] Large.
# The top edge is 5000 so that a 5000-student school is covered, as the 'Large (2000-5000)' label says.
Bins2 = [0, 999, 1999, 5000]
SizeLabels = ['Small (<1000)', 'Medium (1000-2000)', 'Large (2000-5000)']
SizeDF['School Size'] = pd.cut(SizeDF['Total Students'], Bins2, labels = SizeLabels, include_lowest = True)
SizeDF = SizeDF[['Average Math Score', 'Average Reading Score', '% Passing Math', '% Passing Reading',
       '% Overall Passing', 'School Size']]

# observed=False is spelled out so every size bin keeps a row (NaN when empty) and the
# result does not depend on the pandas version's default for categorical group keys.
SizeDF = SizeDF.groupby('School Size', observed=False).mean()
SizeDF

In [None]:
# Break down school performance by school type: mean of each metric over the schools of that type.
TypeColumns = ['School Type', 'Average Math Score', 'Average Reading Score', '% Passing Math',
               '% Passing Reading', '% Overall Passing']

# Work on a copy with a fresh integer index, restricted to the grouping key and the metrics.
TypeDF = SchoolSummaryDF.reset_index(drop=True)
TypeDF = TypeDF[TypeColumns]

TypeDF = TypeDF.groupby('School Type').mean()
TypeDF