In [1]:
import pandas as pd
from scipy import stats
from scipy.stats import ttest_ind, ttest_ind_from_stats
from scipy.special import stdtr
import numpy as np

In [2]:
df= pd.read_csv('school_stats.csv', delimiter= '\t')

In [3]:
df.iloc[20:25,]

Unnamed: 0,Name,Math_Avg,Reading_Avg,Attendance_Ratio,Rating
20,GRAMERCY ARTS HIGH SCHOOL,391,391,0.181,3
21,NYC ISCHOOL,483,473,0.086,4
22,MANHATTAN BUSINESS ACADEMY,0,0,0.184,3
23,BUSINESS OF SPORTS SCHOOL,0,0,0.186,3
24,EMMA LAZARUS HIGH SCHOOL,512,319,0.076,3


In [4]:
#This will get rid of all values of 0 in rows
df= df[(df != 0).all(1)]

In [5]:
df.iloc[20:25,]

Unnamed: 0,Name,Math_Avg,Reading_Avg,Attendance_Ratio,Rating
20,GRAMERCY ARTS HIGH SCHOOL,391,391,0.181,3
21,NYC ISCHOOL,483,473,0.086,4
24,EMMA LAZARUS HIGH SCHOOL,512,319,0.076,3
26,HIGH SCHOOL FOR ENVIRONMENTAL STUDIES,493,465,0.113,4
27,INSTITUTE FOR COLLABORATIVE EDUCATION,465,492,0.08,4


In [6]:
rating= df.groupby('Rating').Rating.count()
'''Count how many times each rating group occured. 
This already shows that our data with be skewed because of uneven sampling distribution'''
rating

Rating
1     18
2     79
3    197
4    111
Name: Rating, dtype: int64

In [7]:
'''Find math sat avg for each rating'''
math_r_avg= df.groupby(['Rating'])['Math_Avg'].mean()
math_r_avg

Rating
1    387.833333
2    384.101266
3    410.329949
4    449.171171
Name: Math_Avg, dtype: float64

In [8]:
'''Find math sat std for each rating'''
math_r_std= df.groupby(['Rating'])['Math_Avg'].std()
math_r_std

Rating
1    22.747333
2    30.501196
3    58.729645
4    81.955739
Name: Math_Avg, dtype: float64

In [9]:
"""total math len presents"""
math_pop= df['Math_Avg'].count()
math_pop

405

In [10]:
'''this is just the math average, without taking into consideration the rating of school
the math average of all schools is similar to the math average of the rating 3 school
this has to do with sample size'''
math_pop_mean= df['Math_Avg'].mean()
math_pop_mean

414.85925925925926

In [11]:
math_pop_std= df['Math_Avg'].std()
math_pop_std

65.22711612257409

In [12]:
"two-sample t-test for math ratings"
math_rating_ttest= stats.ttest_ind_from_stats(math_pop_mean, math_pop_std, math_pop, math_r_avg, math_r_std, rating)
math_rating_ttest

Ttest_indResult(statistic=Rating
1    1.751410
2    4.102059
3    0.825330
4   -4.631557
Name: Math_Avg, dtype: float64, pvalue=array([8.06034519e-02, 4.80703075e-05, 4.09512336e-01, 4.60385564e-06]))

In [13]:
print('the above results tell us there is significant relationship between math sat scores and school rating. lets break it down.')
print('our null hypothesis is that there is no difference between school rating and math sat scores')
print('alternative is there is significant difference, with alpha at 0.05')
print('we will ignore rating 1- due to small sample size and rating 3- due to sample size being half of total sample. skew results even though they are significant')
print('looking at 2 and 4, sample size are similar to each other and about the same portion from total population')
print('both pvalue are less than alpha, stating there is significant difference between school rating and math sat scores')

the above results tell us there is significant relationship between math sat scores and school rating. lets break it down.
our null hypothesis is that there is no difference between school rating and math sat scores
alternative is there is significant difference, with alpha at 0.05
we will ignore rating 1- due to small sample size and rating 3- due to sample size being half of total sample. skew results even though they are significant
looking at 2 and 4, sample size are similar to each other and about the same portion from total population
both pvalue are less than alpha, stating there is significant difference between school rating and math sat scores


In [14]:
'''Find reading sat avg for each rating'''
reading_r_avg= df.groupby(['Rating'])['Reading_Avg'].mean()
reading_r_avg

Rating
1    376.833333
2    378.835443
3    397.101523
4    428.918919
Name: Reading_Avg, dtype: float64

In [15]:
'''Find reading sat std for each rating'''
reading_r_std= df.groupby(['Rating'])['Reading_Avg'].std()
reading_r_std

Rating
1    17.550767
2    29.348669
3    49.537039
4    76.112373
Name: Reading_Avg, dtype: float64

In [16]:
'''reading population mean, overall mean not categorized by rating'''
reading_pop_mean= df['Reading_Avg'].mean()
reading_pop_mean

401.358024691358

In [17]:
'''rading population std'''
reading_pop_std= df['Reading_Avg'].std()
reading_pop_std

57.364025329722935

In [18]:
'''reading population- total reading sample size'''
reading_pop= df['Reading_Avg'].count()
reading_pop

405

In [19]:
'''two sample t_test for reading sat'''
reading_rating_ttest= stats.ttest_ind_from_stats(reading_pop_mean, reading_pop_std, reading_pop, reading_r_avg, reading_r_std, rating)
reading_rating_ttest

Ttest_indResult(statistic=Rating
1    1.808232
2    3.401920
3    0.892084
4   -4.158866
Name: Reading_Avg, dtype: float64, pvalue=array([7.12837765e-02, 7.24830609e-04, 3.72705703e-01, 3.74778441e-05]))

In [20]:
print('the above results tell us there is significant relationship between reading sat scores and school rating. lets break it down.')
print('our null hypothesis is that there is no difference between school rating and reading sat scores')
print('alternative is there is significant difference, with alpha at 0.05')
print('we will ignore rating 1- due to small sample size and rating 3- due to sample size being half of total sample. skew results even though they are significant')
print('looking at 2 and 4, sample size are similar to each other and about the same portion from total population')
print('both pvalue are less than alpha, stating there is significant difference between school rating and reading sat scores')

the above results tell us there is significant relationship between reading sat scores and school rating. lets break it down.
our null hypothesis is that there is no difference between school rating and reading sat scores
alternative is there is significant difference, with alpha at 0.05
we will ignore rating 1- due to small sample size and rating 3- due to sample size being half of total sample. skew results even though they are significant
looking at 2 and 4, sample size are similar to each other and about the same portion from total population
both pvalue are less than alpha, stating there is significant difference between school rating and reading sat scores


In [21]:
'''finding absence avg by rating'''
absence_r_avg= df.groupby(['Rating'])['Attendance_Ratio'].mean()
absence_r_avg

Rating
1    0.326389
2    0.279848
3    0.189721
4    0.138667
Name: Attendance_Ratio, dtype: float64

In [22]:
'''finding absence std by rating'''
absence_r_std= df.groupby(['Rating'])['Attendance_Ratio'].std()
absence_r_std

Rating
1    0.147414
2    0.144640
3    0.114124
4    0.110959
Name: Attendance_Ratio, dtype: float64

In [35]:
'''finding absence pop avg'''
absence_pop_mean= df['Attendance_Ratio'].mean()
absence_pop_mean

0.1993827160493825

In [32]:
'''finding absence pop std'''
absence_pop_std= df['Attendance_Ratio'].std()
absence_pop_std

0.1329197971440027

In [33]:
'''total absence population'''
absence_pop= df['Attendance_Ratio'].count()
absence_pop

405

In [36]:
'''two sample t-test for absence ratio'''
absence_ttest= stats.ttest_ind_from_stats(absence_pop_mean, absence_pop_std, absence_pop, absence_r_avg, absence_r_std, rating)
absence_ttest

Ttest_indResult(statistic=Rating
1   -3.948403
2   -4.850227
3    0.875241
4    4.409030
Name: Attendance_Ratio, dtype: float64, pvalue=array([9.22007921e-05, 1.66709493e-06, 3.81792886e-01, 1.26508796e-05]))

In [37]:
print('the above results tell us there is significant relationship between absence and school rating. lets break it down.')
print('our null hypothesis is that there is no difference between school rating and student absence')
print('alternative is there is significant difference, with alpha at 0.05')
print('we will ignore rating 1- due to small sample size and rating 3- due to sample size being half of total sample. skew results even though they are significant')
print('looking at 2 and 4, sample size are similar to each other and about the same portion from total population')
print('both pvalue are less than alpha, stating there is significant difference between school rating and student absence')

the above results tell us there is significant relationship between absence and school rating. lets break it down.
our null hypothesis is that there is no difference between school rating and student absence
alternative is there is significant difference, with alpha at 0.05
we will ignore rating 1- due to small sample size and rating 3- due to sample size being half of total sample. skew results even though they are significant
looking at 2 and 4, sample size are similar to each other and about the same portion from total population
both pvalue are less than alpha, stating there is significant difference between school rating and student absence


In [None]:
print('still need to add linear regression to quantify the difference')

**IGNORE EVERYTHING BELOW**

In [25]:
'''just all the data from rating group in a neat table'''
data= {'Rating': [1, 2, 3, 4], 'Rating Count': [18, 79, 197, 111], 'Math_Rating': [387.83, 384.10, 410.33, 449.17], 'Reading_Rating': [376.83, 378.85, 397.10, 428.92], "Absense_Rating": [0.33, 0.28, 0.19, 0.14]}
school_stats= pd.DataFrame.from_dict(data)
school_stats.head()

Unnamed: 0,Rating,Rating Count,Math_Rating,Reading_Rating,Absense_Rating
0,1,18,387.83,376.83,0.33
1,2,79,384.1,378.85,0.28
2,3,197,410.33,397.1,0.19
3,4,111,449.17,428.92,0.14


In [26]:
'''describing all basic statistical information one may want'''
school_stats.describe()

Unnamed: 0,Rating,Rating Count,Math_Rating,Reading_Rating,Absense_Rating
count,4.0,4.0,4.0,4.0,4.0
mean,2.5,101.25,407.8575,395.425,0.235
std,1.290994,74.584963,29.879511,24.119324,0.085829
min,1.0,18.0,384.1,376.83,0.14
25%,1.75,63.75,386.8975,378.345,0.1775
50%,2.5,95.0,399.08,387.975,0.235
75%,3.25,132.5,420.04,405.055,0.2925
max,4.0,197.0,449.17,428.92,0.33


In [27]:
'''skew defines asymetry. rating is 1-4 and absense is less than 1
rating count shows skewness due to not all rating having the same numbers of samples
math and reading have a positive skew, meaning most of the data would be closer to 1 than 0'''
school_stats.skew()

Rating            0.000000
Rating Count      0.467225
Math_Rating       1.226618
Reading_Rating    1.263844
Absense_Rating    0.000000
dtype: float64

In [28]:
data2= {'Math_var': [669.56], 'Math_std': [25.86], 'Reading_var': [436.38], 'Reading_std': [20.89], 'Absense_var': [0.005], 'Absense_std': [0.074]}
var_stds= pd.DataFrame.from_dict(data2)
var_stds

Unnamed: 0,Math_var,Math_std,Reading_var,Reading_std,Absense_var,Absense_std
0,669.56,25.86,436.38,20.89,0.005,0.074


In [29]:
'''showing correlation between each column.
both math and reading have a high correlation with rating while absense has negative correlation
this may mean that less absense leads to better score'''
school_stats.corr()

Unnamed: 0,Rating,Rating Count,Math_Rating,Reading_Rating,Absense_Rating
Rating,1.0,0.687169,0.90842,0.934124,-0.992734
Rating Count,0.687169,1.0,0.424105,0.445408,-0.736538
Math_Rating,0.90842,0.424105,1.0,0.996334,-0.91084
Reading_Rating,0.934124,0.445408,0.996334,1.0,-0.92905
Absense_Rating,-0.992734,-0.736538,-0.91084,-0.92905,1.0


In [30]:
'''is a measure of the directional relationship between the returns on two risky assets. 
A positive covariance means that asset returns move together 
negative covariance means returns move inversely '''
'''this shows similar results to our correlation, meaning there may be some causational relationship'''
school_stats.cov()

Unnamed: 0,Rating,Rating Count,Math_Rating,Reading_Rating,Absense_Rating
Rating,1.666667,66.166667,35.041667,29.086667,-0.11
Rating Count,66.166667,5562.916667,945.144167,801.261667,-4.715
Math_Rating,35.041667,945.144167,892.785158,718.03185,-2.335883
Reading_Rating,29.086667,801.261667,718.03185,581.741767,-1.923267
Absense_Rating,-0.11,-4.715,-2.335883,-1.923267,0.007367


In [31]:
school_stats['Rating'].var()

1.6666666666666667