## 두 집단 평균 차이 검정

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

In [5]:
df = pd.read_csv("./data/StudentsPerformance.csv")

In [6]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [7]:
df['race/ethnicity'].unique()

array(['group B', 'group C', 'group A', 'group D', 'group E'],
      dtype=object)

In [8]:
A = df[df['race/ethnicity']=="group A"]
B = df[df['race/ethnicity']=="group B"]
C = df[df['race/ethnicity']=="group C"]
D = df[df['race/ethnicity']=="group D"]
E = df[df['race/ethnicity']=="group E"]

In [9]:
A

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
3,male,group A,associate's degree,free/reduced,none,47,57,44
13,male,group A,some college,standard,completed,78,72,70
14,female,group A,master's degree,standard,none,50,53,58
25,male,group A,master's degree,free/reduced,none,73,74,72
46,female,group A,associate's degree,standard,completed,55,65,62
...,...,...,...,...,...,...,...,...
974,female,group A,some college,standard,none,54,63,67
983,female,group A,some college,standard,completed,78,87,91
985,male,group A,high school,standard,none,57,51,54
988,female,group A,some high school,free/reduced,none,44,45,45


In [36]:
A_math = A["math score"]
B_math = B["math score"]

In [14]:
A_mean = np.mean(A_math)
print(A_mean)

61.62921348314607


In [17]:
A_var = np.std(A_math)**2
print(A_var)

208.54791061734625


In [18]:
n1 = len(A_math)
print(n1)

89


In [19]:
B_mean = np.mean(B_math)
print(B_mean)

63.45263157894737


In [20]:
B_var = np.std(B_math)**2
print(B_var)

238.00565096952909


In [21]:
n2 = len(B_math)
print(n2)

190


In [22]:
# z-score의 분모 부분
s = ((A_var/n1) + (B_var/n2))**0.5
s

1.896284860364855

In [29]:
# z-score 구하기
z_value = (B_mean - A_mean) /s
z_value

0.9615739353898898

In [30]:
z_dist = stats.norm(0,1)
p_value = 1 - z_dist.cdf(z_value)

In [31]:
p_value

0.16813183477339466

### 다른 관점

In [32]:
# z-score 구하기 (반대로 생각해봐) => p-value 구할 때 1에서 + 더해줘야함 
z_value = (A_mean - B_mean) /s
z_value

-0.9615739353898898

In [33]:
z_dist = stats.norm(0,1)  # Normal distribution 정규분포 (평균=0, 분산=1)
p_value = 1 + z_dist.cdf(z_value)

In [34]:
p_value

1.1681318347733947

## 세 집단 평균 차이 검정

In [41]:
A_math = A["math score"]
B_math = B["math score"]
C_math = C["math score"]
D_math = D["math score"]
E_math = E["math score"]

In [42]:
# 정규성 검정
test_stat, p = stats.shapiro(A_math)
print("검정통계량: {}, p-value: {}".format(test_stat, p))

test_stat, p = stats.shapiro(B_math)
print("검정통계량: {}, p-value: {}".format(test_stat, p))

test_stat, p = stats.shapiro(C_math)
print("검정통계량: {}, p-value: {}".format(test_stat, p))

test_stat, p = stats.shapiro(D_math)
print("검정통계량: {}, p-value: {}".format(test_stat, p))

test_stat, p = stats.shapiro(E_math)
print("검정통계량: {}, p-value: {}".format(test_stat, p))

검정통계량: 0.991736114025116, p-value: 0.8545348644256592
검정통계량: 0.980807363986969, p-value: 0.010394944809377193
검정통계량: 0.9891065359115601, p-value: 0.017411569133400917
검정통계량: 0.9896672964096069, p-value: 0.05927419662475586
검정통계량: 0.9770451784133911, p-value: 0.01849539391696453


#### 0.05보다 커야 H0 채택하고 -> 정규분포를 따른다는 거니까 ->  B,C는 정규성을 띄지 않음

- ANOVA 검정

In [37]:
f_val, p_val = stats.f_oneway(A_math, B_math, C_math, D_math, E_math)

In [38]:
print("F-통계량: ", f_val)
print("p-value: ", p_val)

F-통계량:  14.593885166332635
p-value:  1.3732194030370688e-11


- Kruskal 검정

In [39]:
f_val, p_val = stats.kruskal(A_math, B_math, C_math, D_math, E_math)

In [40]:
print("F-통계량: ", f_val)
print("p-value: ", p_val)

F-통계량:  57.079329705742886
p-value:  1.1906568165839682e-11


## 분할표를 활용한 연관성 분석

In [43]:
df

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [45]:
df['test preparation course'].unique()

array(['none', 'completed'], dtype=object)

In [46]:
df['gender'].unique()

array(['female', 'male'], dtype=object)

In [56]:
none = df['test preparation course'] == 'none'
completed = df['test preparation course'] == 'completed'
male = df['gender'] == 'male'
female = df['gender'] == 'female'

In [57]:
none_f = len(df[none & female])
none_m = len(df[none & male])
com_f = len(df[completed & female])
com_m = len(df[completed & male])

In [55]:
# none_f = len(df[none + female])
# none_f

In [58]:
none_f

334

In [59]:
none_m

308

In [60]:
com_f

184

In [61]:
com_m

174

In [67]:
# 관측값
obs = np.array([[none_f, none_m],[com_f, com_m]])
chi2, p, d, expected = stats.chi2_contingency(obs)

msg = 'Test Statistic: {} \n p-value: {} \n Degree of Freedom: {}'
print(msg.format(chi2, p, d ))
print(expected)

Test Statistic: 0.015529201882465888 
 p-value: 0.9008273880804724 
 Degree of Freedom: 1
[[332.556 309.444]
 [185.444 172.556]]
