In [1]:
# import文
import numpy as np
import pandas as pd
import pingouin as pg

### 母平均の検定（母分散が未知の場合）

In [2]:
# Five monthly-income observations, stored both as a NumPy array and as a
# single-column DataFrame.
x = np.array([14, 15, 15, 18, 20])
data = pd.DataFrame({"月収": x})
data

Unnamed: 0,月収
0,14
1,15
2,15
3,18
4,20


In [3]:
# Show the income column as a plain NumPy array.
data["月収"].to_numpy()

array([14, 15, 15, 18, 20])

In [4]:
# One-sample t-test of the income sample against the reference value 15,
# with the one-sided alternative "greater" and a 95% confidence level.
monthly_income = data["月収"].to_numpy()
pg.ttest(
    x=monthly_income,
    y=15,
    alternative="greater",
    confidence=0.95,
)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.247219,4,greater,0.140178,"[14.01, inf]",0.557773,1.37,0.273233


In [5]:
# Hand-computed t statistic: (sample mean - 15) divided by the standard error,
# where the standard error uses the unbiased variance (ddof=1).
income = data["月収"]
standard_error = np.sqrt(income.var(ddof=1) / len(income))
(income.mean() - 15.0) / standard_error

1.247219128924646

### 母平均の差の検定

In [6]:
# Expects tokyo_osaka_salary.csv inside the ./data directory.
# The first line of the file is skipped and the text is decoded as cp932.
salary_csv = "./data/tokyo_osaka_salary.csv"
data = pd.read_csv(salary_csv, skiprows=1, encoding="cp932")
data.head()

Unnamed: 0,number,place,salary
0,1,Tokyo,435.9
1,2,Tokyo,622.0
2,3,Tokyo,518.4
3,4,Tokyo,941.7
4,5,Tokyo,655.6


In [7]:
# Pull the salary column out as one NumPy array per city, then show both shapes.
is_tokyo = data["place"] == "Tokyo"
is_osaka = data["place"] == "Osaka"
tokyo = data.loc[is_tokyo, "salary"].to_numpy()
osaka = data.loc[is_osaka, "salary"].to_numpy()
tokyo.shape, osaka.shape

((57,), (45,))

以下は、対応がある場合（父子の身長の差など）の検定の書き方である。東京・大阪の年収は対応のない独立な2標本であり、本来はこれに該当しないため、ここでは書式の確認のために便宜的に用いている。

In [8]:
# Difference in annual salary between Tokyo and Osaka, run as a paired t-test.
# A paired test needs samples of equal length, so the Tokyo sample is cut to
# Osaka's sample size (derived from the data rather than hard-coded as 45).
# NOTE: the pairing here is purely positional; as the markdown above states,
# these two samples are not genuinely paired data.
pg.ttest(x = tokyo[:len(osaka)], y = osaka, paired = True,
         alternative = "two-sided", confidence = 0.95)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.798575,44,two-sided,6.666878e-07,"[75.23, 155.39]",1.076844,24390.0,1.0


In [9]:
# Same comparison expressed as a one-sample t-test of the element-wise
# differences against 0.
# The Tokyo sample is cut to Osaka's length (derived from the data rather than
# hard-coded as 45) so the subtraction is always between equal-length arrays.
paired_diff = tokyo[:len(osaka)] - osaka
pg.ttest(x = paired_diff, y = 0,
         alternative = "two-sided", confidence = 0.95)

Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.798575,44,two-sided,6.666878e-07,"[75.23, 155.39]",0.8644,24390.0,0.999896


### 独立性の検定

In [10]:
# Cross-tabulation (contingency table) of observed counts.
observed_counts = [
    [60, 30, 10],
    [20, 40, 40],
]
cross = np.array(observed_counts)
cross

array([[60, 30, 10],
       [20, 40, 40]])

In [11]:
# Wrap the counts in a labelled DataFrame, then display it with the row labels
# moved into an ordinary "index" column.
row_labels = ["男性", "女性"]
column_labels = ["賛成", "中立", "反対"]
ctab = pd.DataFrame(data=cross, index=row_labels, columns=column_labels)
ctab.reset_index()

Unnamed: 0,index,賛成,中立,反対
0,男性,60,30,10
1,女性,20,40,40


In [12]:
# Reshape the table to long format: the former row label stays in "index",
# each former column name goes to "variable" and its count to "value".
ctab_df = ctab.reset_index().melt(id_vars="index")
ctab_df.head()

Unnamed: 0,index,variable,value
0,男性,賛成,60
1,女性,賛成,20
2,男性,中立,30
3,女性,中立,40
4,男性,反対,10


In [13]:
# Expand the long-format counts into one row per individual observation.
# The long table is rebuilt from `ctab` here instead of being read from the
# previous value of `ctab_df`, so this cell can be re-run safely: the original
# form dropped the "value" column from the very frame it then needed again.
counts_long = pd.melt(ctab.reset_index(), id_vars = "index")
ctab_df = (
    counts_long
    # Repeat each row label as many times as its count.
    .loc[counts_long.index.repeat(counts_long["value"])]
    .reset_index(drop = True)
    .drop(columns = "value")
)
ctab_df.head(n = 5)

Unnamed: 0,index,variable
0,男性,賛成
1,男性,賛成
2,男性,賛成
3,男性,賛成
4,男性,賛成


In [14]:
# Chi-squared test of independence between the "index" and "variable" columns.
# Only the third returned object (the table of test statistics) is kept, and
# only its "pearson" row is displayed.
_, _, stats = pg.chi2_independence(data=ctab_df, x="index", y="variable")
stats.loc[stats["test"] == "pearson"]

Unnamed: 0,test,lambda,chi2,dof,pval,cramer,power
0,pearson,1.0,39.428571,2.0,2.742802e-09,0.444008,0.999962
