# 1. 상관분석

In [8]:
import pandas as pd
from sklearn.datasets import load_diabetes

# Load the scikit-learn diabetes dataset and wrap its feature matrix in a
# DataFrame whose columns are the dataset's feature names.
diabetes = load_diabetes()
data = pd.DataFrame(data=diabetes.data, columns=diabetes.feature_names)

# Summarise the columns: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [4]:
from scipy.stats import pearsonr
# Pearson correlation test between the age and bmi columns
# (prints the correlation statistic together with its p-value).
age_bmi_result = pearsonr(data['age'], data['bmi'])
print(age_bmi_result)

PearsonRResult(statistic=0.18508466614655553, pvalue=9.076791865417336e-05)


In [5]:
# Pairwise correlation matrix restricted to the age and bmi columns.
age_bmi = data[['age', 'bmi']]
print(age_bmi.corr())

          age       bmi
age  1.000000  0.185085
bmi  0.185085  1.000000


# 2. 정규성 검정

In [6]:
import numpy as np
# Seed NumPy's global generator so the sample below is reproducible.
np.random.seed(2024)
# Draw ten values from the uniform distribution on [0, 1).
x = np.random.random(size=10)

In [7]:
from scipy.stats import shapiro
# Shapiro-Wilk normality test on the sample x (prints statistic and p-value).
normality_result = shapiro(x)
print(normality_result)

ShapiroResult(statistic=0.8969662710477171, pvalue=0.20285343366469372)


# 3. 모평균 검정

In [11]:
import numpy as np
from scipy.stats import ttest_1samp

# Sample of 14 measurements (presumably body weights in kg, judging by the
# variable name).
kg = np.array([
    75.5, 83.9, 75.7, 56.2, 73.4, 67.7, 79.0,
    50.7, 58.4, 74.1, 65.1, 77.8, 48.1, 46.3,
])

# Population mean assumed under the null hypothesis.
popmean = 70

# Sample mean, for comparison against the hypothesised value.
print(kg.mean())

# One-sample t-test of H0: population mean == popmean.
print(ttest_1samp(kg, popmean))

66.56428571428572
TtestResult(statistic=-1.0289933120202257, pvalue=0.3222484823978743, df=13)


In [15]:
import pandas as pd
from scipy.stats import ttest_rel

# Two samples of seven observations each.
female = np.array([50.7, 58.4, 74.1, 65.1, 77.8, 48.1, 46.3])
male = np.array([75.5, 83.9, 75.7, 56.2, 73.4, 67.7, 79.0])

# Element-wise differences and their mean.
diff = np.subtract(female, male)
print(diff.mean())

# Paired-sample t-test on the two arrays.
# NOTE(review): ttest_rel treats female[i] and male[i] as a matched pair;
# confirm the observations really are paired, otherwise ttest_ind applies.
paired_result = ttest_rel(female, male)
print(paired_result)

-12.985714285714291
TtestResult(statistic=-2.078446933064972, pvalue=0.08291274205610201, df=6)


In [16]:
from scipy.stats import ttest_ind 

# Show the two group means side by side.
female_mean, male_mean = female.mean(), male.mean()
print(female_mean, male_mean)

# Independent two-sample t-test with scipy's default settings.
independent_result = ttest_ind(female, male)
print(independent_result)

60.07142857142857 73.05714285714286
TtestResult(statistic=-2.2186641577772956, pvalue=0.046550122110569664, df=12.0)


# 4. 모분산 검정 (1)

In [22]:
import numpy as np
from scipy.stats import chi2
score = np.array([80.5, 60.2, 70, 87, 45, 91, 85])

# Chi-square test for a single population variance, H0: sigma^2 == var0.
var0 = 1100                   # variance assumed under the null hypothesis
var = np.var(score, ddof=1)   # unbiased sample variance (divides by n - 1)
dof = len(score)-1

# Test statistic (n - 1) * s^2 / sigma0^2, referred to chi2 with dof degrees
# of freedom.
stat = (dof*var/var0)

# Tail probabilities of the statistic under H0.
p_left = chi2.cdf(stat, df=dof)
p_right = 1 - p_left

# The two-sided p-value doubles the SMALLER tail. Doubling the left tail
# unconditionally is only right when the statistic falls in the lower tail
# and would yield a "p-value" above 1 otherwise.
p_two_sided = 2*min(p_left, p_right)

print(p_left) # left-tailed test
print(p_right) # right-tailed test
print(p_two_sided) # two-sided test

0.041637780038918736
0.9583622199610813
0.08327556007783747


# 5. 모분산 검정 (2)

In [41]:
import numpy as np
from scipy.stats import f

a = np.array([70, 80, 75, 65, 100, 98])
b = np.array([20, 100, 50, 94, 28, 80, 95, 30])

# Hypotheses: H0: var_a = var_b, H1: var_a < var_b
# Unbiased sample variances (ddof=1) of the two groups.
var_a = a.var(ddof=1)
var_b = b.var(ddof=1)
print(var_a, var_b)

# Degrees of freedom of each variance estimate.
df_a = a.size - 1
df_b = b.size - 1

# F statistic with group a in the numerator.
stat = var_a / var_b
print(stat)

# Left-tailed p-value: probability of a ratio this small or smaller
# under F(df_a, df_b).
p_value = f.cdf(stat, df_a, df_b)
print(p_value)

212.66666666666669 1138.4107142857142
0.18681014154026346
0.041539430375629585


In [39]:
# Same test expressed with the ratio inverted (var_b / var_a > 1):
# the upper-tail probability of var_b / var_a under F(df_b, df_a).

stat = var_b / var_a
1 - f.cdf(stat, dfn=df_b, dfd=df_a)

0.04153943037562957