1. 단일 표본 검정

In [5]:
# 데이터
import pandas as pd
df = pd.DataFrame({
    'Caffeine(mg)':[
        94.2, 93.7, 95.5, 93.9, 94.0, 95.2, 94.7, 93.5, 92.8, 94.4,
        93.8, 94.6, 93.3, 95.1, 94.3, 94.9, 93.9, 94.8, 95.0, 94.2,
        93.7, 94.4, 95.1, 94.0, 93.6
    ]
})

In [11]:
# 1. 표본 데이터의 평균 구하기
print(df.mean())

Caffeine(mg)    94.264
dtype: float64


In [13]:
# 2. Shapiro-Wilk 검정의 p-value 구하기
# 샤피로-윌크 검정 : 정규성 검정
from scipy import stats
stats.shapiro(df['Caffeine(mg)'])

ShapiroResult(statistic=0.9826578166170533, pvalue=0.9322031137746914)

In [15]:
# 3~5. 단일 표본 t-검정
stats.ttest_1samp(df['Caffeine(mg)'], 95, alternative='less')
# p-value < 0.05 이므로 귀무가설 기각

TtestResult(statistic=-5.501737036221897, pvalue=5.8686553916715e-06, df=24)

2. 독립 표본 검정

In [18]:
# 데이터
import pandas as pd
df = pd.DataFrame({
    '충전기':['New'] * 10 + ['Old'] * 10,
    '충전시간' : [
        1.5, 1.6, 1.4, 1.7, 1.5, 1.6, 1.7, 1.4, 1.6, 1.5,
        1.7, 1.8, 1.7, 1.9, 1.8, 1.7, 1.8, 1.9, 1.7, 1.6
    ]
})

In [20]:
df.head(2)

Unnamed: 0,충전기,충전시간
0,New,1.5
1,New,1.6


In [26]:
cond1 = df['충전기'] == 'New'
cond2 = df['충전기'] == 'Old'
stats.ttest_ind(df[cond1]['충전시간'], df[cond2]['충전시간'],
               alternative='less', equal_var=True)
# p-value < 0.05 이므로 귀무가설 기각

TtestResult(statistic=-4.582575694955849, pvalue=0.00011546547787696304, df=18.0)

3. 대응 표본 검정

In [29]:
# 데이터
import pandas as pd
df = pd.DataFrame({
    'User' : list(range(1,11)),
    '기존방법' : [60.4, 60.7, 60.5, 60.3, 60.8, 60.6, 60.2, 60.5, 60.7, 60.4],
    '새로운방법' : [59.8, 60.2, 60.1, 59.9, 59.7, 58.4, 57.0, 60.3, 59.6, 59.8]
})
df.head(2)

Unnamed: 0,User,기존방법,새로운방법
0,1,60.4,59.8
1,2,60.7,60.2


In [35]:
df['diff'] = df['새로운방법'] - df['기존방법']
print(df['diff'].mean())

-1.0300000000000005


In [37]:
from scipy import stats
stats.ttest_rel(df['새로운방법'], df['기존방법'], alternative='less')
# p-value < 0.05 이므로 귀무가설 기각

TtestResult(statistic=-3.407973078114844, pvalue=0.0038872633380070652, df=9)

4. 일원 분산 분석

In [40]:
import pandas as pd
df = pd.read_csv('math.csv')

In [44]:
print(df.shape)
display(df.head())
display(df.tail())
print(df.info())

(40, 2)


Unnamed: 0,groups,scores
0,group_A,85
1,group_A,88
2,group_A,90
3,group_A,82
4,group_A,87


Unnamed: 0,groups,scores
35,group_D,86
36,group_D,84
37,group_D,85
38,group_D,87
39,group_D,86


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   groups  40 non-null     object
 1   scores  40 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 772.0+ bytes
None


In [48]:
from scipy import stats 

In [52]:
# 그룹별 Shapiro-Wilk 검정
cond1 = df['groups'] == 'group_A'
cond2 = df['groups'] == 'group_B'
cond3 = df['groups'] == 'group_C'
cond4 = df['groups'] == 'group_D'

print(stats.shapiro(df.loc[cond1]['scores']))
print(stats.shapiro(df.loc[cond2]['scores']))
print(stats.shapiro(df.loc[cond3]['scores']))
print(stats.shapiro(df.loc[cond4]['scores']))
# 4 그룹 모두 p-value > 0.05이므로 귀무가설 기각 실패
# 따라서 그룹별로 점수의 정규성이 만족된다.

ShapiroResult(statistic=0.9715896670696531, pvalue=0.9051800443853569)
ShapiroResult(statistic=0.9499422438060351, pvalue=0.6678172590861611)
ShapiroResult(statistic=0.9299424104842702, pvalue=0.44732595113862045)
ShapiroResult(statistic=0.9065684572704982, pvalue=0.25824165549017347)


In [58]:
# Levene(레빈) 검정 - 등분산성
print(stats.levene(df[cond1]['scores'], df[cond2]['scores'], df[cond3]['scores'], df[cond4]['scores']))
# p-value > 0.05이므로 귀무가설 기각 실패

LeveneResult(statistic=1.757685352622062, pvalue=0.17270284963232108)


In [60]:
# 일원 분산 분석 - f_oneway
print(stats.f_oneway(df[cond1]['scores'], df[cond2]['scores'], df[cond3]['scores'], df[cond4]['scores']))

F_onewayResult(statistic=34.17427385892114, pvalue=1.2406415428510513e-10)


In [62]:
from statsmodels.formula.api import ols
model = ols('scores~groups', df).fit()

In [66]:
from statsmodels.stats.anova import anova_lm
print(anova_lm(model))
# p-value < 0.05 이므로 귀무가설 기각
# 그룹변수의 자유도 : 3
# 잔차의 자유도 : 36
# 성적의 제곱합 : 411.8
# 성적의 평균제곱 : 137.266667
# F-통계량 : 34.174274
# 성적에 대한 p-value : 1.240642e-10

            df  sum_sq     mean_sq          F        PR(>F)
groups     3.0   411.8  137.266667  34.174274  1.240642e-10
Residual  36.0   144.6    4.016667        NaN           NaN


5. 이원 분산 분석

In [69]:
import pandas as pd
df = pd.read_csv('tomato2.csv')

In [71]:
print(df.shape)
display(df.head())
display(df.tail())
print(df.info())

(36, 3)


Unnamed: 0,비료유형,물주기,수확량
0,A,1,514
1,A,1,480
2,A,1,507
3,A,2,452
4,A,2,526


Unnamed: 0,비료유형,물주기,수확량
31,C,3,515
32,C,3,522
33,C,4,507
34,C,4,511
35,C,4,521


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   비료유형    36 non-null     object
 1   물주기     36 non-null     int64 
 2   수확량     36 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 996.0+ bytes
None


In [83]:
# 이원 분산 분석
import statsmodels.api as sm
from statsmodels.formula.api import ols

model = ols('수확량~ C(비료유형)*C(물주기)', data=df).fit()
anova_table = sm.stats.anova_lm(model)
print(anova_table)

                  df        sum_sq      mean_sq         F    PR(>F)
C(비료유형)          2.0   5251.722222  2625.861111  3.184685  0.059334
C(물주기)           3.0   9057.000000  3019.000000  3.661490  0.026460
C(비료유형):C(물주기)   6.0   4271.833333   711.972222  0.863491  0.535426
Residual        24.0  19788.666667   824.527778       NaN       NaN


6. 적합도 검정

In [88]:
# 1. 교통사고 경험자 중 5회 이상의 비율
print(30/1000)

0.03


In [90]:
# 2~4. 적합도 검정
from scipy import stats
observed = [550, 250, 100, 70, 30]
expected = [1000*0.6, 1000*0.25, 1000*0.08, 1000*0.05, 1000*0.02]
print(stats.chisquare(observed, expected))

Power_divergenceResult(statistic=22.166666666666668, pvalue=0.00018567620386641424)


7. 독립성 검정

In [105]:
observed = pd.DataFrame([[50, 30], [60, 40]])
observed

Unnamed: 0,0,1
0,50,30
1,60,40


In [107]:
stats.chi2_contingency(observed)

Chi2ContingencyResult(statistic=0.03535714285714309, pvalue=0.8508492527705047, dof=1, expected_freq=array([[48.88888889, 31.11111111],
       [61.11111111, 38.88888889]]))

In [93]:
# 데이터
import pandas as pd
df = pd.DataFrame({
    '캠프': ['빅분기']*80 + ['정처기']*100,
    '등록여부':['등록']*50 + ['등록안함']*30 + ['등록']*60 + ['등록안함']*40
})

In [95]:
df.head()

Unnamed: 0,캠프,등록여부
0,빅분기,등록
1,빅분기,등록
2,빅분기,등록
3,빅분기,등록
4,빅분기,등록


In [99]:
df_table = pd.crosstab(df['캠프'], df['등록여부'])
df_table

등록여부,등록,등록안함
캠프,Unnamed: 1_level_1,Unnamed: 2_level_1
빅분기,50,30
정처기,60,40


In [103]:
from scipy import stats
print(stats.chi2_contingency(df_table))

Chi2ContingencyResult(statistic=0.03535714285714309, pvalue=0.8508492527705047, dof=1, expected_freq=array([[48.88888889, 31.11111111],
       [61.11111111, 38.88888889]]))


8. 다중 선형 회귀

In [112]:
import pandas as pd
df = pd.DataFrame({
    '할인율' : [28, 24, 13, 0, 27, 30, 10, 16, 6, 5, 7, 11, 11, 30, 25,
            4, 7, 24, 19, 21, 6, 10, 26, 13, 15, 6, 12, 6, 20, 2],
    '온도' : [15, 34, 15, 22, 29, 30, 14, 17, 28, 29, 19, 19, 34, 10,
           29, 28, 12, 25, 32, 28, 22, 16, 30, 11, 16, 18, 16, 33, 12, 22],
    '광고비' : [342, 666, 224, 764, 148, 499, 711, 596, 797, 484, 986, 347, 146, 362, 642,
            591, 846, 260, 560, 941, 469, 309, 730, 305, 892, 147, 887, 526, 525, 884],
    '주문량' : [635, 958, 525, 25, 607, 872, 858, 732, 1082, 863, 904, 686, 699, 615, 893,
            830, 856, 679, 918, 951, 789, 583, 988, 631, 866, 549, 910, 946, 647, 943]
})

In [114]:
print(df.shape)
df.head()

(30, 4)


Unnamed: 0,할인율,온도,광고비,주문량
0,28,15,342,635
1,24,34,666,958
2,13,15,224,525
3,0,22,764,25
4,27,29,148,607


In [116]:
from statsmodels.formula.api import ols
model = ols('주문량~할인율+온도+광고비', data=df).fit()

In [122]:
# 1. 상관계수
print(round(df['할인율'].corr(df['온도']),2))

0.09


In [124]:
# 2. 결정계수
print(round(model.rsquared, 2))

0.4


In [130]:
# 3. 회귀계수
print(round(model.params, 4))

Intercept    267.6609
할인율            4.2068
온도             9.4798
광고비            0.4148
dtype: float64


In [134]:
# 4. 절편
print(round(model.params['Intercept'], 4))

267.6609


In [142]:
# 5. 회귀계수 검정
print(round(model.pvalues['온도'], 4))

0.0289


In [150]:
# 6. 예측 판매량
new_data = pd.DataFrame({'할인율':[10], '온도':[20], '광고비':[500]})
result = model.predict(new_data)
print(int(result[0]))

706


In [154]:
# 7. 잔차 제곱합
df['잔차'] = df['주문량'] - model.predict(df)
round(sum(df['잔차']**2), 2)

732197.9

In [156]:
# 8. MSE
MSE = (df['잔차']**2).mean()
print(round(MSE, 4))

24406.5966


In [160]:
# 9. 90% 신뢰구간
print(model.conf_int(alpha=0.1))

                   0           1
Intercept  45.955720  489.366084
할인율        -1.847229   10.260887
온도          2.490702   16.468984
광고비         0.201064    0.628589


In [162]:
# 10. 새로운 데이터의 예측값의 90% 신뢰구간과 예측구간
new_data2 = pd.DataFrame({'할인율':[15], '온도':[25], '광고비':[300]})
pred = model.get_prediction(new_data)
result = pred.summary_frame(alpha=0.1)
print(result)

         mean    mean_se  mean_ci_lower  mean_ci_upper  obs_ci_lower  \
0  706.739338  36.456283     644.558848     768.919829    413.836923   

   obs_ci_upper  
0    999.641754  


In [164]:
# 11. 가설검정
cond = model.pvalues['광고비'] < 0.05
if cond:
    result = '기각'
else:
    result= '채택'
print(result)

기각


In [166]:
# 선형 회귀 모델의 요약 결과
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    주문량   R-squared:                       0.400
Model:                            OLS   Adj. R-squared:                  0.330
Method:                 Least Squares   F-statistic:                     5.770
Date:                Fri, 14 Nov 2025   Prob (F-statistic):            0.00366
Time:                        12:00:49   Log-Likelihood:                -194.11
No. Observations:                  30   AIC:                             396.2
Df Residuals:                      26   BIC:                             401.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    267.6609    129.985      2.059      0.0

9. 로지스틱 회귀

In [169]:
import pandas as pd
df = pd.read_csv('customer_travel.csv')

In [171]:
# 데이터 분할
midpoint = len(df) // 2
a = df.iloc[:midpoint]
b = df.iloc[midpoint:]

In [173]:
a.shape, b.shape

((400, 5), (400, 5))

In [179]:
# 유의하지 않은 독립변수 개수
from statsmodels.formula.api import logit
formula = "target ~ age + service + social + booked"
model = logit(formula, data=a).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.527521
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  400
Model:                          Logit   Df Residuals:                      395
Method:                           MLE   Df Model:                            4
Date:                Mon, 17 Nov 2025   Pseudo R-squ.:                 0.05254
Time:                        10:40:41   Log-Likelihood:                -211.01
converged:                       True   LL-Null:                       -222.71
Covariance Type:            nonrobust   LLR p-value:                 0.0001052
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.3314      1.204      1.937      0.053      -0.028       4.691
age           -0.1043      0.

In [195]:
model.pvalues[1:]

age        0.005413
service    0.567383
social     0.436256
booked     0.000445
dtype: float64

In [191]:
print(sum(model.pvalues[1:] >= 0.05))

2


In [197]:
# 수정된 모델에서 가장 큰 p-value를 가진 변수 구하기
formula2 = "target ~ age + booked"
model2 = logit(formula2, data=a).fit()
print(model2.summary())

Optimization terminated successfully.
         Current function value: 0.528581
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  400
Model:                          Logit   Df Residuals:                      397
Method:                           MLE   Df Model:                            2
Date:                Mon, 17 Nov 2025   Pseudo R-squ.:                 0.05064
Time:                        10:43:27   Log-Likelihood:                -211.43
converged:                       True   LL-Null:                       -222.71
Covariance Type:            nonrobust   LLR p-value:                 1.265e-05
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      2.4581      1.184      2.076      0.038       0.137       4.779
age           -0.1025      0.

In [205]:
print(model2.pvalues[1:].idxmax())

age


In [215]:
# 수정된 모델에서 독립변수 중 절댓값이 가장 큰 회귀계수를 가진 변수 구하기
print(abs(model2.params[1:]).idxmax())

booked


In [217]:
# 로그 우도
print(model2.llf)

-211.4323825144558


In [241]:
# 잔차 이탈도 : 로그 우도에 (-2)을 곱함
print(-2*model2.llf)

422.8647650289116


In [225]:
# 수정된 모델에서 'booked'가 3 증가할 때 오즈비 계산
import numpy as np
print(np.exp(model2.params['booked']*3))

0.058533122917711476


In [233]:
# p-value가 0.05보다 작은 회귀계수의 총합
print(model2.params[model2.pvalues < 0.05].sum())

1.4094682705861938


In [235]:
# 정확도
pred = model.predict(b)
pred = (pred > 0.5).astype(int)
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(b['target'], pred)
print(accuracy)

0.765


In [239]:
# 오류율
error_rate = 1 - accuracy
print(error_rate)

0.235
