## 기본 데이터, 라이브러리 로드

In [1]:
import numpy as np
import pandas as pd

# Read the feature and label tables; both CSV files are EUC-KR encoded.
df_x = pd.read_csv('data/x_train.csv', encoding='euc-kr')
df_y = pd.read_csv('data/y_train.csv', encoding='euc-kr')

# Join the two tables on the customer id, then replace every missing value
# with 0 so the tests and models below do not receive NaN.
df = df_x.merge(df_y, on='cust_id').fillna(0)

In [2]:
# Pull the '내점일수' column once, then split it into two samples by the gender
# flag (1 -> man, 0 -> woman). Both samples keep the original row index.
visit_days = df['내점일수']
man = visit_days[df['gender'] == 1]
woman = visit_days[df['gender'] == 0]

## 1. T 검정

- 단일 표본

In [None]:
from scipy.stats import ttest_1samp

# One-sample t-test: is the mean of `woman` equal to the hypothesised value?
avg = 18
result = ttest_1samp(woman, popmean=avg)
stat, pval = result.statistic, result.pvalue

# True means H0 is NOT rejected at the 5% significance level.
message = 'H0 : woman 평균은 {}이다 : {}'.format(avg, pval > 0.05)
print(message)

- 독립표본

In [None]:
from scipy.stats import ttest_ind

# Independent two-sample t-test (equal variances assumed) comparing the means
# of `man` and `woman`.
# The result must be bound here: otherwise the print below would report the
# stale `pval` left over from an earlier test instead of this one.
stat, pval = ttest_ind(man, woman, equal_var=True)

# True means H0 (no difference in means) is NOT rejected at the 5% level.
print('H0 : man과 woman의 평균에는 차이가 없다 : {}'.format(pval > 0.05))

- 대응표본

In [None]:
from scipy.stats import ttest_rel
# Paired (dependent-samples) t-test between the two samples; the returned
# result is left as the last expression of the cell.
# NOTE(review): ttest_rel expects two equal-length arrays of paired
# observations. `man` and `woman` are selected by different gender values, so
# they are presumably different customers and may differ in length -- confirm
# that a paired test is applicable to these inputs.
ttest_rel(man, woman)

## 2. 분산분석(ANOVA)

### 1) 일원분산분석(One-way ANOVA)
종속변수 1개, 독립변수 1개

- scipy.stats 이용

In [None]:
import scipy.stats as stats

# One-way ANOVA across the two groups; keep the F statistic and its p-value.
anova = stats.f_oneway(man, woman)
stat, pval = anova.statistic, anova.pvalue

# First line: the test result. Second line: True means H0 (equal means) is
# NOT rejected at the 5% significance level.
print(
    'F={0:.1f}, p={1:}'.format(stat, pval),
    'H0 : man과 woman의 평균에는 차이가 없다 : {}'.format(pval > 0.05),
    sep='\n',
)

- statsmodels 이용

In [None]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Fit an OLS model with '내점일수' as the response and gender as a categorical
# factor, then print the ANOVA table computed from the fitted model.
model = ols('내점일수 ~ C(gender)', df).fit()
anova_table = anova_lm(model)
print(anova_table)

- 사후검정

In [None]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Tukey HSD post-hoc comparison of '내점일수' between the gender groups.
# In the printed table, reject == True means the p-value is below 0.05,
# i.e. the difference in means is significant.
tukey = pairwise_tukeyhsd(endog=df['내점일수'], groups=df['gender'])
print(tukey)

## 3. 로짓분석

In [3]:
from statsmodels.formula.api import logit

# Logistic regression with gender as the response and five predictors.
formula = 'gender~총구매액+최대구매액+내점일수+구매주기+주말방문비율'

# fit() reports optimiser progress itself; the summary table is printed after.
logit_result = logit(formula, df).fit()
print(logit_result.summary())

Optimization terminated successfully.
         Current function value: 0.642669
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                 gender   No. Observations:                 3500
Model:                          Logit   Df Residuals:                     3494
Method:                           MLE   Df Model:                            5
Date:                Wed, 09 Mar 2022   Pseudo R-squ.:                 0.02931
Time:                        12:22:54   Log-Likelihood:                -2249.3
converged:                       True   LL-Null:                       -2317.3
Covariance Type:            nonrobust   LLR p-value:                 1.382e-27
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.3469      0.071     -4.867      0.000      -0.487      -0.207
총구매액       -1.238e-09   5.28e

In [4]:
import numpy as np

# Exponentiate a log-odds value so it can be read on the odds scale.
# NOTE(review): -2.569e-09 looks like a coefficient copied by hand from the
# logit summary above -- confirm which term it belongs to.
log_odds = -2.569e-09
np.exp(log_odds)

0.999999997431