In [1]:
from scipy.stats import chi2_contingency
import pandas as pd

# 데이터 예시 (성별에 따라 제품 선택에 차이가 있도록 설정)
df = pd.DataFrame({
    'gender': ['M','M','M','M','M','F','F','F','F','F'],
    'product': ['A','A','A','B','B','B','B','B','A','A']
})


In [2]:
# 교차표 생성
crosstab = pd.crosstab(df['gender'], df['product'])
crosstab


product,A,B
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,2,3
M,3,2


In [3]:
# # 카이제곱 검정
chi2, pval, dof, expected = chi2_contingency(crosstab)
print('Chi-square: ',chi2)
print('p-value: ',pval)
print('자유도: ', dof)
print('기대빈도: ',expected)


Chi-square:  0.0
p-value:  1.0
자유도:  1
기대빈도:  [[2.5 2.5]
 [2.5 2.5]]


In [4]:
# # 결과 해석
alpha = 0.05
if pval < alpha:
    print('→ 두 변수는 유의한 관계가 있습니다.')
else:
    print('→ 두 변수는 독립적입니다.')


→ 두 변수는 독립적입니다.


# ---------------------------------------

In [8]:
import pandas as pd

#1. CSV 파일 불러오기
df = pd.read_csv('train.csv')

df.info() ## 각 컬럼 파악

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29304 entries, 0 to 29303
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              29304 non-null  int64  
 1   age             29292 non-null  float64
 2   workclass       27642 non-null  object 
 3   fnlwgt          29304 non-null  int64  
 4   education       29304 non-null  object 
 5   education.num   29304 non-null  int64  
 6   marital.status  29304 non-null  object 
 7   occupation      27636 non-null  object 
 8   relationship    29304 non-null  object 
 9   race            29304 non-null  object 
 10  sex             29304 non-null  object 
 11  capital.gain    29304 non-null  int64  
 12  capital.loss    29304 non-null  int64  
 13  hours.per.week  29291 non-null  float64
 14  native.country  28767 non-null  object 
 15  income          29304 non-null  object 
dtypes: float64(2), int64(5), object(9)
memory usage: 3.6+ MB


In [9]:
# 2 target 분리
target = df.pop('income')

In [10]:
# 3 범주형 X 분리
categorical_cols = df.select_dtypes(include = ['object']).columns.tolist()
categorical_cols

['workclass',
 'education',
 'marital.status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'native.country']

In [15]:
# 4. 카이제곱 검정 수행 및 결과 저장
results = []

for col in categorical_cols:
    crosstab = pd.crosstab(df[col], target)
    chi2, p, dof, expected = chi2_contingency(crosstab)
    results.append({
        '변수명': col,
        'Chi2 통계량': round(chi2, 4),
        'p-value':round(p, 4),
        '자유도': dof,
        '유의성(p<0.05)': '유의함' if p < 0.05 else '유의하지 않음'
    })

# 6. DataFrame 생성 및 정렬
chi2_df = pd.DataFrame(results).sort_values('p-value', ascending=True).reset_index(drop=True)

# 7. 결과 출력
print("=== 카이제곱 검정 결과 (income 종속변수 기준) ===")
display(chi2_df)

=== 카이제곱 검정 결과 (income 종속변수 기준) ===


Unnamed: 0,변수명,Chi2 통계량,p-value,자유도,유의성(p<0.05)
0,workclass,733.5968,0.0,7,유의함
1,education,3949.6069,0.0,15,유의함
2,marital.status,5939.7594,0.0,6,유의함
3,occupation,3360.2003,0.0,13,유의함
4,relationship,6099.7599,0.0,5,유의함
5,race,298.2598,0.0,4,유의함
6,sex,1362.7943,0.0,1,유의함
7,native.country,284.2286,0.0,40,유의함
