# 목적
- 마케팅 전략 수립을 위한 고객 군집화
- 군집별 고객의 특성 파악

In [40]:
# 기본 패키지 불러오기
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import koreanize_matplotlib

# 데이터 준비
- 출처
    - https://www.kaggle.com/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python
    - 원본 데이터셋의 컬럼명을 한글화하였음
- 단위
    - 연간소득 : 천달러
    - 소비점수 : 1~100점

In [41]:
# Load the customer dataset from the local CSV file and preview the first rows.
csv_path = "data/쇼핑몰고객군집화.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,고객ID,성별,나이,연간소득,소비점수
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


# 훈련에 사용할 변수

In [42]:
# Feature matrix: every column of df except the customer ID.
X = df.drop(columns=['고객ID'])

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
import numpy as np

# -----------------------------------------
# 1) Split the feature columns by dtype
# -----------------------------------------
# Object-dtype columns go to the one-hot encoder; numeric-dtype columns go to the scaler.
cat_cols = X.select_dtypes(include='object').columns
num_cols = X.select_dtypes(include='number').columns

# -----------------------------------------
# 2) Preprocessing: one-hot encoding and standardization
# -----------------------------------------
# NOTE(review): drop='first' combined with handle_unknown='ignore' encodes a
# category unseen during fit as all zeros, which is the same encoding as the
# dropped first category -- confirm this is acceptable for new data.
onehot_encoder = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
)
standard_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ('cat', onehot_encoder, cat_cols),   # categorical columns: one-hot encoded
        ('num', standard_scaler, num_cols),  # numeric columns: standardized
    ],
    remainder='passthrough',  # columns not listed above are passed through unchanged
)

# -----------------------------------------
# 3) Pipeline: preprocessing followed by the clustering model
# -----------------------------------------
kmeans_model = KMeans(n_clusters=6, n_init=20, random_state=42)
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('cluster', kmeans_model),
])

# -----------------------------------------
# 4) Fit the whole pipeline on the feature matrix
# -----------------------------------------
pipe.fit(X)



0,1,2
,steps,"[('preprocessor', ...), ('cluster', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_clusters,6
,init,'k-means++'
,n_init,20
,max_iter,300
,tol,0.0001
,verbose,0
,random_state,42
,copy_x,True
,algorithm,'lloyd'


# 군집 결과

In [44]:
# Attach the cluster label the fitted KMeans step assigned to each row as a new column.
fitted_kmeans = pipe.named_steps['cluster']
df['군집'] = fitted_kmeans.labels_
df

Unnamed: 0,고객ID,성별,나이,연간소득,소비점수,군집
0,1,Male,19,15,39,4
1,2,Male,21,15,81,4
2,3,Female,20,16,6,2
3,4,Female,23,16,77,4
4,5,Female,31,17,40,2
...,...,...,...,...,...,...
195,196,Female,35,120,79,1
196,197,Female,45,126,28,3
197,198,Male,32,126,74,1
198,199,Male,32,137,18,3


# 파이프라인 저장

In [45]:
import os

import joblib

# Persist the fitted pipeline (preprocessing + KMeans) to disk.
model_path = 'model/customer_clustering_pipeline.pkl'
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing; exist_ok keeps re-runs harmless.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipe, model_path)


['model/customer_clustering_pipeline.pkl']