In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ablearn/main/shopping-data.csv")
data

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Genre                   200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [4]:
data.describe(include='all')

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
count,200.0,200,200.0,200.0,200.0
unique,,2,,,
top,,Female,,,
freq,,112,,,
mean,100.5,,38.85,60.56,50.2
std,57.879185,,13.969007,26.264721,25.823522
min,1.0,,18.0,15.0,1.0
25%,50.75,,28.75,41.5,34.75
50%,100.5,,36.0,61.5,50.0
75%,150.25,,49.0,78.0,73.0


# 계층적 군집 분석
* 비지도학습
* 데이터가 적을 경우 사용
* 거리 계산 지표: 유클리드, 맨하탄, 코사인유사도
  * 단일연결법(single linkage)-최단연결법: 군집과 군집 사이에서 가장 가까운 데이터를 기준으로 묶음.
  * 완전연결법(complete linkage)-최장연결법: 군집과 군집 사이에서 가장 먼 데이터를 기준은 묶음.
  * 평균연결법(average linkage)- 군집과 군집 사이의 모든 데이터의 평균 데이터로 계산(이상치에 덜 민감하다)
  * 중심연결법(controid linkage)- 군집의 중심점 사이의 거리를 거리로 측정한 방법, 계산이 빠르다.
  * 와드연결법(ward linkage)- 군집내 오차가 최소가 되는 데이터로 계산, 군집내 분산이 최소. 조밀한 군집
* 계층적 군집의 시각화는 덴드로그램으로 한다.

In [5]:
import scipy.cluster.hierarchy as shc

In [6]:
data = pd.get_dummies(data, drop_first=True)
data

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male
0,1,19,15,39,True
1,2,21,15,81,True
2,3,20,16,6,False
3,4,23,16,77,False
4,5,31,17,40,False
...,...,...,...,...,...
195,196,35,120,79,False
196,197,45,126,28,False
197,198,32,126,74,True
198,199,32,137,18,True


In [7]:
data2 = data.copy()
data2

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male
0,1,19,15,39,True
1,2,21,15,81,True
2,3,20,16,6,False
3,4,23,16,77,False
4,5,31,17,40,False
...,...,...,...,...,...
195,196,35,120,79,False
196,197,45,126,28,False
197,198,32,126,74,True
198,199,32,137,18,True


In [8]:
data = data.drop("CustomerID", axis=1)
data

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Genre_Male
0,19,15,39,True
1,21,15,81,True
2,20,16,6,False
3,23,16,77,False
4,31,17,40,False
...,...,...,...,...
195,35,120,79,False
196,45,126,28,False
197,32,126,74,True
198,32,137,18,True


In [9]:
# plt.switch_backend('TkAgg')

In [88]:
# plt.figure(figsize=(20,10))
# plt.title("Single linkage Dendrogram")
# dend = shc.dendrogram(shc.linkage(data, method='single'))
# plt.show()

In [89]:
# plt.figure(figsize=(20,10))
# plt.title("Complete linkage Dendrogram")
# dend = shc.dendrogram(shc.linkage(data, method='complete'))
# plt.show()

In [91]:
# plt.figure(figsize=(20,10))
# plt.title("Average linkage Dendrogram")
# dend = shc.dendrogram(shc.linkage(data, method='average'))
# plt.show()

In [92]:
# plt.figure(figsize=(20,10))
# plt.title("Centroid linkage Dendrogram")
# dend = shc.dendrogram(shc.linkage(data, method='centroid'))
# plt.show()

In [14]:
# plt.figure(figsize=(20,10))
# plt.title("Ward linkage Dendrogram")
# dend = shc.dendrogram(shc.linkage(data, method='ward'))
# plt.show()

In [15]:
from sklearn.cluster import AgglomerativeClustering

In [16]:
data

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Genre_Male
0,19,15,39,True
1,21,15,81,True
2,20,16,6,False
3,23,16,77,False
4,31,17,40,False
...,...,...,...,...
195,35,120,79,False
196,45,126,28,False
197,32,126,74,True
198,32,137,18,True


In [17]:
# 클러스터링 수행
cluster = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='single')
result = cluster.fit_predict(data)



In [18]:
result

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,
       1, 0])

In [19]:
data['result'] = result
data

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Genre_Male,result
0,19,15,39,True,0
1,21,15,81,True,0
2,20,16,6,False,0
3,23,16,77,False,0
4,31,17,40,False,0
...,...,...,...,...,...
195,35,120,79,False,0
196,45,126,28,False,2
197,32,126,74,True,0
198,32,137,18,True,1


In [20]:
data.columns

Index(['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Genre_Male',
       'result'],
      dtype='object')

In [21]:
data.groupby('result')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)', 'Genre_Male']].mean()

Unnamed: 0_level_0,Age,Annual Income (k$),Spending Score (1-100),Genre_Male
result,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,38.812183,59.538071,50.649746,0.441624
1,32.0,137.0,18.0,1.0
2,46.0,123.0,22.0,0.0


In [22]:
data[data['result'] == 0]

Unnamed: 0,Age,Annual Income (k$),Spending Score (1-100),Genre_Male,result
0,19,15,39,True,0
1,21,15,81,True,0
2,20,16,6,False,0
3,23,16,77,False,0
4,31,17,40,False,0
...,...,...,...,...,...
192,33,113,8,True,0
193,38,113,91,False,0
195,35,120,79,False,0
197,32,126,74,True,0


# 비계층적 군집 - 데이터 수가 많을 때 사용
* k-means(k평균 군집): 거리를 기반으로 군집을 형성, 이상치에 민감함. 


In [23]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ADsP/main/06%EA%B3%A0%EA%B0%9D%EC%9D%B4%ED%83%88%EC%98%88%EC%B8%A1.csv", encoding='cp949')
data.head()

Unnamed: 0,회원ID,성별,고연령,배우자,피부양자,가입기간,전화서비스,2회선이상,인터넷서비스,온라인보안,...,기기보호서비스,기술지원,스트리밍TV,스트리밍Movies,약정옵션,온라인고지서,지불수단,월요금,합산요금,이탈여부
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


하향식방법 vs 상향식 방법

In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   회원ID        7043 non-null   object 
 1   성별          7043 non-null   object 
 2   고연령         7043 non-null   int64  
 3   배우자         7043 non-null   object 
 4   피부양자        7043 non-null   object 
 5   가입기간        7043 non-null   int64  
 6   전화서비스       7043 non-null   object 
 7   2회선이상       7043 non-null   object 
 8   인터넷서비스      7043 non-null   object 
 9   온라인보안       7043 non-null   object 
 10  온라인백업       7043 non-null   object 
 11  기기보호서비스     7043 non-null   object 
 12  기술지원        7043 non-null   object 
 13  스트리밍TV      7043 non-null   object 
 14  스트리밍Movies  7043 non-null   object 
 15  약정옵션        7043 non-null   object 
 16  온라인고지서      7043 non-null   object 
 17  지불수단        7043 non-null   object 
 18  월요금         7043 non-null   float64
 19  합산요금        7043 non-null  

In [25]:
data2 = data.copy()

In [26]:
data = data.drop('회원ID', axis=1)
data

Unnamed: 0,성별,고연령,배우자,피부양자,가입기간,전화서비스,2회선이상,인터넷서비스,온라인보안,온라인백업,기기보호서비스,기술지원,스트리밍TV,스트리밍Movies,약정옵션,온라인고지서,지불수단,월요금,합산요금,이탈여부
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [27]:
data['합산요금'] = data['합산요금'].apply(lambda x: x.replace(' ', '0') if x == ' ' else x)

In [28]:
data['합산요금'] = data['합산요금'].astype('float')

In [29]:
data = pd.get_dummies(data, drop_first=True)

In [30]:
from sklearn.cluster import KMeans

In [31]:
km = KMeans(n_clusters=5)
km.fit(data) 
result = km.labels_
result

array([4, 1, 4, ..., 4, 4, 2], dtype=int32)

In [32]:
data['cluster'] = result

In [33]:
data.head()

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,True,False,True,False,False,4
1,0,34,56.95,1889.5,True,False,False,True,False,False,...,False,False,True,False,False,False,False,True,False,1
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,True,False,False,True,True,4
3,0,45,42.3,1840.75,True,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,1
4,0,2,70.7,151.65,False,False,False,True,False,False,...,False,False,False,False,True,False,True,False,True,4


In [34]:
data.columns

Index(['고연령', '가입기간', '월요금', '합산요금', '성별_Male', '배우자_Yes', '피부양자_Yes',
       '전화서비스_Yes', '2회선이상_No phone service', '2회선이상_Yes',
       '인터넷서비스_Fiber optic', '인터넷서비스_No', '온라인보안_No internet service',
       '온라인보안_Yes', '온라인백업_No internet service', '온라인백업_Yes',
       '기기보호서비스_No internet service', '기기보호서비스_Yes',
       '기술지원_No internet service', '기술지원_Yes', '스트리밍TV_No internet service',
       '스트리밍TV_Yes', '스트리밍Movies_No internet service', '스트리밍Movies_Yes',
       '약정옵션_One year', '약정옵션_Two year', '온라인고지서_Yes',
       '지불수단_Credit card (automatic)', '지불수단_Electronic check',
       '지불수단_Mailed check', '이탈여부_Yes', 'cluster'],
      dtype='object')

In [35]:
data[['이탈여부_Yes', 'cluster']]

Unnamed: 0,이탈여부_Yes,cluster
0,False,4
1,False,1
2,True,4
3,False,1
4,True,4
...,...,...
7038,False,1
7039,False,2
7040,False,4
7041,True,4


In [36]:
from sklearn.metrics import accuracy_score

In [37]:
print(accuracy_score(data['이탈여부_Yes'], data['cluster']))

0.15831321879880733


In [38]:
data[data['cluster'] == 0]

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
12,0,58,100.35,5681.10,True,True,False,True,False,True,...,False,True,True,False,False,True,False,False,False,0
13,0,49,103.70,5036.30,True,False,False,True,False,True,...,False,True,False,False,True,False,False,False,True,0
26,0,47,99.35,4749.15,True,True,True,True,False,True,...,False,True,False,False,True,False,True,False,True,0
41,0,70,69.20,4872.35,False,True,True,True,False,True,...,False,False,False,True,True,True,False,False,False,0
43,0,63,79.85,4861.45,False,False,False,True,False,True,...,False,False,False,True,True,True,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6993,1,50,88.05,4367.35,False,True,False,True,False,True,...,False,False,False,False,True,False,True,False,True,0
6995,0,57,89.55,5012.35,True,False,True,True,False,True,...,False,True,False,True,False,False,False,True,False,0
7007,1,72,63.10,4685.55,True,True,False,False,True,False,...,False,True,False,True,True,False,False,False,False,0
7012,0,62,84.95,5150.55,False,True,False,True,False,True,...,False,True,False,True,True,False,True,False,False,0


In [39]:
data[data['cluster'] == 1]

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,True,False,False,False,False,True,False,1
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,1
6,0,22,89.10,1949.40,True,False,True,True,False,True,...,False,False,False,False,True,True,False,False,False,1
16,0,52,20.65,1022.95,False,False,False,True,False,False,...,True,False,True,False,False,False,False,True,False,1
19,0,21,90.05,1862.90,False,False,False,True,False,False,...,False,True,False,False,True,False,True,False,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7017,0,51,20.65,1020.75,False,False,False,True,False,False,...,True,False,False,True,False,False,False,False,False,1
7025,0,18,95.05,1679.40,False,False,False,True,False,True,...,False,True,False,False,True,False,False,False,False,1
7035,0,19,78.70,1495.10,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,1
7037,0,72,21.15,1419.40,False,False,False,True,False,False,...,True,False,False,True,True,False,False,False,False,1


In [40]:
data[data['cluster'] == 2]

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
15,0,69,113.25,7895.15,False,True,True,True,False,True,...,False,True,False,True,False,True,False,False,False,2
17,0,71,106.70,7382.25,True,False,True,True,False,True,...,False,True,False,True,False,False,False,False,False,2
28,0,72,90.25,6369.45,True,True,False,True,False,True,...,False,True,False,True,True,True,False,False,False,2
30,1,71,96.35,6766.95,False,True,False,True,False,True,...,False,False,False,True,True,True,False,False,False,2
35,0,72,99.90,7251.70,False,True,True,True,False,True,...,False,False,False,True,False,False,False,False,False,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7022,0,72,104.95,7544.30,True,False,False,True,False,True,...,False,True,True,False,True,False,True,False,False,2
7023,1,63,103.50,6479.40,False,True,False,True,False,True,...,False,True,False,False,True,False,True,False,False,2
7034,0,67,102.95,6886.25,False,False,False,True,False,True,...,False,False,False,False,True,True,False,False,True,2
7039,0,72,103.20,7362.90,False,True,True,True,False,True,...,False,True,True,False,True,True,False,False,False,2


In [41]:
data[data['cluster'] == 3]

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
8,0,28,104.80,3046.05,False,True,False,True,False,True,...,False,True,False,False,True,False,True,False,True,3
9,0,62,56.15,3487.95,True,False,True,True,False,False,...,False,False,True,False,False,False,False,False,False,3
14,0,25,105.50,2686.05,True,False,False,True,False,False,...,False,True,False,False,True,False,True,False,False,3
23,0,58,59.90,3505.10,False,True,False,True,False,True,...,False,False,False,True,True,True,False,False,False,3
24,0,49,59.60,2970.30,True,True,True,True,False,False,...,False,False,False,False,False,True,False,False,False,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7014,0,41,89.20,3645.75,True,False,False,True,False,True,...,False,False,False,False,True,False,True,False,False,3
7015,1,34,85.20,2874.45,True,True,False,True,False,False,...,False,False,False,False,True,True,False,False,False,3
7024,0,44,84.80,3626.35,True,True,False,True,False,True,...,False,False,False,False,True,True,False,False,False,3
7031,1,55,60.00,3316.10,True,True,False,True,False,True,...,False,False,True,False,False,True,False,False,False,3


In [42]:
data[data['cluster'] == 4]

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,True,False,True,False,False,4
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,True,False,False,True,True,4
4,0,2,70.70,151.65,False,False,False,True,False,False,...,False,False,False,False,True,False,True,False,True,4
5,0,8,99.65,820.50,False,False,False,True,False,True,...,False,True,False,False,True,False,True,False,True,4
7,0,10,29.75,301.90,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7030,0,2,20.05,39.25,False,False,False,True,False,False,...,True,False,False,False,True,False,False,True,False,4
7032,1,1,75.75,75.75,True,False,False,True,False,True,...,False,False,False,False,True,False,True,False,True,4
7036,0,12,60.65,743.30,False,False,False,False,True,False,...,False,True,True,False,False,False,True,False,False,4
7040,0,11,29.60,346.45,False,True,True,False,True,False,...,False,False,False,False,True,False,True,False,False,4


In [43]:
data.groupby('cluster')['이탈여부_Yes'].value_counts()

cluster  이탈여부_Yes
0        False        749
         True         140
1        False       1288
         True         366
2        False        564
         True          82
3        False        804
         True         227
4        False       1769
         True        1054
Name: count, dtype: int64

In [44]:
cluster1 = data[data['cluster'] == 0]
cluster1

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
12,0,58,100.35,5681.10,True,True,False,True,False,True,...,False,True,True,False,False,True,False,False,False,0
13,0,49,103.70,5036.30,True,False,False,True,False,True,...,False,True,False,False,True,False,False,False,True,0
26,0,47,99.35,4749.15,True,True,True,True,False,True,...,False,True,False,False,True,False,True,False,True,0
41,0,70,69.20,4872.35,False,True,True,True,False,True,...,False,False,False,True,True,True,False,False,False,0
43,0,63,79.85,4861.45,False,False,False,True,False,True,...,False,False,False,True,True,True,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6993,1,50,88.05,4367.35,False,True,False,True,False,True,...,False,False,False,False,True,False,True,False,True,0
6995,0,57,89.55,5012.35,True,False,True,True,False,True,...,False,True,False,True,False,False,False,True,False,0
7007,1,72,63.10,4685.55,True,True,False,False,True,False,...,False,True,False,True,True,False,False,False,False,0
7012,0,62,84.95,5150.55,False,True,False,True,False,True,...,False,True,False,True,True,False,True,False,False,0


In [45]:
cluster1.describe()

Unnamed: 0,고연령,가입기간,월요금,합산요금,cluster
count,889.0,889.0,889.0,889.0,889.0
mean,0.192351,58.656918,89.178515,5139.804499,0.0
std,0.394369,9.003025,12.899796,552.725085,0.0
min,0.0,38.0,58.35,4209.95,0.0
25%,0.0,52.0,79.95,4674.4,0.0
50%,0.0,58.0,89.8,5088.4,0.0
75%,0.0,67.0,99.75,5623.7,0.0
max,1.0,72.0,117.45,6118.95,0.0


In [46]:
cluster1['고연령'].value_counts()

고연령
0    718
1    171
Name: count, dtype: int64

In [47]:
cols = cluster1.columns

In [48]:
for col in cols:
    print(col)
    print(cluster1[col].value_counts())
    print()
    print()

고연령
고연령
0    718
1    171
Name: count, dtype: int64


가입기간
가입기간
72    83
56    44
71    36
52    36
70    36
54    35
53    32
49    31
60    30
68    30
50    29
61    29
55    29
57    27
58    27
63    27
64    27
51    26
66    26
59    26
67    25
62    25
46    23
65    22
47    21
48    20
69    19
45    19
43    14
41    12
42    10
44     8
40     3
39     1
38     1
Name: count, dtype: int64


월요금
월요금
99.0     5
100.3    5
99.8     5
80.6     5
79.4     5
        ..
105.4    1
69.4     1
86.9     1
83.6     1
101.4    1
Name: count, Length: 552, dtype: int64


합산요금
합산요금
5714.20    2
5731.85    2
4528.00    2
5597.65    2
5682.25    2
          ..
4367.35    1
5012.35    1
4685.55    1
5150.55    1
4326.25    1
Name: count, Length: 880, dtype: int64


성별_Male
성별_Male
True     455
False    434
Name: count, dtype: int64


배우자_Yes
배우자_Yes
True     607
False    282
Name: count, dtype: int64


피부양자_Yes
피부양자_Yes
False    591
True     298
Name: count, dtype: int64


전화서비스_Yes
전화서비스_Y

In [49]:
cols

Index(['고연령', '가입기간', '월요금', '합산요금', '성별_Male', '배우자_Yes', '피부양자_Yes',
       '전화서비스_Yes', '2회선이상_No phone service', '2회선이상_Yes',
       '인터넷서비스_Fiber optic', '인터넷서비스_No', '온라인보안_No internet service',
       '온라인보안_Yes', '온라인백업_No internet service', '온라인백업_Yes',
       '기기보호서비스_No internet service', '기기보호서비스_Yes',
       '기술지원_No internet service', '기술지원_Yes', '스트리밍TV_No internet service',
       '스트리밍TV_Yes', '스트리밍Movies_No internet service', '스트리밍Movies_Yes',
       '약정옵션_One year', '약정옵션_Two year', '온라인고지서_Yes',
       '지불수단_Credit card (automatic)', '지불수단_Electronic check',
       '지불수단_Mailed check', '이탈여부_Yes', 'cluster'],
      dtype='object')

In [50]:
cluster1.groupby('가입기간')['이탈여부_Yes'].mean()

가입기간
38    0.000000
39    1.000000
40    0.000000
41    0.666667
42    0.400000
43    0.642857
44    0.250000
45    0.052632
46    0.260870
47    0.380952
48    0.250000
49    0.354839
50    0.275862
51    0.192308
52    0.194444
53    0.281250
54    0.314286
55    0.241379
56    0.136364
57    0.148148
58    0.148148
59    0.153846
60    0.100000
61    0.068966
62    0.000000
63    0.000000
64    0.037037
65    0.181818
66    0.115385
67    0.120000
68    0.066667
69    0.052632
70    0.027778
71    0.000000
72    0.000000
Name: 이탈여부_Yes, dtype: float64

In [51]:
cluster1.groupby('이탈여부_Yes')['합산요금'].mean()

이탈여부_Yes
False    5151.198932
True     5078.844286
Name: 합산요금, dtype: float64

# 군집의 최적 개수 정하기
* elbow method
* 실루엣 지수

군집분석의 응집도(얼마나 모여 있는지 판단하는 지표)
* inertia_값이 작을 수록 군집이 잘 형성됨

Elbow method: inertia_(응집도)를 출력하고 시각화 한 뒤 응집도의 감소폭이 급격한 변화가 사라지는 지점을 최적 군집의 수로 채택하는 것

In [52]:
print(km.inertia_) #집단 내 제곱합. 급격하게 완만해지는 지점을 찾기

1330911014.1273248


In [53]:
distance = []
for i in range(1, 21):
    km = KMeans(n_clusters = i)
    km.fit(data)
    distance.append(km.inertia_)
distance

[36194988576.55949,
 7935352593.516737,
 3706752170.6181183,
 2117701371.868119,
 1330334591.1539938,
 920882249.2892963,
 674162673.7647347,
 527856944.5018771,
 417170602.5340221,
 329628236.4370811,
 270832227.4458141,
 230567740.04660666,
 196989464.24450958,
 169540810.15924752,
 152368237.25034818,
 133414504.77231243,
 118245582.97905028,
 106566704.6836487,
 96774535.73653355,
 87282382.5312982]

In [54]:
plt.figure(figsize=(20,10))
sns.lineplot(x= range(1, 21), y=distance, marker='o', markersize=10, markerfacecolor='red')
plt.show()

실루엣 지수를 출력해서 최적 군집 개수 구하기
* 실루엣 지수를 구하기 위해서는 최소 2개 이상의 군집이 필요

In [55]:
from sklearn.metrics import silhouette_score

In [56]:
silhouette_scores = []
for i in range(2, 21):
    km2 = KMeans(n_clusters= i)
    km2.fit(data)
    labels = km2.labels_
    silhouette_scores.append(silhouette_score(data, labels))
silhouette_scores   

[np.float64(0.7030784466128602),
 np.float64(0.6445551997372608),
 np.float64(0.6022828203450401),
 np.float64(0.5968831049802236),
 np.float64(0.5938934853059714),
 np.float64(0.5825212263220294),
 np.float64(0.5739378052715307),
 np.float64(0.5636155701961028),
 np.float64(0.5632690360488533),
 np.float64(0.5617983345990608),
 np.float64(0.560743227414357),
 np.float64(0.5471566525289361),
 np.float64(0.549901778215374),
 np.float64(0.5456902189888262),
 np.float64(0.545102495098368),
 np.float64(0.5301381332983107),
 np.float64(0.5298918077771623),
 np.float64(0.5345682326704141),
 np.float64(0.5248981495257409)]

In [57]:
plt.figure(figsize=(20,10))
sns.lineplot(x= range(2, 21), y=silhouette_scores, marker='o', markersize=10, markerfacecolor='red')
plt.show()

In [58]:
data

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,True,False,True,False,False,4
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,True,False,False,False,False,True,False,1
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,True,False,False,True,True,4
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,1
4,0,2,70.70,151.65,False,False,False,True,False,False,...,False,False,False,False,True,False,True,False,True,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,True,True,True,True,False,True,...,False,True,True,False,True,False,False,True,False,1
7039,0,72,103.20,7362.90,False,True,True,True,False,True,...,False,True,True,False,True,True,False,False,False,2
7040,0,11,29.60,346.45,False,True,True,False,True,False,...,False,False,False,False,True,False,True,False,False,4
7041,1,4,74.40,306.60,True,True,False,True,False,True,...,False,False,False,False,True,False,False,True,True,4


# K-means 로 군집분석을 할 때 주의점
* k-means는 평균값을 이용하고 주로 유클리드 거리를 사용하기 때문에 이상치에 민감
* 군집분석하기 전에 데이터의 스케일을 미리 맞춰주는 것이 중요

In [59]:
data3 = data.drop('cluster', axis=1)
data3

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍TV_Yes,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,True,False
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,True,False,False,True,True
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,False,True,False,False,False,False,False,False
4,0,2,70.70,151.65,False,False,False,True,False,False,...,False,False,False,False,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,True,True,True,True,False,True,...,True,False,True,True,False,True,False,False,True,False
7039,0,72,103.20,7362.90,False,True,True,True,False,True,...,True,False,True,True,False,True,True,False,False,False
7040,0,11,29.60,346.45,False,True,True,False,True,False,...,False,False,False,False,False,True,False,True,False,False
7041,1,4,74.40,306.60,True,True,False,True,False,True,...,False,False,False,False,False,True,False,False,True,True


In [60]:
sns.boxplot(data3['합산요금'])

<Axes: ylabel='합산요금'>

In [61]:
sns.boxplot(data3['월요금'])

<Axes: ylabel='합산요금'>

In [62]:
X = data3.drop('이탈여부_Yes', axis=1)

In [63]:
X

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍TV_No internet service,스트리밍TV_Yes,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,False,False,True,False,True,False
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,True
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,True
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,False,False,True,False,False,False,False,False
4,0,2,70.70,151.65,False,False,False,True,False,False,...,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,True,True,True,True,False,True,...,False,True,False,True,True,False,True,False,False,True
7039,0,72,103.20,7362.90,False,True,True,True,False,True,...,False,True,False,True,True,False,True,True,False,False
7040,0,11,29.60,346.45,False,True,True,False,True,False,...,False,False,False,False,False,False,True,False,True,False
7041,1,4,74.40,306.60,True,True,False,True,False,True,...,False,False,False,False,False,False,True,False,False,True


In [64]:
from sklearn.preprocessing import MinMaxScaler

In [65]:
mm = MinMaxScaler()
scaled_X = mm.fit_transform(X)
scaled_X

array([[0.        , 0.01388889, 0.11542289, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.47222222, 0.38507463, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.02777778, 0.35422886, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.        , 0.15277778, 0.11293532, ..., 0.        , 1.        ,
        0.        ],
       [1.        , 0.05555556, 0.55870647, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.91666667, 0.86965174, ..., 0.        , 0.        ,
        0.        ]])

In [66]:
scaled_X = pd.DataFrame(scaled_X, columns=X.columns)
scaled_X

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍TV_No internet service,스트리밍TV_Yes,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check
0,0.0,0.013889,0.115423,0.003437,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.472222,0.385075,0.217564,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.027778,0.354229,0.012453,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.625000,0.239303,0.211951,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.027778,0.521891,0.017462,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.333333,0.662189,0.229194,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
7039,0.0,1.000000,0.845274,0.847792,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
7040,0.0,0.152778,0.112935,0.039892,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
7041,1.0,0.055556,0.558706,0.035303,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [67]:
km4 = KMeans(n_clusters=2)
km4.fit(scaled_X)
result4 = km4.labels_

In [68]:
result4

array([1, 1, 1, ..., 1, 1, 1], dtype=int32)

In [69]:
scaled_X['cluster'] = result4
scaled_X

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍TV_Yes,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,cluster
0,0.0,0.013889,0.115423,0.003437,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
1,0.0,0.472222,0.385075,0.217564,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
2,0.0,0.027778,0.354229,0.012453,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
3,0.0,0.625000,0.239303,0.211951,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.027778,0.521891,0.017462,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.333333,0.662189,0.229194,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1
7039,0.0,1.000000,0.845274,0.847792,0.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1
7040,0.0,0.152778,0.112935,0.039892,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
7041,1.0,0.055556,0.558706,0.035303,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1


In [70]:
print(accuracy_score(data3['이탈여부_Yes'], result4))

0.4499503052676416


In [71]:
distance2 =[]
silhouette_scores2 = []
for i in range(2,21):
    km4 = KMeans(n_clusters=i, random_state=77)
    km4.fit(scaled_X)
#     result4 = km4.labels_
    distance2.append(km4.inertia_)
    silhouette_scores2.append(silhouette_score(scaled_X, labels))
distance2

[27290.878957910092,
 24175.3732987248,
 22523.637339941695,
 21926.15780847342,
 20982.937145125477,
 20363.119074509385,
 19728.81350601308,
 19453.71912425688,
 19229.11304273256,
 18692.768203401152,
 18338.034205145017,
 18041.726193032304,
 17784.85886015354,
 17522.398465635986,
 17182.62324110495,
 17118.195972001355,
 16834.68350377159,
 16682.20063004888,
 16537.305385907806]

In [72]:
plt.figure(figsize=(20,10))
sns.lineplot(x= range(2, 21), y=distance2, marker='o', markersize=10, markerfacecolor='red')
plt.show()

  func(*args)
  func(*args)
  func(*args)
  func(*args)


In [73]:
silhouette_scores2 = []
for i in range(2,21):
    km4 = KMeans(n_clusters=i, random_state=77)
    km4.fit(scaled_X)
    labels = km4.labels_
    silhouette_scores2.append(silhouette_score(scaled_X, labels))
silhouette_scores2

[np.float64(0.33537259240562683),
 np.float64(0.20486830809453152),
 np.float64(0.1874462441457756),
 np.float64(0.1239388923411754),
 np.float64(0.11212497473696784),
 np.float64(0.1056040166032437),
 np.float64(0.10595861576122348),
 np.float64(0.10333168786143268),
 np.float64(0.10252590438776678),
 np.float64(0.09813459043840268),
 np.float64(0.09554011304360674),
 np.float64(0.09478571845039245),
 np.float64(0.09389124236738705),
 np.float64(0.09214133551555398),
 np.float64(0.0993003464916869),
 np.float64(0.09370838650931634),
 np.float64(0.09641347329665709),
 np.float64(0.09072021196392067),
 np.float64(0.0912310451093464)]

In [74]:
plt.figure(figsize=(20,10))
sns.lineplot(x= range(2, 21), y=silhouette_scores2, marker='o', markersize=10, markerfacecolor='red')
plt.show()

DBSCAN: 밀도기반 군집분석 알고리즘

In [75]:
from sklearn.cluster import DBSCAN

In [76]:
data3['cluster'] = labels

In [77]:
scaled_X

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍TV_Yes,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,cluster
0,0.0,0.013889,0.115423,0.003437,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
1,0.0,0.472222,0.385075,0.217564,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
2,0.0,0.027778,0.354229,0.012453,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
3,0.0,0.625000,0.239303,0.211951,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.027778,0.521891,0.017462,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.333333,0.662189,0.229194,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1
7039,0.0,1.000000,0.845274,0.847792,0.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1
7040,0.0,0.152778,0.112935,0.039892,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
7041,1.0,0.055556,0.558706,0.035303,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1


In [78]:
data3

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,이탈여부_Yes,cluster
0,0,1,29.85,29.85,False,True,False,False,True,False,...,False,False,False,False,True,False,True,False,False,17
1,0,34,56.95,1889.50,True,False,False,True,False,False,...,False,False,True,False,False,False,False,True,False,7
2,0,2,53.85,108.15,True,False,False,True,False,False,...,False,False,False,False,True,False,False,True,True,16
3,0,45,42.30,1840.75,True,False,False,False,True,False,...,False,False,True,False,False,False,False,False,False,10
4,0,2,70.70,151.65,False,False,False,True,False,False,...,False,False,False,False,True,False,True,False,True,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,True,True,True,True,False,True,...,False,True,True,False,True,False,False,True,False,12
7039,0,72,103.20,7362.90,False,True,True,True,False,True,...,False,True,True,False,True,True,False,False,False,2
7040,0,11,29.60,346.45,False,True,True,False,True,False,...,False,False,False,False,True,False,True,False,False,17
7041,1,4,74.40,306.60,True,True,False,True,False,True,...,False,False,False,False,True,False,False,True,True,5


In [79]:
final_df = scaled_X.copy()
final_df

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍TV_Yes,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check,cluster
0,0.0,0.013889,0.115423,0.003437,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
1,0.0,0.472222,0.385075,0.217564,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
2,0.0,0.027778,0.354229,0.012453,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
3,0.0,0.625000,0.239303,0.211951,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.027778,0.521891,0.017462,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.333333,0.662189,0.229194,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1
7039,0.0,1.000000,0.845274,0.847792,0.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1
7040,0.0,0.152778,0.112935,0.039892,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1
7041,1.0,0.055556,0.558706,0.035303,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1


In [80]:
scaled_X = scaled_X.drop('cluster', axis=1)
scaled_X

Unnamed: 0,고연령,가입기간,월요금,합산요금,성별_Male,배우자_Yes,피부양자_Yes,전화서비스_Yes,2회선이상_No phone service,2회선이상_Yes,...,스트리밍TV_No internet service,스트리밍TV_Yes,스트리밍Movies_No internet service,스트리밍Movies_Yes,약정옵션_One year,약정옵션_Two year,온라인고지서_Yes,지불수단_Credit card (automatic),지불수단_Electronic check,지불수단_Mailed check
0,0.0,0.013889,0.115423,0.003437,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.472222,0.385075,0.217564,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.027778,0.354229,0.012453,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.625000,0.239303,0.211951,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.027778,0.521891,0.017462,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0.0,0.333333,0.662189,0.229194,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
7039,0.0,1.000000,0.845274,0.847792,0.0,1.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
7040,0.0,0.152778,0.112935,0.039892,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
7041,1.0,0.055556,0.558706,0.035303,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [81]:
model = DBSCAN(eps=2, min_samples=5)
DB_result = model.fit_predict(scaled_X)
DB_result

array([0, 0, 0, ..., 0, 0, 0])

In [82]:
final_df['db_cluster'] = DB_result

In [83]:
final_df['db_cluster'].value_counts()

db_cluster
0    5517
1    1526
Name: count, dtype: int64

In [84]:
final_df[['cluster','db_cluster']].head(60)

Unnamed: 0,cluster,db_cluster
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,1,0
6,1,0
7,1,0
8,1,0
9,1,0


In [85]:
final_df['db_cluster'] = final_df['db_cluster'].apply(lambda x: 1 if x == 0 else 0)

In [86]:
final_df[['cluster','db_cluster']].head(60)

Unnamed: 0,cluster,db_cluster
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [87]:
accuracy_score(final_df['cluster'], final_df['db_cluster'])

1.0