In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import koreanize_matplotlib
import seaborn as sns
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


In [63]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ablearn/main/shopping-data.csv")
data.head()

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Genre                   200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [65]:
pf = ProfileReport(data)
pf.to_file("쇼핑.html")

Summarize dataset: 100%|████████████████████████████████████████████████████| 30/30 [00:03<00:00,  8.05it/s, Completed]
Generate report structure: 100%|█████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.92s/it]
Render HTML: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.65it/s]
Export report to file: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 199.92it/s]


# 계층적 군집 분석
* 비지도 학습
* 데이터가 적을 경우 사용
* 거리 계산 지표
* 단일연결법(최소연결법)
* 완전연결법(최장연결법)
* 평균연결법
* 중심연결법
* 와드연결법
* 덴드로그램으로 계층적 군집 시각화

In [66]:
import scipy.cluster.hierarchy as shc

In [67]:
data = pd.get_dummies(data, drop_first=True)
data.head()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male
0,1,19,15,39,True
1,2,21,15,81,True
2,3,20,16,6,False
3,4,23,16,77,False
4,5,31,17,40,False


In [68]:
data

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male
0,1,19,15,39,True
1,2,21,15,81,True
2,3,20,16,6,False
3,4,23,16,77,False
4,5,31,17,40,False
...,...,...,...,...,...
195,196,35,120,79,False
196,197,45,126,28,False
197,198,32,126,74,True
198,199,32,137,18,True


In [69]:
data2 = data.iloc[:, 3:5]

In [70]:
plt.switch_backend('TkAgg')

In [71]:
plt.figure(figsize=(30, 10))
plt.title("Customer Dendrograms_single")
dend = shc.dendrogram(shc.linkage(data2, method='single'))
plt.show()

In [72]:
plt.figure(figsize=(30, 10))
plt.title("Customer Dendrograms_complete")
dend = shc.dendrogram(shc.linkage(data2, method='complete'))
plt.show()

In [73]:
plt.figure(figsize=(30, 10))
plt.title("Customer Dendrograms_average")
dend = shc.dendrogram(shc.linkage(data2, method='average'))
plt.show()

In [74]:
plt.figure(figsize=(30, 10))
plt.title("Customer Dendrograms_centroid")
dend = shc.dendrogram(shc.linkage(data2, method='centroid'))
plt.show()

In [75]:
plt.figure(figsize=(30, 10))
plt.title("Customer Dendrograms_ward")
dend = shc.dendrogram(shc.linkage(data2, method='ward'))
plt.show()

In [76]:
dend

{'icoord': [[15.0, 15.0, 25.0, 25.0],
  [5.0, 5.0, 20.0, 20.0],
  [45.0, 45.0, 55.0, 55.0],
  [35.0, 35.0, 50.0, 50.0],
  [12.5, 12.5, 42.5, 42.5],
  [75.0, 75.0, 85.0, 85.0],
  [65.0, 65.0, 80.0, 80.0],
  [95.0, 95.0, 105.0, 105.0],
  [125.0, 125.0, 135.0, 135.0],
  [115.0, 115.0, 130.0, 130.0],
  [155.0, 155.0, 165.0, 165.0],
  [145.0, 145.0, 160.0, 160.0],
  [122.5, 122.5, 152.5, 152.5],
  [100.0, 100.0, 137.5, 137.5],
  [72.5, 72.5, 118.75, 118.75],
  [185.0, 185.0, 195.0, 195.0],
  [175.0, 175.0, 190.0, 190.0],
  [205.0, 205.0, 215.0, 215.0],
  [225.0, 225.0, 235.0, 235.0],
  [210.0, 210.0, 230.0, 230.0],
  [182.5, 182.5, 220.0, 220.0],
  [245.0, 245.0, 255.0, 255.0],
  [285.0, 285.0, 295.0, 295.0],
  [275.0, 275.0, 290.0, 290.0],
  [265.0, 265.0, 282.5, 282.5],
  [250.0, 250.0, 273.75, 273.75],
  [201.25, 201.25, 261.875, 261.875],
  [95.625, 95.625, 231.5625, 231.5625],
  [27.5, 27.5, 163.59375, 163.59375],
  [305.0, 305.0, 315.0, 315.0],
  [335.0, 335.0, 345.0, 345.0],
  [325.0

In [77]:
data2 = data.iloc[:, 3:5].values
data2[:3]

array([[39, True],
       [81, True],
       [6, False]], dtype=object)

In [78]:
from sklearn.cluster import AgglomerativeClustering

cluster = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
result = cluster.fit_predict(data2)
plt.figure(figsize=(10,5))
plt.scatter(data2[:,0], data2[:,1], c=cluster.labels_, cmap='rainbow')

<matplotlib.collections.PathCollection at 0x294e0abf790>

In [79]:
result

array([0, 3, 2, 4, 0, 4, 2, 3, 2, 4, 2, 3, 2, 4, 2, 4, 0, 4, 0, 3, 0, 4,
       2, 4, 2, 3, 0, 1, 0, 3, 2, 4, 2, 3, 2, 3, 2, 4, 0, 4, 0, 3, 0, 1,
       0, 4, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 3, 0, 4, 0, 3, 2, 4, 2, 4,
       0, 4, 2, 3, 2, 4, 2, 4, 2, 3, 0, 3, 2, 3, 0, 4, 0, 3, 2, 3, 0, 4,
       2, 3, 2, 4, 2, 4, 0, 3, 2, 3, 0, 4, 0, 3, 0, 4, 2, 4, 2, 3, 2, 3,
       2, 4, 2, 3, 0, 3, 2, 3, 0, 3, 0, 4, 2, 3, 0, 4, 2, 3, 2, 4, 0, 4,
       2, 3], dtype=int64)

In [80]:
data['cluster'] = result

In [81]:
data

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male,cluster
0,1,19,15,39,True,0
1,2,21,15,81,True,3
2,3,20,16,6,False,2
3,4,23,16,77,False,4
4,5,31,17,40,False,0
...,...,...,...,...,...,...
195,196,35,120,79,False,4
196,197,45,126,28,False,0
197,198,32,126,74,True,4
198,199,32,137,18,True,2


In [82]:
data[data['cluster']==0]

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male,cluster
0,1,19,15,39,True,0
4,5,31,17,40,False,0
16,17,35,21,35,False,0
18,19,52,23,29,True,0
20,21,35,24,35,True,0
26,27,45,28,32,False,0
28,29,40,29,31,False,0
38,39,36,37,26,False,0
40,41,65,38,35,False,0
42,43,48,39,36,True,0


In [83]:
data[data['cluster']==1]

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male,cluster
27,28,35,28,61,True,1
43,44,31,39,61,False,1
46,47,50,40,55,False,1
47,48,27,40,47,False,1
50,51,49,42,52,False,1
...,...,...,...,...,...,...
115,116,19,65,50,False,1
117,118,49,65,59,False,1
119,120,50,67,57,False,1
120,121,27,67,56,True,1


In [84]:
data[data['cluster']==2]

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male,cluster
2,3,20,16,6,False,2
6,7,35,18,6,False,2
8,9,64,19,3,True,2
10,11,67,19,14,True,2
12,13,58,20,15,False,2
14,15,37,20,13,True,2
22,23,46,25,5,False,2
24,25,54,28,14,False,2
30,31,60,30,4,True,2
32,33,53,33,4,True,2


In [85]:
data[data['cluster']==3]

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male,cluster
1,2,21,15,81,True,3
7,8,23,18,94,False,3
11,12,35,19,99,False,3
19,20,35,23,98,False,3
25,26,29,28,82,True,3
29,30,23,29,87,False,3
33,34,18,33,92,True,3
35,36,21,33,81,False,3
41,42,24,38,92,True,3
123,124,39,69,91,True,3


In [86]:
data[data['cluster']==4]

Unnamed: 0,CustomerID,Age,Annual Income (k$),Spending Score (1-100),Genre_Male,cluster
3,4,23,16,77,False,4
5,6,22,17,76,False,4
9,10,30,19,72,False,4
13,14,24,20,77,False,4
15,16,22,20,79,True,4
17,18,20,21,66,True,4
21,22,25,24,73,True,4
23,24,31,25,73,True,4
31,32,21,30,73,False,4
37,38,30,34,73,False,4


# 비계층적 군집 분석

## K-Means(K평균)을 사용한 군집 분석

* 통신사 고객 이동 데이터 분석
  * 독립변수(Feature) 20개: 회원ID, 성별, 고연령, 배우자, 피부양자, 가입기간, 전화서비스, 2회선이상, 인터넷서비스, 온라인보안, 온라인백업, 기기보호서비스, 기술지원, 스트리밍TV,스트리밍Movies, 약정옵션, 온라인고지서, 지불수단, 월요금, 합산요금
  * 종속변수(target) 1개: 이탈여부

In [12]:
data = pd.read_csv("https://raw.githubusercontent.com/haram4th/ADsP/main/06%EA%B3%A0%EA%B0%9D%EC%9D%B4%ED%83%88%EC%98%88%EC%B8%A1.csv", encoding='cp949')
data.head()

Unnamed: 0,회원ID,성별,고연령,배우자,피부양자,가입기간,전화서비스,2회선이상,인터넷서비스,온라인보안,...,기기보호서비스,기술지원,스트리밍TV,스트리밍Movies,약정옵션,온라인고지서,지불수단,월요금,합산요금,이탈여부
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [88]:
pf = ProfileReport(data)
pf.to_file("통신사고객이탈.html")

Summarize dataset:  81%|██████████████████████████████       | 26/32 [00:13<00:06,  1.06s/it, scatter 가입기간, 월요금]matrix]Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\hi\anaconda3\envs\testenv\lib\tkinter\__init__.py", line 1892, in __call__
    return self.func(*args)
  File "C:\Users\hi\anaconda3\envs\testenv\lib\site-packages\matplotlib\backends\_backend_tk.py", line 552, in destroy
    Gcf.destroy(self)
  File "C:\Users\hi\anaconda3\envs\testenv\lib\site-packages\matplotlib\_pylab_helpers.py", line 66, in destroy
    manager.destroy()
  File "C:\Users\hi\anaconda3\envs\testenv\lib\site-packages\matplotlib\backends\_backend_tk.py", line 569, in destroy
    self._window_dpi.trace_remove('write', self._window_dpi_cbname)
  File "C:\Users\hi\anaconda3\envs\testenv\lib\tkinter\__init__.py", line 430, in trace_remove
    self._tk.deletecommand(cbname)
_tkinter.TclError: can't delete Tcl command
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


Summarize dataset: 100%|████████████████████████████████████████████████████| 34/34 [00:15<00:00,  2.13it/s, Completed]
Generate report structure:   0%|                                                                 | 0/1 [00:10<?, ?it/s]


KeyboardInterrupt: 

In [13]:
data.head(2)

Unnamed: 0,회원ID,성별,고연령,배우자,피부양자,가입기간,전화서비스,2회선이상,인터넷서비스,온라인보안,...,기기보호서비스,기술지원,스트리밍TV,스트리밍Movies,약정옵션,온라인고지서,지불수단,월요금,합산요금,이탈여부
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   회원ID        7043 non-null   object 
 1   성별          7043 non-null   object 
 2   고연령         7043 non-null   int64  
 3   배우자         7043 non-null   object 
 4   피부양자        7043 non-null   object 
 5   가입기간        7043 non-null   int64  
 6   전화서비스       7043 non-null   object 
 7   2회선이상       7043 non-null   object 
 8   인터넷서비스      7043 non-null   object 
 9   온라인보안       7043 non-null   object 
 10  온라인백업       7043 non-null   object 
 11  기기보호서비스     7043 non-null   object 
 12  기술지원        7043 non-null   object 
 13  스트리밍TV      7043 non-null   object 
 14  스트리밍Movies  7043 non-null   object 
 15  약정옵션        7043 non-null   object 
 16  온라인고지서      7043 non-null   object 
 17  지불수단        7043 non-null   object 
 18  월요금         7043 non-null   float64
 19  합산요금        7043 non-null  

In [5]:
data['합산요금'] = data['합산요금'].astype(float)

ValueError: could not convert string to float: ''

In [15]:
data['합산요금'].value_counts()

합산요금
          11
20.2      11
19.75      9
20.05      8
19.9       8
          ..
6849.4     1
692.35     1
130.15     1
3211.9     1
6844.5     1
Name: count, Length: 6531, dtype: int64

In [16]:
na_index = data[data['합산요금'] == " "].index

In [18]:
data['합산요금'] = data.apply(lambda x: x['월요금'] if x['합산요금'] == " " else x['합산요금'], axis=1)

In [19]:
data.loc[na_index]

Unnamed: 0,회원ID,성별,고연령,배우자,피부양자,가입기간,전화서비스,2회선이상,인터넷서비스,온라인보안,...,기기보호서비스,기술지원,스트리밍TV,스트리밍Movies,약정옵션,온라인고지서,지불수단,월요금,합산요금,이탈여부
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,52.55,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,20.25,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,80.85,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,25.75,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,56.05,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,19.85,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,25.35,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,20.0,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,19.7,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,73.35,No
