<a href="https://colab.research.google.com/github/kszymon/machine-learning-bootcamp/blob/main/unsupervised%20/05_case_studies/01_customer_segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn
Strona biblioteki: [https://scikit-learn.org](https://scikit-learn.org)  

Dokumentacja/User Guide: [https://scikit-learn.org/stable/user_guide.html](https://scikit-learn.org/stable/user_guide.html)

Podstawowa biblioteka do uczenia maszynowego w języku Python.

Aby zainstalować bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install scikit-learn
```
Aby zaktualizować do najnowszej wersji bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install --upgrade scikit-learn
```

### Spis treści:
1. [Import bibliotek](#0)
2. [Załadowanie danych](#1)
3. [Eksploracja danych](#2)
4. [Wyznacznie retencji klienta](#3)
5. [Retencja - KMeans](#4)
6. [Retencja - DBSCAN](#5)
7. [Sprzedaż](#6)
8. [Sprzedaż - KMeans](#7)
9. [Sprzedaż - DBSCAN](#8)
10. [Retencja, sprzedaż - KMeans](#9)

### <a name='0'></a> Import bibliotek

In [67]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

### <a name='1'></a> Załadowanie danych

In [68]:
url = 'https://storage.googleapis.com/esmartdata-courses-files/ml-course/OnlineRetail.csv'
raw_data = pd.read_csv(url, encoding='latin', parse_dates=['InvoiceDate'])
data = raw_data.copy()
data.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom


### <a name='2'></a> Eksploracja danych

In [69]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [70]:
data.describe()

Unnamed: 0,Quantity,InvoiceDate,UnitPrice,CustomerID
count,541909.0,541909,541909.0,406829.0
mean,9.55225,2011-07-04 13:34:57.156386048,4.611114,15287.69057
min,-80995.0,2010-12-01 08:26:00,-11062.06,12346.0
25%,1.0,2011-03-28 11:34:00,1.25,13953.0
50%,3.0,2011-07-19 17:17:00,2.08,15152.0
75%,10.0,2011-10-19 11:27:00,4.13,16791.0
max,80995.0,2011-12-09 12:50:00,38970.0,18287.0
std,218.081158,,96.759853,1713.600303


In [71]:
data.describe(include='object')

Unnamed: 0,InvoiceNo,StockCode,Description,Country
count,541909,541909,540455,541909
unique,25900,4070,4223,38
top,573585,85123A,WHITE HANGING HEART T-LIGHT HOLDER,United Kingdom
freq,1114,2313,2369,495478


In [72]:
data.describe(include=['datetime'])

Unnamed: 0,InvoiceDate
count,541909
mean,2011-07-04 13:34:57.156386048
min,2010-12-01 08:26:00
25%,2011-03-28 11:34:00
50%,2011-07-19 17:17:00
75%,2011-10-19 11:27:00
max,2011-12-09 12:50:00


In [73]:
data.isnull().sum()

Unnamed: 0,0
InvoiceNo,0
StockCode,0
Description,1454
Quantity,0
InvoiceDate,0
UnitPrice,0
CustomerID,135080
Country,0


In [74]:
# Usunięcie braków
data = data.dropna()
data.isnull().sum()

Unnamed: 0,0
InvoiceNo,0
StockCode,0
Description,0
Quantity,0
InvoiceDate,0
UnitPrice,0
CustomerID,0
Country,0


In [75]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406829 non-null  object        
 1   StockCode    406829 non-null  object        
 2   Description  406829 non-null  object        
 3   Quantity     406829 non-null  int64         
 4   InvoiceDate  406829 non-null  datetime64[ns]
 5   UnitPrice    406829 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      406829 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [76]:
data['Country'].value_counts()

Unnamed: 0_level_0,count
Country,Unnamed: 1_level_1
United Kingdom,361878
Germany,9495
France,8491
EIRE,7485
Spain,2533
Netherlands,2371
Belgium,2069
Switzerland,1877
Portugal,1480
Australia,1259


In [77]:
tmp = data['Country'].value_counts().nlargest(10).reset_index()
tmp.columns = ['Country', 'Count']
px.bar(tmp, x='Country', y='Count', template='plotly_dark', color_discrete_sequence=['#03fcb5'],
       title='Częstotliwość zakupów ze względu na kraj', width=950, height=450)

In [78]:
# Obcięcie tylko do United Kingdom
data_uk = data.query("Country == 'United Kingdom'").copy()
data_uk.info()

<class 'pandas.core.frame.DataFrame'>
Index: 361878 entries, 0 to 541893
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    361878 non-null  object        
 1   StockCode    361878 non-null  object        
 2   Description  361878 non-null  object        
 3   Quantity     361878 non-null  int64         
 4   InvoiceDate  361878 non-null  datetime64[ns]
 5   UnitPrice    361878 non-null  float64       
 6   CustomerID   361878 non-null  float64       
 7   Country      361878 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 24.8+ MB


In [79]:
# Utworzenie nowej zmiennej Sales
data_uk['Sales'] = data_uk['Quantity'] * data_uk['UnitPrice']
data_uk.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Sales
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [80]:
# Utworzenie nowej zmiennej Sales
data_uk['Sales'] = data_uk['Quantity'] * data_uk['UnitPrice']
data_uk.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Sales
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [81]:
# Częstotliwość zakupów ze względu na dzień
tmp = data_uk.groupby(data_uk['InvoiceDate'].dt.date)['CustomerID'].count().reset_index()
tmp.columns = ['InvoiceDate', 'Count']
tmp.head()

Unnamed: 0,InvoiceDate,Count
0,2010-12-01,1809
1,2010-12-02,2029
2,2010-12-03,937
3,2010-12-05,2492
4,2010-12-06,1915


In [82]:
from plotly.subplots import make_subplots
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)

trace1 = px.line(tmp, x='InvoiceDate', y='Count', template='plotly_dark', color_discrete_sequence=['#03fcb5'])['data'][0]
trace2 = px.scatter(tmp, x='InvoiceDate', y='Count', template='plotly_dark', color_discrete_sequence=['#03fcb5'])['data'][0]

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=2, col=1)
fig.update_layout(template='plotly_dark', title='Częstotliość zakupów ze względu na dzień', width=950)
fig.show()

In [83]:
data_uk.head(3)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Sales
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0


In [84]:
# Łączna sprzedaż ze względu na dzień
tmp = data_uk.groupby(data_uk['InvoiceDate'].dt.date)['Sales'].sum().reset_index()
tmp.columns = ['InvoiceDate', 'Sales']
tmp.head()

Unnamed: 0,InvoiceDate,Sales
0,2010-12-01,42030.85
1,2010-12-02,45622.08
2,2010-12-03,17512.44
3,2010-12-05,25458.85
4,2010-12-06,29007.74


In [85]:
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)

trace1 = px.line(tmp, x='InvoiceDate', y='Sales', template='plotly_dark', color_discrete_sequence=['#03fcb5'])['data'][0]
trace2 = px.scatter(tmp, x='InvoiceDate', y='Sales', template='plotly_dark', color_discrete_sequence=['#03fcb5'])['data'][0]

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=2, col=1)
fig.update_layout(template='plotly_dark', title='Łączna sprzedaż ze względu na dzień', width=950)
fig.show()

### <a name='3'></a> Wyznacznie retencji klienta

In [86]:
# wydobycie unikalnych wartości CustomerID
data_user = pd.DataFrame(data['CustomerID'].unique(), columns=['CustomerID'])
data_user.head(3)

Unnamed: 0,CustomerID
0,17850.0
1,13047.0
2,12583.0


In [87]:
# Wydobycie daty ostatniego zakupu dla każdego klienta
last_purchase = data.groupby('CustomerID')['InvoiceDate'].max().reset_index()
last_purchase.columns = ['CustomerID', 'LastPurchaseDate']
last_purchase.head(3)

Unnamed: 0,CustomerID,LastPurchaseDate
0,12346.0,2011-01-18 10:17:00
1,12347.0,2011-12-07 15:52:00
2,12348.0,2011-09-25 13:13:00


In [88]:
# wyznaczenie retencji jako liczby dni od daty ostatniego kupna klienta do maksymalnej (ostatniej) daty kupna w danych
last_purchase['Retention'] = (last_purchase['LastPurchaseDate'].max() - last_purchase['LastPurchaseDate']).dt.days
last_purchase.head()

Unnamed: 0,CustomerID,LastPurchaseDate,Retention
0,12346.0,2011-01-18 10:17:00,325
1,12347.0,2011-12-07 15:52:00,1
2,12348.0,2011-09-25 13:13:00,74
3,12349.0,2011-11-21 09:51:00,18
4,12350.0,2011-02-02 16:01:00,309


In [89]:
last_purchase['Retention'].value_counts()

Unnamed: 0_level_0,count
Retention,Unnamed: 1_level_1
3,127
0,105
8,105
2,100
1,92
...,...
243,1
174,1
285,1
250,1


In [90]:
px.histogram(last_purchase, x='Retention', template='plotly_dark',
             width=950, height=500, title='Retention', nbins=100,
             color_discrete_sequence=['#03fcb5'])

In [91]:
# Połączenie CustomerID oraz retencji
data_user = pd.merge(data_user, last_purchase, on='CustomerID')
data_user = data_user[['CustomerID', 'Retention']]
data_user.head()

Unnamed: 0,CustomerID,Retention
0,17850.0,301
1,13047.0,31
2,12583.0,2
3,13748.0,95
4,15100.0,329


In [92]:
px.scatter(data_user, x='CustomerID', y='Retention', template='plotly_dark', width=950,
           color_discrete_sequence=['#03fcb5'])

In [93]:
data_retention = data_user[['Retention']]
data_retention.head()

Unnamed: 0,Retention
0,301
1,31
2,2
3,95
4,329


In [94]:
# Standaryzacja danych
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_user['RetentionScaled'] = scaler.fit_transform(data_retention)
data_user.head()

Unnamed: 0,CustomerID,Retention,RetentionScaled
0,17850.0,301,2.083819
1,13047.0,31,-0.595978
2,12583.0,2,-0.883808
3,13748.0,95,0.039233
4,15100.0,329,2.361724


In [95]:
px.scatter(data_user, x='CustomerID', y='RetentionScaled', template='plotly_dark', width=950,
           color_discrete_sequence=['#03fcb5'])

In [96]:
data_retention_scaled = data_user[['RetentionScaled']]
data_retention_scaled.head()

Unnamed: 0,RetentionScaled
0,2.083819
1,-0.595978
2,-0.883808
3,0.039233
4,2.361724


### <a name='4'></a> Retencja - KMeans

In [97]:
from sklearn.cluster import KMeans

wcss = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters=i, max_iter=1000)
    kmeans.fit(data_retention_scaled)
    wcss.append(kmeans.inertia_)

wcss = pd.DataFrame(data=np.c_[range(1, 10), wcss], columns=['NumberOfClusters', 'WCSS'])
wcss

Unnamed: 0,NumberOfClusters,WCSS
0,1.0,4372.0
1,2.0,898.363553
2,3.0,424.626826
3,4.0,241.544529
4,5.0,149.299675
5,6.0,109.033615
6,7.0,80.568882
7,8.0,60.305806
8,9.0,49.643892


In [98]:
fig = px.line(wcss, x='NumberOfClusters', y='WCSS', template='plotly_dark', title='WCSS',
              width=950, color_discrete_sequence=['#03fcb5'])
fig.add_trace(px.scatter(wcss, x='NumberOfClusters', y='WCSS', template='plotly_dark', title='WCSS',
                         width=950, color_discrete_sequence=['#03fcb5'])['data'][0])

In [99]:
kmeans = KMeans(n_clusters=3, max_iter=1000)
kmeans.fit(data_retention_scaled)

In [100]:
data_user['Cluster'] = kmeans.labels_
data_user.head()

Unnamed: 0,CustomerID,Retention,RetentionScaled,Cluster
0,17850.0,301,2.083819,1
1,13047.0,31,-0.595978,0
2,12583.0,2,-0.883808,0
3,13748.0,95,0.039233,2
4,15100.0,329,2.361724,1


In [101]:
tmp = data_user.groupby('Cluster')['Retention'].describe()
tmp

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2922.0,30.039357,24.803359,0.0,8.25,23.0,49.0,90.0
1,634.0,294.37224,45.419375,224.0,256.0,287.0,329.0,373.0
2,816.0,151.533088,38.494508,91.0,116.0,150.0,184.0,222.0


In [102]:
px.scatter(data_user, x='CustomerID', y='Retention', color='Cluster', template='plotly_dark',
           width=950, title='KMeans - Wizualizacja klastrów')

In [103]:
tmp = data_user['Cluster'].value_counts().reset_index()
tmp.columns = ['Cluster', 'Count']
px.bar(tmp, x='Cluster', y='Count', template='plotly_dark', color_discrete_sequence=['#03fcb5'], width=950,
       title='Rozkład częstości klastrów')

### <a name='5'></a> Retencja - DBSCAN

In [104]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.03, min_samples=5)
dbscan.fit(data_retention_scaled)
clusters = dbscan.labels_
data_user['Cluster'] = clusters
data_user.head()

Unnamed: 0,CustomerID,Retention,RetentionScaled,Cluster
0,17850.0,301,2.083819,0
1,13047.0,31,-0.595978,1
2,12583.0,2,-0.883808,1
3,13748.0,95,0.039233,1
4,15100.0,329,2.361724,0


In [105]:
px.scatter(data_user, x='CustomerID', y='Retention', color='Cluster', template='plotly_dark', width=950,
           title='Wizualizacja klastrów')

### <a name='6'></a> Sprzedaż

In [106]:
data_sales = data_uk.groupby('CustomerID')['Sales'].sum().reset_index()
data_sales.head()

Unnamed: 0,CustomerID,Sales
0,12346.0,0.0
1,12747.0,4196.01
2,12748.0,29072.1
3,12749.0,3868.2
4,12820.0,942.34


In [107]:
data_user = pd.merge(data_user, data_sales, on='CustomerID')
data_user.head()

Unnamed: 0,CustomerID,Retention,RetentionScaled,Cluster,Sales
0,17850.0,301,2.083819,0,5288.63
1,13047.0,31,-0.595978,1,3079.1
2,13748.0,95,0.039233,1,948.25
3,15100.0,329,2.361724,0,635.1
4,15291.0,25,-0.655529,1,4596.51


In [108]:
scaler = StandardScaler()
data_user['SalesScaled'] = scaler.fit_transform(data_user[['Sales']])
data_user.head()

Unnamed: 0,CustomerID,Retention,RetentionScaled,Cluster,Sales,SalesScaled
0,17850.0,301,2.083819,0,5288.63,0.546024
1,13047.0,31,-0.595978,1,3079.1,0.208577
2,13748.0,95,0.039233,1,948.25,-0.116854
3,15100.0,329,2.361724,0,635.1,-0.16468
4,15291.0,25,-0.655529,1,4596.51,0.440321


In [109]:
px.scatter(data_user, x='CustomerID', y='Sales', template='plotly_dark', width=950,
           color_discrete_sequence=['#03fcb5'], title='Sprzedaż w rozbiciu na klienta')

In [110]:
px.scatter(data_user, x='CustomerID', y='SalesScaled', template='plotly_dark', width=950,
           color_discrete_sequence=['#03fcb5'], title='Sprzedaż w rozbiciu na klienta - dane przeskalowane')

In [111]:
data_sales_scaled = data_user[['SalesScaled']]
data_sales_scaled.head()

Unnamed: 0,SalesScaled
0,0.546024
1,0.208577
2,-0.116854
3,-0.16468
4,0.440321


### <a name='7'></a> Sprzedaż - KMeans

In [112]:
wcss = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters=i, max_iter=1000)
    kmeans.fit(data_sales_scaled)
    wcss.append(kmeans.inertia_)

wcss = pd.DataFrame(data=np.c_[range(1, 10), wcss], columns=['NumberOfClusters', 'WCSS'])
wcss

Unnamed: 0,NumberOfClusters,WCSS
0,1.0,3950.0
1,2.0,1848.825546
2,3.0,595.548482
3,4.0,354.491493
4,5.0,308.231389
5,6.0,167.790791
6,7.0,105.250718
7,8.0,75.916969
8,9.0,53.589411


In [113]:
fig = px.line(wcss, x='NumberOfClusters', y='WCSS', template='plotly_dark', color_discrete_sequence=['#03fcb5'],
              width=950, title='WCSS')
fig.add_trace(px.scatter(wcss, x='NumberOfClusters', y='WCSS', color_discrete_sequence=['#03fcb5'])['data'][0])

In [114]:
kmeans = KMeans(n_clusters=3, max_iter=1000)
kmeans.fit(data_sales_scaled)

In [115]:
data_user['Cluster'] = kmeans.labels_
data_user['Cluster'] = data_user['Cluster'].astype(str)
data_user.head()

Unnamed: 0,CustomerID,Retention,RetentionScaled,Cluster,Sales,SalesScaled
0,17850.0,301,2.083819,0,5288.63,0.546024
1,13047.0,31,-0.595978,0,3079.1,0.208577
2,13748.0,95,0.039233,0,948.25,-0.116854
3,15100.0,329,2.361724,0,635.1,-0.16468
4,15291.0,25,-0.655529,0,4596.51,0.440321


In [116]:
kmeans.cluster_centers_

array([[-0.06065062],
       [33.63689221],
       [ 6.31619638]])

In [118]:
px.scatter(data_user, x='CustomerID', y='SalesScaled', color='Cluster', template='plotly_dark', width=950,
           title='Wizualizacja klastrów - dane przeskalowane')

### <a name='8'></a> Sprzedaż - DBSCAN

In [119]:
dbscan = DBSCAN(eps=0.03, min_samples=7)
dbscan.fit(data_sales_scaled)
clusters = dbscan.labels_
data_user['Cluster'] = clusters
data_user['Cluster'] = data_user['Cluster'].astype(str)
data_user.head()

Unnamed: 0,CustomerID,Retention,RetentionScaled,Cluster,Sales,SalesScaled
0,17850.0,301,2.083819,0,5288.63,0.546024
1,13047.0,31,-0.595978,0,3079.1,0.208577
2,13748.0,95,0.039233,0,948.25,-0.116854
3,15100.0,329,2.361724,0,635.1,-0.16468
4,15291.0,25,-0.655529,0,4596.51,0.440321


In [120]:
px.scatter(data_user, x='CustomerID', y='Sales', color='Cluster', template='plotly_dark', width=950,
           title='DBSCAN - Wizualizacja klastrów')

In [121]:
px.scatter(data_user, x='CustomerID', y='SalesScaled', color='Cluster', template='plotly_dark', width=950,
           title='DBSCAN - Wizualizacja klastrów - dane przeskalowane')

### <a name='9'></a> Retencja, sprzedaż - KMeans

In [122]:
data_user.head()

Unnamed: 0,CustomerID,Retention,RetentionScaled,Cluster,Sales,SalesScaled
0,17850.0,301,2.083819,0,5288.63,0.546024
1,13047.0,31,-0.595978,0,3079.1,0.208577
2,13748.0,95,0.039233,0,948.25,-0.116854
3,15100.0,329,2.361724,0,635.1,-0.16468
4,15291.0,25,-0.655529,0,4596.51,0.440321


In [None]:
px.scatter(data_user, x='RetentionScaled', y='SalesScaled', template='plotly_dark', width=950,
           title='Retencji vs. Sprzedaż')

In [123]:
data_scaled = data_user[['RetentionScaled', 'SalesScaled']]
data_scaled.head()

Unnamed: 0,RetentionScaled,SalesScaled
0,2.083819,0.546024
1,-0.595978,0.208577
2,0.039233,-0.116854
3,2.361724,-0.16468
4,-0.655529,0.440321


In [137]:
from sklearn.cluster import KMeans

wcss = []
for i in range(1, 10):
    kmeans = KMeans(n_clusters=i, max_iter=1000)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)

wcss = pd.DataFrame(data=np.c_[range(1, 10), wcss], columns=['NumberOfClusters', 'WCSS'])
wcss

Unnamed: 0,NumberOfClusters,WCSS
0,1.0,7858.090877
1,2.0,4705.580268
2,3.0,2449.809912
3,4.0,1374.46298
4,5.0,949.120087
5,6.0,728.732034
6,7.0,565.270689
7,8.0,480.611023
8,9.0,425.291896


In [136]:
px.line(wcss, x='NumberOfClusters', y='WCSS', template='plotly_dark', color_discrete_sequence=['#03fcb5'], width=950,
        title='WCSS')

In [138]:
kmeans = KMeans(n_clusters=5, max_iter=1000)
kmeans.fit(data_scaled)

In [139]:
data_user['Cluster'] = kmeans.labels_
data_user['Cluster'] = data_user['Cluster'].astype(str)
data_user.head()

Unnamed: 0,CustomerID,Retention,RetentionScaled,Cluster,Sales,SalesScaled
0,17850.0,301,2.083819,3,5288.63,0.546024
1,13047.0,31,-0.595978,1,3079.1,0.208577
2,13748.0,95,0.039233,0,948.25,-0.116854
3,15100.0,329,2.361724,3,635.1,-0.16468
4,15291.0,25,-0.655529,1,4596.51,0.440321


In [140]:
px.scatter(data_user, x = 'RetentionScaled', y='SalesScaled', color='Cluster', template='plotly_dark', width=950,
           title='KMeans - Wizualizacja klastrów')

In [142]:
centroids = kmeans.cluster_centers_
centroids

array([[ 6.46991221e-01, -1.63469353e-01],
       [-5.96053258e-01, -1.29918819e-03],
       [-8.68920299e-01,  3.36368922e+01],
       [ 2.03228763e+00, -2.08834653e-01],
       [-8.46312960e-01,  6.31619638e+00]])

In [146]:
fig = px.scatter(data_user, x='RetentionScaled', y='SalesScaled', color='Cluster', template='plotly_dark', width=900,
                 title = 'KMeans - Wizualizacja klastrów + centroidy')
fig.add_trace(go.Scatter(x = centroids[:, 0], y = centroids[:, 1], mode = 'markers', marker_symbol='star',
                         marker_size=10, marker_color='white', showlegend=False))

In [147]:
desc = data_user.groupby('Cluster')[['Retention','Sales']].describe()
desc

Unnamed: 0_level_0,Retention,Retention,Retention,Retention,Retention,Retention,Retention,Retention,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,732.0,155.76776,38.350185,92.0,121.0,155.0,188.0,234.0,732.0,646.074236,1083.359206,-4287.63,197.73,380.675,750.485,21535.9
1,2638.0,30.968537,25.384629,0.0,9.0,24.0,50.0,99.0,2638.0,1705.069857,2327.266318,-1165.3,396.59,910.36,2046.2325,21086.3
2,2.0,3.5,4.949747,0.0,1.75,3.5,5.25,7.0,2.0,221960.33,48759.481478,187482.17,204721.25,221960.33,239199.41,256438.49
3,551.0,295.30127,44.905476,226.0,258.5,287.0,329.0,373.0,551.0,345.099982,543.593967,-1192.2,137.54,248.42,387.43,7741.47
4,27.0,5.777778,9.082245,0.0,0.0,2.0,6.5,38.0,27.0,43070.445185,15939.249588,25748.35,28865.49,36351.42,53489.79,88125.38


In [148]:
tmp = pd.merge(desc['Retention'][['count','mean']].reset_index(), desc['Sales'][['mean']].reset_index(), on='Cluster',
               suffixes=('_Retention', '_Sales'))
tmp

Unnamed: 0,Cluster,count,mean_Retention,mean_Sales
0,0,732.0,155.76776,646.074236
1,1,2638.0,30.968537,1705.069857
2,2,2.0,3.5,221960.33
3,3,551.0,295.30127,345.099982
4,4,27.0,5.777778,43070.445185


In [149]:
px.bar(tmp, x='count', y='Cluster', hover_data=['mean_Retention', 'mean_Sales'], template='plotly_dark',
       width=950, orientation='h', title='Rozkład klastrów')