<a href="https://colab.research.google.com/github/kurek0010/machine-learing-bootcamp/blob/main/unsupervised/05_case_studies/01_customer_segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### scikit-learn
Strona biblioteki: [https://scikit-learn.org](https://scikit-learn.org)  

Dokumentacja/User Guide: [https://scikit-learn.org/stable/user_guide.html](https://scikit-learn.org/stable/user_guide.html)

Podstawowa biblioteka do uczenia maszynowego w języku Python.

Aby zainstalować bibliotekę scikit-learn, użyj polecenia poniżej:
```
!pip install scikit-learn
```
Aby zaktualizować bibliotekę scikit-learn do najnowszej wersji, użyj polecenia poniżej:
```
!pip install --upgrade scikit-learn
```
Kurs stworzony w oparciu o wersję `0.22.1`

### Spis treści:
1. [Import bibliotek](#0)
2. [Załadowanie danych](#1)
3. [Eksploracja danych](#2)
4. [Wyznaczanie retencji klienta](#3)
5. [Retencja - KMeans](#4)
6. [Retencja - DBSCAN](#5)
7. [Sprzedaż](#6)
8. [Sprzedaż - KMeans](#7)
9. [Sprzedaż - DBSCAN](#8)
10. [Retencja, sprzedaż - KMeans](#9)




### <a name='0'></a> Import bibliotek

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

### <a name='1'></a> Załadowanie danych

In [None]:
# Download the Online Retail CSV. `raw_data` keeps the frame exactly as read;
# all further processing happens on the `data` copy.
url = 'https://storage.googleapis.com/esmartdata-courses-files/ml-course/OnlineRetail.csv'
raw_data = pd.read_csv(
    url,
    parse_dates=['InvoiceDate'],
    encoding='latin',
)
data = raw_data.copy()
data.head(n=3)

### <a name='2'></a> Eksploracja danych

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Summary statistics using pandas' default column selection.
data.describe()

In [None]:
# Summary statistics restricted to object-dtype columns.
data.describe(include=['object'])

In [None]:
# Summary statistics restricted to datetime columns.
data.describe(include=['datetime'])

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Drop every row that has at least one missing value, then confirm that
# no missing values remain in any column.
data = data.dropna(axis=0, how='any')
data.isna().sum()

In [None]:
# Number of rows per country.
data['Country'].value_counts()

In [None]:
# Ten most frequent countries, kept only when they have more than 200 rows,
# shown as a bar chart.
tmp = (
    data['Country']
    .value_counts()
    .nlargest(10)
    .rename_axis('Country')
    .reset_index(name='Count')
)
tmp = tmp[tmp['Count'] > 200]
px.bar(
    tmp,
    x='Country',
    y='Count',
    title='Częstotliwość zakupów ze względu na kraj',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
    width=900,
    height=500,
)

In [None]:
# Restrict the data to United Kingdom rows; the copy keeps later column
# assignments on `data_uk` independent of `data`.
data_uk = data.loc[data['Country'] == 'United Kingdom'].copy()
data_uk.head()

In [None]:
# New Sales column: value of each invoice line (quantity times unit price).
data_uk['Sales'] = data_uk['Quantity'].mul(data_uk['UnitPrice'])
data_uk.head()

In [None]:
# Purchase frequency by date: number of non-null CustomerID entries per
# calendar day.
tmp = (
    data_uk
    .groupby(data_uk['InvoiceDate'].dt.date)['CustomerID']
    .count()
    .rename_axis('InvoiceDate')
    .reset_index(name='Count')
)
tmp.head()

In [None]:
from plotly.subplots import make_subplots

# Two stacked panels sharing the x axis: daily counts as a line (top) and
# as a scatter (bottom).
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)

# Only the first trace of each plotly-express figure is reused.
trace1 = px.line(
    tmp,
    x='InvoiceDate',
    y='Count',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
).data[0]
trace2 = px.scatter(
    tmp,
    x='InvoiceDate',
    y='Count',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
).data[0]

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=2, col=1)
fig.update_layout(
    title='Częstotliwość zakupów ze względu na datę',
    template='plotly_dark',
    width=950,
)
fig.show()

In [None]:
# Preview the first rows of the UK subset.
data_uk.head()

In [None]:
# Total sales by date: sum of the Sales column per calendar day.
tmp = (
    data_uk
    .groupby(data_uk['InvoiceDate'].dt.date)['Sales']
    .sum()
    .rename_axis('InvoiceDate')
    .reset_index(name='Sales')
)
tmp.head()

In [None]:
# Two stacked panels sharing the x axis: daily sales as a line (top) and
# as a scatter (bottom).
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, vertical_spacing=0.02)

# Only the first trace of each plotly-express figure is reused.
trace1 = px.line(
    tmp,
    x='InvoiceDate',
    y='Sales',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
).data[0]
trace2 = px.scatter(
    tmp,
    x='InvoiceDate',
    y='Sales',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
).data[0]

fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=2, col=1)
fig.update_layout(
    title='Łączna sprzedaż ze względu na datę',
    template='plotly_dark',
    width=950,
)
fig.show()

### <a name='3'></a> Wyznaczanie retencji klienta

In [None]:
# Unique CustomerID values. They are taken from `data_uk` rather than the
# all-country `data`, because every per-customer feature computed below
# (last purchase date, sales) comes from UK rows only.
data_user = pd.DataFrame(data_uk['CustomerID'].unique(), columns=['CustomerID'])
data_user.head(3)

In [None]:
# Date of the most recent purchase for each customer.
last_purchase = (
    data_uk
    .groupby('CustomerID')['InvoiceDate']
    .max()
    .rename_axis('CustomerID')
    .reset_index(name='LastPurchaseDate')
)
last_purchase.head()

In [None]:
# Latest last-purchase date across all customers.
last_purchase['LastPurchaseDate'].max()

In [None]:
# Earliest last-purchase date across all customers.
last_purchase['LastPurchaseDate'].min()

In [None]:
# Retention = whole days between a customer's last purchase and the latest
# last-purchase date present in the data.
most_recent_purchase = last_purchase['LastPurchaseDate'].max()
days_since_purchase = most_recent_purchase - last_purchase['LastPurchaseDate']
last_purchase['Retention'] = days_since_purchase.dt.days
last_purchase.head()

In [None]:
# How many customers share each retention value.
last_purchase['Retention'].value_counts()

In [None]:
# Distribution of retention values across customers.
px.histogram(
    last_purchase,
    x='Retention',
    nbins=100,
    title='Retention',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
    width=950,
    height=500,
)

In [None]:
# Join the customer list with the per-customer retention.
data_user = pd.merge(data_user, last_purchase, on='CustomerID')
# .copy() turns the column subset into an independent frame, so adding new
# columns to `data_user` later does not raise SettingWithCopyWarning.
data_user = data_user[['CustomerID', 'Retention']].copy()
data_user.head()

In [None]:
# Retention of every customer, plotted against the customer identifier.
px.scatter(
    data_user,
    x='CustomerID',
    y='Retention',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
    width=950,
)

In [None]:
# Single-column frame holding only Retention (2-D input for the scaler below).
data_retention = data_user[['Retention']]
data_retention.head()

In [None]:
# Standardize the retention values and store them next to the raw ones.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(data_retention)
data_user['RetentionScaled'] = scaler.transform(data_retention)
data_user.head()

In [None]:
# Standardized retention of every customer, plotted against the customer
# identifier.
px.scatter(
    data_user,
    x='CustomerID',
    y='RetentionScaled',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
    width=950,
)

In [None]:
# Single-column frame with the standardized retention; this is the feature
# matrix passed to the clustering models below.
data_retention_scaled = data_user[['RetentionScaled']]
data_retention_scaled.head()

### <a name='4'></a> Retencja - KMeans

In [None]:
from sklearn.cluster import KMeans

# Elbow method on standardized retention: fit KMeans for 1..9 clusters and
# record the within-cluster sum of squares (inertia) of each fit.
cluster_counts = list(range(1, 10))
wcss = []
for n_clusters in cluster_counts:
    # random_state makes the curve reproducible between runs; n_init is
    # pinned because its default differs across scikit-learn versions.
    kmeans = KMeans(n_clusters=n_clusters, max_iter=1000, n_init=10, random_state=42)
    kmeans.fit(data_retention_scaled)
    wcss.append(kmeans.inertia_)

# Building the frame from a dict keeps NumberOfClusters as integers
# (stacking with np.c_ would upcast the counts to float).
wcss = pd.DataFrame({'NumberOfClusters': cluster_counts, 'WCSS': wcss})
wcss

In [None]:
# Elbow curve: WCSS as a function of the number of clusters.
px.line(
    wcss,
    x='NumberOfClusters',
    y='WCSS',
    title='WCSS',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
    width=950,
)

In [None]:
# Final retention model with 3 clusters. random_state makes the cluster
# labels reproducible between runs; n_init is pinned because its default
# differs across scikit-learn versions.
kmeans = KMeans(n_clusters=3, max_iter=1000, n_init=10, random_state=42)
kmeans.fit(data_retention_scaled)

In [None]:
# Attach the KMeans label of every customer. The labels are cast to strings
# so that plotly treats Cluster as a category (discrete colours) instead of
# drawing a continuous colour scale over the label numbers.
data_user['Cluster'] = kmeans.labels_
data_user['Cluster'] = data_user['Cluster'].astype(str)
data_user.head()

In [None]:
# Descriptive statistics of Retention within each cluster.
tmp = data_user.groupby('Cluster')['Retention'].describe()
tmp

In [None]:
# Mean retention of each cluster, taken from the describe() table above and
# drawn as a bar chart.
tmp = (
    tmp['mean']
    .rename_axis('Cluster')
    .reset_index(name='MeanRetention')
)
px.bar(
    tmp,
    x='Cluster',
    y='MeanRetention',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
    width=950,
    height=400,
)

In [None]:
# Retention per customer, coloured by the assigned cluster.
px.scatter(
    data_user,
    x='CustomerID',
    y='Retention',
    color='Cluster',
    title='Wizualizacja klastrów',
    template='plotly_dark',
    width=950,
)

In [None]:
# Number of customers in each cluster, drawn as a bar chart.
tmp = (
    data_user['Cluster']
    .value_counts()
    .rename_axis('Cluster')
    .reset_index(name='Count')
)
px.bar(
    tmp,
    x='Cluster',
    y='Count',
    title='Rozkład częstości klastrów',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
    width=950,
)

### <a name='5'></a> Retencja - DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering of the standardized retention values.
dbscan = DBSCAN(eps=0.03, min_samples=5)
dbscan.fit(data_retention_scaled)
clusters = dbscan.labels_
# The labels are stored as strings so that plotly treats Cluster as a
# category (discrete colours) instead of a continuous colour scale.
data_user['Cluster'] = clusters
data_user['Cluster'] = data_user['Cluster'].astype(str)
data_user.head()

In [None]:
# Retention per customer, coloured by the current Cluster labels.
px.scatter(
    data_user,
    x='CustomerID',
    y='Retention',
    color='Cluster',
    title='Wizualizacja klastrów',
    template='plotly_dark',
    width=950,
)

### <a name='6'></a> Sprzedaż

In [None]:
# Total sales per customer, with CustomerID kept as a regular column.
data_sales = data_uk.groupby('CustomerID', as_index=False)['Sales'].sum()
data_sales.head()

In [None]:
# Add the per-customer sales total to the customer frame.
data_user = data_user.merge(data_sales, on='CustomerID')
data_user.head()

In [None]:
# Standardize the sales totals with a fresh scaler and store them next to
# the raw values.
scaler = StandardScaler()
sales_column = data_user[['Sales']]
scaler.fit(sales_column)
data_user['SalesScaled'] = scaler.transform(sales_column)
data_user.head()

In [None]:
# Total sales of every customer, plotted against the customer identifier.
px.scatter(
    data_user,
    x='CustomerID',
    y='Sales',
    title='Sprzedaż w rozbiciu na klienta',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
)

In [None]:
# Standardized sales of every customer, plotted against the customer
# identifier.
px.scatter(
    data_user,
    x='CustomerID',
    y='SalesScaled',
    title='Sprzedaż w rozbiciu na klienta - dane przeskalowane',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
)

In [None]:
# Single-column frame with the standardized sales; this is the feature
# matrix passed to the clustering models below.
data_sales_scaled = data_user[['SalesScaled']]
data_sales_scaled.head()

### <a name='7'></a> Sprzedaż - KMeans

In [None]:
# Elbow method on standardized sales: fit KMeans for 1..9 clusters and
# record the within-cluster sum of squares (inertia) of each fit.
cluster_counts = list(range(1, 10))
wcss = []
for n_clusters in cluster_counts:
    # random_state makes the curve reproducible between runs; n_init is
    # pinned because its default differs across scikit-learn versions.
    kmeans = KMeans(n_clusters=n_clusters, max_iter=1000, n_init=10, random_state=42)
    kmeans.fit(data_sales_scaled)
    wcss.append(kmeans.inertia_)

# Building the frame from a dict keeps NumberOfClusters as integers
# (stacking with np.c_ would upcast the counts to float).
wcss = pd.DataFrame({'NumberOfClusters': cluster_counts, 'WCSS': wcss})
wcss

In [None]:
# Elbow curve: WCSS as a function of the number of clusters.
px.line(
    wcss,
    x='NumberOfClusters',
    y='WCSS',
    title='WCSS',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
    width=950,
)

In [None]:
# Final sales model with 3 clusters. random_state makes the cluster labels
# reproducible between runs; n_init is pinned because its default differs
# across scikit-learn versions.
kmeans = KMeans(n_clusters=3, max_iter=1000, n_init=10, random_state=42)
kmeans.fit(data_sales_scaled)

In [None]:
# Attach the KMeans label of every customer as a string, so the plots treat
# Cluster as a category.
cluster_labels = pd.Series(kmeans.labels_, index=data_user.index)
data_user['Cluster'] = cluster_labels.astype(str)
data_user.head()

In [None]:
# Cluster centres of the fitted model (in the scaled feature space it was
# fitted on).
kmeans.cluster_centers_

In [None]:
# Standardized sales per customer, coloured by the assigned cluster.
px.scatter(
    data_user,
    x='CustomerID',
    y='SalesScaled',
    color='Cluster',
    title='Wizualizacja klastrów - dane przeskalowane',
    template='plotly_dark',
    width=950,
)

### <a name='8'></a> Sprzedaż - DBSCAN

In [None]:
# Density-based clustering of the standardized sales; labels are stored as
# strings so the plots treat Cluster as a category.
dbscan = DBSCAN(min_samples=7, eps=0.5).fit(data_sales_scaled)
clusters = dbscan.labels_
data_user['Cluster'] = pd.Series(clusters, index=data_user.index).astype(str)
data_user.head()

In [None]:
# Total sales per customer, coloured by the assigned cluster.
px.scatter(
    data_user,
    x='CustomerID',
    y='Sales',
    color='Cluster',
    title='DBSCAN - Wizualizacja klastrów',
    template='plotly_dark',
    width=950,
)

### <a name='9'></a> Retencja, sprzedaż - KMeans

In [None]:
# Preview the per-customer frame before the two-feature clustering.
data_user.head()

In [None]:
# Standardized retention against standardized sales, one point per customer.
px.scatter(
    data_user,
    x='RetentionScaled',
    y='SalesScaled',
    title='Retencja vs. Sprzedaż',
    template='plotly_dark',
    width=950,
)

In [None]:
# Two-column feature matrix (standardized retention and sales) for the
# joint clustering below.
data_scaled = data_user[['RetentionScaled', 'SalesScaled']]
data_scaled.head()

In [None]:
# Elbow method on the two standardized features: fit KMeans for 1..9
# clusters and record the within-cluster sum of squares (inertia) of each fit.
cluster_counts = list(range(1, 10))
wcss = []
for n_clusters in cluster_counts:
    # random_state makes the curve reproducible between runs; n_init is
    # pinned because its default differs across scikit-learn versions.
    kmeans = KMeans(n_clusters=n_clusters, max_iter=1000, n_init=10, random_state=42)
    kmeans.fit(data_scaled)
    wcss.append(kmeans.inertia_)

# Building the frame from a dict keeps NumberOfClusters as integers
# (stacking with np.c_ would upcast the counts to float).
wcss = pd.DataFrame({'NumberOfClusters': cluster_counts, 'WCSS': wcss})
wcss

In [None]:
# Elbow curve: WCSS as a function of the number of clusters.
px.line(
    wcss,
    x='NumberOfClusters',
    y='WCSS',
    title='WCSS',
    color_discrete_sequence=['#03fcb5'],
    template='plotly_dark',
    width=950,
)

In [None]:
# Final retention + sales model with 5 clusters. random_state makes the
# cluster labels reproducible between runs; n_init is pinned because its
# default differs across scikit-learn versions.
kmeans = KMeans(n_clusters=5, max_iter=1000, n_init=10, random_state=42)
kmeans.fit(data_scaled)

In [None]:
# Attach the KMeans label of every customer as a string, so the plots treat
# Cluster as a category.
cluster_labels = pd.Series(kmeans.labels_, index=data_user.index)
data_user['Cluster'] = cluster_labels.astype(str)
data_user.head()

In [None]:
# Standardized retention against standardized sales, coloured by cluster.
px.scatter(
    data_user,
    x='RetentionScaled',
    y='SalesScaled',
    color='Cluster',
    title='KMeans - Wizualizacja klastrów',
    template='plotly_dark',
    width=950,
)

In [None]:
# Cluster centres of the fitted model; the plot below uses column 0 as x
# and column 1 as y.
centroids = kmeans.cluster_centers_
centroids

In [None]:
# Cluster scatter with the centroids overlaid as white stars.
fig = px.scatter(
    data_user,
    x='RetentionScaled',
    y='SalesScaled',
    color='Cluster',
    title='KMeans - Wizualizacja klastrów + centroidy',
    template='plotly_dark',
    width=900,
)
centroid_markers = go.Scatter(
    x=centroids[:, 0],
    y=centroids[:, 1],
    mode='markers',
    marker=dict(symbol='star', size=10, color='white'),
    showlegend=False,
)
fig.add_trace(centroid_markers)

In [None]:
# Descriptive statistics of Retention and Sales within each cluster.
desc = data_user.groupby('Cluster')[['Retention', 'Sales']].describe()
desc

In [None]:
# One row per cluster: size and mean retention from the Retention summary,
# mean sales from the Sales summary. Both sides carry a 'mean' column, which
# the suffixes disambiguate.
retention_stats = desc['Retention'][['count', 'mean']].reset_index()
sales_stats = desc['Sales'][['mean']].reset_index()
tmp = retention_stats.merge(
    sales_stats,
    on='Cluster',
    suffixes=('_Retention', '_Sales'),
)
tmp

In [None]:
# Horizontal bars with the size of each cluster; mean retention and mean
# sales are available on hover.
px.bar(
    tmp,
    x='count',
    y='Cluster',
    orientation='h',
    hover_data=['mean_Retention', 'mean_Sales'],
    title='Rozkład klastrów',
    template='plotly_dark',
    width=950,
)