In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

#### Импортируем датасет 

In [None]:
# Load the raw dataset. low_memory=False makes pandas parse the file in a single
# pass rather than in chunks, which avoids mixed-dtype columns.
df = pd.read_csv('file.csv', low_memory=False)
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Missing-value report: absolute count and share of nulls per column,
# both sorted from the most to the least affected column.
null_mask = df.isnull()
null_counts = null_mask.sum()
Total = null_counts.sort_values(ascending=False)
# count() on the boolean mask is the number of rows, so this is a fraction in [0, 1].
Percent_null = (null_counts / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat({'Total': Total, 'Percent_null': Percent_null}, axis=1)
missing_data.head(15)

#### Удаление выбросов методом z-оценки

In [None]:
# Summary statistics of the numeric columns, used to eyeball value ranges
# before filtering outliers.
df.describe()

In [None]:
# Remove outliers with the z-score rule: a row is kept only if, for every
# checked column, its value lies within 3 standard deviations of the mean.
# No copy is made here; the first .loc filter below already yields a new frame.
df_no_outlier=df
for col in ('col1','col2','col3'):
  # Statistics are recomputed on the rows that survived the previous columns.
  m=df_no_outlier[col].mean()
  s=df_no_outlier[col].std()
  # Score the same frame that is being filtered (not the raw `df`).
  z=(df_no_outlier[col]-m)/s
  # Compare the absolute z-score so unusually low values are rejected as well
  # as unusually high ones. Rows whose z-score is NaN fail the test and are dropped.
  df_no_outlier=df_no_outlier.loc[z.abs()<3]

#### Анализ корреляций

In [None]:
# Pairwise correlations between the numeric (and boolean) columns.
# Non-numeric columns are excluded explicitly: this cell runs before the
# one-hot encoding step, and pandas 2.x raises on non-numeric data in corr()
# instead of silently skipping it.
corrmat = df.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above; the trailing semicolon hides the Axes repr.
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, ax=ax);

Корреляции между показателями, не превышающие 0.75 по модулю, не считаются высокими.

In [None]:
# Remove the columns excluded after the correlation analysis from both frames.
# The `columns=` keyword is used because the positional axis argument
# (`drop(col, 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0.
dropped_cols=['col3','col4']
df=df.drop(columns=dropped_cols)
df_no_outlier=df_no_outlier.drop(columns=dropped_cols)

#### Обработка категориальных данных

In [None]:
# One-hot encode the categorical columns of both frames.
# The two frames are encoded one after the other so that each original frame
# can be released as soon as its encoded replacement is bound.
df = pd.get_dummies(df)
df_no_outlier=pd.get_dummies(df_no_outlier)

#### Нормализация данных

In [None]:
# The identifier is not a feature, so it is left out of the matrices that are
# normalised and clustered. `columns=` replaces the positional axis argument,
# which was removed in pandas 2.0.
df_normalized=df.drop(columns='user_id')
df_no_outlier_normalized=df_no_outlier.drop(columns='user_id')

In [None]:
# Rescale with norm='max'. The results are NumPy arrays, so column names and
# the DataFrame index are no longer attached after this cell.
# NOTE(review): with its default axis, normalize() scales each sample (row) by
# that row's largest absolute value, not each feature (column) — confirm this
# is intended rather than a per-column scaler.
df_normalized=preprocessing.normalize(df_normalized, norm='max')
df_no_outlier_normalized=preprocessing.normalize(df_no_outlier_normalized, norm='max')

#### Кластеризация

#### 1. K-средних

#### Определим оптимальное количество кластеров с помощью графика суммы квадратов расстояний точек к ближайшему центру кластера.

In [None]:
# Elbow method: fit K-means for every cluster count from 1 to n and record the
# inertia (sum of squared distances of samples to their closest centre).
n=15
sse={}
for k in range(1,n+1):
  kmeans=KMeans(n_clusters=k,max_iter=1000,random_state=0).fit(df_no_outlier_normalized)
  sse[k]=kmeans.inertia_

# Inertia against the number of clusters; the bend of the curve suggests k.
plt.figure()
plt.plot(list(sse),list(sse.values()))
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()


#### Оптимальное количество кластеров: 4 (согласно методу локтя).

#### При попытке увеличить число кластеров до 100 метод не отрабатывает (происходит переполнение RAM).

#### Рассчитаем коэффициент силуэта для 4 кластеров.

In [None]:
# Final K-means model with the 4 clusters chosen from the elbow plot.
# random_state is fixed so that the cluster ids are reproducible between runs.
kmeans=KMeans(n_clusters=4,max_iter=1000,random_state=0).fit(df_no_outlier_normalized)

In [None]:
# Silhouette score of the K-means labelling, evaluated on 5 contiguous chunks
# of n rows that start 1,000,000 rows apart (presumably because scoring the
# whole dataset at once is not feasible).
# NOTE(review): the offsets assume the data has at least 4,010,000 rows.
n=10000
s={}
for k,i in enumerate(range(0,5*1000000,1000000)):
  small=df_no_outlier_normalized[i:i+n]
  small_labels=kmeans.labels_[i:i+n]
  s[k]=silhouette_score(small,small_labels,metric='euclidean')

# One point per chunk: chunk number against its silhouette score.
plt.figure()
plt.plot(list(s),list(s.values()))
plt.xlabel("Number of iteration")
plt.ylabel("Silhouette")
plt.show()

In [None]:
# `score` is first assigned in the Ward section further down, so printing it
# here raised NameError on a clean top-to-bottom run. Report the mean of the
# per-chunk silhouette values collected in `s` by the previous cell instead.
print("Silhouette Coefficient:", np.mean(list(s.values())))

#### 2. DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics

In [None]:
# Work on the first n rows only; DBSCAN is run on this subsample below.
n=10000
small=df_no_outlier_normalized[0:n]

Метод не отрабатывает для выборки большого объема (100 тыс. записей и более).

In [None]:
# Compute DBSCAN on the subsample
db = DBSCAN(eps=0.5, min_samples=10)
db.fit(small)
labels = db.labels_

# Noise points carry the label -1 and are not counted as a cluster.
unique_labels = set(labels)
n_noise_ = int((labels == -1).sum())
n_clusters_ = len(unique_labels) - int(-1 in unique_labels)

print('Estimated number of clusters: {:d}'.format(n_clusters_))
print('Estimated number of noise points: {:d}'.format(n_noise_))

In [None]:
# silhouette_score needs at least two distinct labels and raises ValueError
# otherwise. That happens when DBSCAN puts every point into a single cluster
# or marks everything as noise, so guard the call instead of crashing.
# Noise points (label -1) are scored as a group of their own, as before.
if len(set(labels)) > 1:
  print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(small, labels))
else:
  print("Silhouette Coefficient: undefined (fewer than 2 distinct labels)")

#### 3. Ward

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering with Ward linkage; the model is only configured
# here and fitted in a later cell.
n_clusters = 4
ward = AgglomerativeClustering(linkage='ward', n_clusters=n_clusters)

Метод не отрабатывает для выборки большого объема (100 тыс. записей и более).

In [None]:
# Work on the first n rows only; Ward clustering is fitted on this subsample.
n=10000
small=df_no_outlier_normalized[0:n]

In [None]:
# Fit the Ward model on the subsample; the cluster ids end up in ward.labels_.
ward.fit(small)

In [None]:
# Silhouette of the Ward labelling on the same subsample it was fitted on.
score = silhouette_score(small, ward.labels_, metric='euclidean')
print("Silhouette Coefficient: {:0.3f}".format(score))

#### 4. Birch

In [None]:
from sklearn.cluster import Birch
# n_clusters=None skips Birch's final global clustering step, so the number of
# clusters is determined by `threshold` and `branching_factor` rather than set
# up front.
brc = Birch(branching_factor=50, n_clusters=None, threshold=0.53)

In [None]:
# Fit Birch on the full outlier-free, normalised dataset (no subsampling here).
brc.fit(df_no_outlier_normalized)

In [None]:
# Cluster id assigned by Birch to every row. Despite the name, `labels_true`
# holds predicted labels, not ground truth.
labels_true = brc.predict(df_no_outlier_normalized)
# Show the distinct cluster ids that Birch produced.
print(set(labels_true))

In [None]:
# Silhouette score of the Birch labelling, evaluated on 5 contiguous chunks of
# n rows that start 1,000,000 rows apart (same scheme as for K-means).
# NOTE(review): the offsets assume the data has at least 4,010,000 rows.
n=10000
s={}
for k,i in enumerate(range(0,5*1000000,1000000)):
  small=df_no_outlier_normalized[i:i+n]
  small_labels=labels_true[i:i+n]
  s[k]=silhouette_score(small,small_labels,metric='euclidean')

# One point per chunk: chunk number against its silhouette score.
plt.figure()
plt.plot(list(s),list(s.values()))
plt.xlabel("Number of iteration")
plt.ylabel("Silhouette")
plt.show()

#### Результаты кластеризации

In [None]:
# Attach the K-means cluster id of every row as a new column. The assignment is
# positional: kmeans.labels_ follows the row order of df_no_outlier_normalized,
# which was built from df_no_outlier.
df_no_outlier["kmeans"]=kmeans.labels_
df_no_outlier.head()

In [None]:
# Per-cluster feature means for interpreting the K-means clusters. The user
# identifier is dropped first because its mean carries no meaning.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
df_mean_kmeans = df_no_outlier.drop(columns="user_id").groupby("kmeans").mean()
df_mean_kmeans

In [None]:
# Number of rows in each K-means cluster, aligned to the summary table by
# cluster id.
df_mean_kmeans["cluster_size"] = df_no_outlier.groupby("kmeans").size()
df_mean_kmeans

In [None]:
# One colour per K-means cluster id; the position in the list is the cluster id.
colors = ['#DF2020', '#81DF20', '#2095DF','#fc9d03']
cluster_to_color = dict(enumerate(colors))
df_no_outlier['col_kmeans'] = df_no_outlier['kmeans'].map(cluster_to_color)

In [None]:
# Scatter of card_views against sum_orders_s, each point coloured by its
# K-means cluster (colour column built in the previous cell).
plt.scatter(df_no_outlier.card_views, df_no_outlier.sum_orders_s,c=df_no_outlier.col_kmeans, alpha = 0.6, s=10)

In [None]:
# Bar chart of the mean `card_views` value in each K-means cluster.
labels = ['0','1','2','3']
card_views_mean = df_mean_kmeans["card_views"]

width = 0.5  # bar width
x = np.arange(len(labels))  # one bar position per cluster

fig, ax = plt.subplots()
rects1 = ax.bar(x, card_views_mean, width=width, label='card_views')

# Title, tick labels and legend; no y-axis label is set for this chart.
ax.set_title('Kmeans - card views by clusters')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Print each bar's value just above it.
ax.bar_label(rects1, padding=3)

fig.tight_layout()
plt.show()

In [None]:
# Share of rows that falls into each K-means cluster.
# The names are matched to clusters by position, i.e. in cluster-id order.
labels = ["active users", "no orders", "one-time", "loyal thrifty"]
vals = df_mean_kmeans["cluster_size"]

fig, ax = plt.subplots()
ax.pie(vals, labels=labels, autopct='%1.1f%%')
ax.axis("equal")  # keep the pie circular
plt.show()

In [None]:
# Attach the Birch cluster id of every row. The ids were already predicted for
# exactly this matrix and stored in `labels_true`, so they are reused here
# instead of running predict() over the full dataset a second time.
df_no_outlier["birch"]=labels_true

In [None]:
# Per-cluster feature means for interpreting the Birch clusters.
# `columns=` replaces the positional axis argument removed in pandas 2.0.
# numeric_only=True is needed because the frame now also holds the string
# colour column `col_kmeans`, on which mean() raises in pandas 2.x.
df_mean_birch = (
  df_no_outlier
  .drop(columns="user_id")
  .groupby("birch")
  .mean(numeric_only=True)
)
df_mean_birch

In [None]:
# Number of rows in each Birch cluster, aligned to the summary table by
# cluster id.
df_mean_birch["size"] = df_no_outlier.groupby("birch").size()
df_mean_birch

In [None]:
# Bar chart of the mean `count_orders_s` value in each Birch cluster.
cnt_ord = df_mean_birch["count_orders_s"]
# Birch was configured without a fixed cluster count, so the tick labels are
# taken from the cluster ids actually present instead of a hard-coded list
# that breaks the plot when the number of clusters differs.
labels = [str(cluster_id) for cluster_id in cnt_ord.index]

x = np.arange(len(labels))  # the label locations
width = 0.5  # the width of the bars

fig, ax = plt.subplots()
# The bars get a label so that ax.legend() below has an entry to show.
rects1 = ax.bar(x, cnt_ord, width, label='count_orders_s')

# Axis label, title and custom x-axis tick labels.
ax.set_ylabel('orders_s_count')
ax.set_title('Birch - orders_s count')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

# Print each bar's value just above it.
ax.bar_label(rects1, padding=3)

fig.tight_layout()

plt.show()

In [None]:
# Share of rows that falls into each Birch cluster.
# The names are matched to clusters by position, i.e. in cluster-id order.
# NOTE(review): six names are hard-coded; confirm Birch yields six clusters.
labels = ["one-time", "active", "loyal thrifty", "would-be clients", "one-time transition mobile", "one-time transition tablet"]
vals = df_mean_birch["size"]

fig, ax = plt.subplots()
ax.pie(vals, labels=labels, autopct='%1.1f%%')
ax.axis("equal")  # keep the pie circular
plt.show()

Код ниже нужен для записи результатов из Google Colab на Google Drive

In [None]:
# Colab-only: mount Google Drive under the relative path 'drive' so the
# results can be copied there. This import is not available outside Colab.
from google.colab import drive
drive.mount('drive')

In [None]:
# Write the clustered frame (including the added cluster-label columns) to a
# local CSV; the DataFrame index is written as the first column.
df_no_outlier.to_csv('out.csv')
# Copy the file into the mounted Google Drive folder (IPython shell escape).
!cp out.csv "drive/My Drive/"