In [1]:
import pandas as pd
import numpy as np

# Load the raw transaction records from the CSV file that sits next to the
# notebook, then display the frame (pandas truncates long frames on render).
dataset = pd.read_csv(filepath_or_buffer='transaction.csv')
dataset

Unnamed: 0,InvoiceNo,StockCode,Qty,InvoiceDate,CustomerID,Country
0,537626,22725,830,12/7/2010 14:57,12347,Iceland
1,537626,22729,948,12/7/2010 14:57,12347,Iceland
2,537626,22195,695,12/7/2010 14:57,12347,Iceland
3,542237,22725,636,1/26/2011 14:30,12347,Iceland
4,542237,22729,536,1/26/2011 14:30,12347,Iceland
...,...,...,...,...,...,...
10541,543911,21700,455,2/14/2011 12:46,17829,United Arab Emirates
10542,543911,22111,578,2/14/2011 12:46,17829,United Arab Emirates
10543,543911,22112,163,2/14/2011 12:46,17829,United Arab Emirates
10544,564428,23296,545,8/25/2011 11:27,17844,Canada


In [2]:
# Distinct country values, in order of first appearance in the data.
country = pd.unique(dataset["Country"])

# Number of transaction rows recorded for each country (shown as the cell output).
dataset["Country"].value_counts()

Germany                 2269
France                  2109
EIRE                    1620
Netherlands              634
Spain                    539
Belgium                  486
Switzerland              434
Portugal                 367
Australia                356
Norway                   239
Italy                    190
Channel Islands          184
Finland                  152
Cyprus                   113
Sweden                   109
Denmark                   98
Japan                     92
Austria                   88
Poland                    80
Israel                    61
USA                       47
Singapore                 45
Unspecified               44
Canada                    36
Iceland                   35
Greece                    33
United Arab Emirates      23
Malta                     15
RSA                       14
Lithuania                  8
Brazil                     8
Lebanon                    5
European Community         5
Czech Republic             4
Bahrain       

In [3]:
# Count the distinct invoices (i.e. transactions) recorded for each country.
#
# One groupby replaces the original per-country loop. That loop stored its
# results with the chained assignment
#     transaksi.iloc[itr]["TransactionCount"] = ...
# which writes to a temporary row object: whether the parent frame is updated
# depends on view-vs-copy internals, and under pandas copy-on-write the write
# is discarded, leaving the column empty.
invoice_counts = dataset.groupby("Country")["InvoiceNo"].nunique()

# Keep the first-appearance ordering held in `country`. Any entry of `country`
# that has no group (for example a missing value) gets 0, which is what the
# equality filter in the original loop would have yielded. The count column is
# now a proper integer column instead of object dtype.
transaksi = pd.DataFrame({
    "Country": country,
    "TransactionCount": invoice_counts.reindex(country, fill_value=0).to_numpy(),
})
transaksi

Unnamed: 0,Country,TransactionCount
0,Iceland,6
1,Finland,26
2,Italy,31
3,Norway,28
4,Spain,72
5,Bahrain,1
6,Portugal,43
7,Switzerland,41
8,Austria,12
9,Cyprus,16


In [11]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import NearestCentroid

# Single clustering feature: the per-country transaction count.
X = transaksi[["TransactionCount"]]

# Average-linkage agglomerative clustering into three groups.
# Euclidean distance is the estimator's default, so it is not passed
# explicitly: the `affinity` keyword used previously is the deprecated name of
# `metric` and raises a TypeError on scikit-learn releases where it has been
# removed, while `metric` is unknown to older releases. Relying on the default
# works on both.
cluster = AgglomerativeClustering(n_clusters=3, linkage="average")
ypred = cluster.fit_predict(X)

# Fit a NearestCentroid model on the predicted labels purely to obtain one
# centroid per cluster; row i of `centroids_` belongs to `nCtr.classes_[i]`.
nCtr = NearestCentroid()
nCtr.fit(X, ypred)

centroids = nCtr.centroids_
centroids

array([[ 18.78787879],
       [360.5       ],
       [224.        ]])

In [5]:
# Put the cluster centroids in a one-column table, ordered from the smallest
# to the largest centroid value (original row labels are preserved).
ctr = pd.DataFrame(centroids, columns=["X"])
ctr = ctr.sort_values(by="X")
ctr

Unnamed: 0,X
0,18.787879
2,224.0
1,360.5


In [6]:
import altair as alt

# Name each cluster by the rank of its centroid instead of hard-coding
# {0: "Low", 1: "High", 2: "Medium"}. The integer ids produced by the
# clustering step are arbitrary, so a mapping copied by hand from one run's
# centroids can silently mislabel clusters when the data or library changes.
centroid_order = np.argsort(centroids.ravel())  # positions, smallest centroid first
cluster_names = {
    int(cluster_id): name
    for cluster_id, name in zip(nCtr.classes_[centroid_order], ["Low", "Medium", "High"])
}

# `label` keeps its original shape: a one-column frame named "Cluster".
label = pd.DataFrame({"Cluster": pd.Series(ypred).map(cluster_names)})

# ypred is positionally aligned with the rows of `transaksi` (it was predicted
# from them), so assign by position rather than by index label.
transaksi['Class'] = label["Cluster"].to_numpy()

# Scatter of transaction count per country, colored by cluster name.
alt.Chart(transaksi).mark_point().encode(
    x="Country", y="TransactionCount",
    color=alt.Color("Class",
        legend=alt.Legend(title="Transaction Cluster")
    )
)