In [42]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('transaction.csv')
dataset

Unnamed: 0,InvoiceNo,StockCode,Qty,InvoiceDate,CustomerID,Country
0,537626,22725,830,12/7/2010 14:57,12347,Iceland
1,537626,22729,948,12/7/2010 14:57,12347,Iceland
2,537626,22195,695,12/7/2010 14:57,12347,Iceland
3,542237,22725,636,1/26/2011 14:30,12347,Iceland
4,542237,22729,536,1/26/2011 14:30,12347,Iceland
...,...,...,...,...,...,...
10541,543911,21700,455,2/14/2011 12:46,17829,United Arab Emirates
10542,543911,22111,578,2/14/2011 12:46,17829,United Arab Emirates
10543,543911,22112,163,2/14/2011 12:46,17829,United Arab Emirates
10544,564428,23296,545,8/25/2011 11:27,17844,Canada


In [43]:
country = dataset["Country"].unique()
dataset["Country"].value_counts()

Germany                 2269
France                  2109
EIRE                    1620
Netherlands              634
Spain                    539
Belgium                  486
Switzerland              434
Portugal                 367
Australia                356
Norway                   239
Italy                    190
Channel Islands          184
Finland                  152
Cyprus                   113
Sweden                   109
Denmark                   98
Japan                     92
Austria                   88
Poland                    80
Israel                    61
USA                       47
Singapore                 45
Unspecified               44
Canada                    36
Iceland                   35
Greece                    33
United Arab Emirates      23
Malta                     15
RSA                       14
Lithuania                  8
Brazil                     8
Lebanon                    5
European Community         5
Czech Republic             4
Bahrain       

In [44]:
transaksi = dataset.groupby(["Country","InvoiceNo"]).mean().reset_index()[["Country","InvoiceNo","Qty"]]
transaksi

Unnamed: 0,Country,InvoiceNo,Qty
0,Australia,536389,560.000000
1,Australia,537676,492.000000
2,Australia,539419,413.333333
3,Australia,540267,559.357143
4,Australia,540700,440.250000
...,...,...,...
1560,Unspecified,559929,385.666667
1561,Unspecified,561658,399.533333
1562,Unspecified,561661,658.000000
1563,Unspecified,563947,213.000000


In [45]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist

X = transaksi[["Qty"]]

cluster_i = []
cluster_val = []

for i in range(1,10+1):
    ypred = KMeans(n_clusters=3, n_init=1, max_iter=100).fit(X)
    cluster_i.append(ypred.labels_)
    cluster_val.append(ypred.inertia_)

print(cluster_i)
print(cluster_val)

[array([2, 2, 2, ..., 0, 1, 2], dtype=int32), array([2, 2, 1, ..., 0, 1, 2], dtype=int32), array([2, 2, 0, ..., 1, 0, 2], dtype=int32), array([0, 0, 0, ..., 1, 2, 0], dtype=int32), array([0, 0, 0, ..., 1, 2, 0], dtype=int32), array([1, 1, 1, ..., 0, 2, 1], dtype=int32), array([1, 1, 1, ..., 0, 2, 1], dtype=int32), array([0, 0, 0, ..., 1, 2, 0], dtype=int32), array([2, 2, 2, ..., 1, 0, 2], dtype=int32), array([0, 0, 0, ..., 1, 2, 0], dtype=int32)]
[8239757.514159832, 8234151.316930465, 8234151.316930465, 8240706.488261077, 8240706.488261077, 8239757.514159831, 8239757.514159831, 8239757.514159832, 8239757.514159832, 8240706.488261077]


In [46]:
idx = np.where(cluster_val == np.amin(cluster_val))[0][0]
cluster = cluster_i[idx]
cluster

array([2, 2, 1, ..., 0, 1, 2], dtype=int32)

In [47]:
from sklearn.neighbors import NearestCentroid

X = transaksi[["Qty"]]

nCtr = NearestCentroid()
nCtr.fit(X, ypred.labels_)

centroids = nCtr.centroids_
centroids

array([[520.30158399],
       [732.84470888],
       [300.76256669]])

In [48]:
ctr = pd.DataFrame(columns=["X"], data=centroids)
ctr.sort_values(by=["X"], inplace=True)
ctr

Unnamed: 0,X
2,300.762567
0,520.301584
1,732.844709


In [49]:
# print("Negara dengan transaksi \"Tingi\"")
# print(country[np.where(ypred.labels_ == 2)])
# transaksi.iloc[np.where(ypred.labels_ == 2)]["InvoiceNo"]

# print("Negara dengan transaksi \"Sedang\"")
# print(country[np.where(ypred.labels_ == 1)])

# print("Negara dengan transaksi \"Rendah\"")
# print(country[np.where(ypred.labels_ == 0)])

In [50]:
import altair as alt

label = pd.DataFrame(columns=["Cluster"], data=ypred.labels_)
label = label.replace({0: "Sedang",1: "Tinggi",2: "Rendah"})
transaksi['Class'] = label

alt.Chart(transaksi).mark_point().encode(
    x="Country", y="Qty",
    color=alt.Color("Class",
        legend=alt.Legend(title="Average Transaction Item")
    )
)