In [1]:
# import libs
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
import datetime
from pyspark.ml.feature import PCA
from pyspark.sql import Row
import csv
import MyKmeans

In [2]:
# Load the dataset from CSV. The first line of the file is treated as the
# header, and records the parser considers malformed are dropped.
# NOTE(review): no schema / inferSchema is given, so columns presumably load
# as strings -- confirm MyKmeans performs the numeric conversion it needs.
df = (
    spark.read
    .options(header=True, mode="DROPMALFORMED")
    .csv("data/cus-prod-type-totalqty.csv")
)

# Preview a single row as a quick sanity check of the parsed columns.
df.show(1)

+---------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|customer_number| AA| AB| AC| AD| AE| AF| AG| AH| AI| AJ| AK| AL| AM| AN| AO| AP| AQ| AR| AS| AT| AU| AV| AW| AX| AY| AZ| BA| BB| BC| BD| BE| BF| BG| BH| BI| BJ| BK| BL| BM| BN| BO| BP| BQ| BR| BS| BT| BU| BV| BW| BX| BY| BZ| CA| CC| CD| CE| CF| CG| CH| CI| CJ| CK| CL| CM| CN|
+---------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|          10000|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

In [3]:
# Sweep the cluster count k with the custom MyKmeans implementation (Pearson
# distance) and collect cost, entropy and purity for every k.
result_stats = []

# Reference timestamp for the elapsed-time line printed after each k.
start_time = datetime.datetime.now()

progress_template = 'k = {} ------- Cost: {} ------- Entropy: {} ------- Purity: {}'

for k in range(2, 100):
    # Fit, assign and summarise with a fresh model for this k.
    model = MyKmeans.MyKmeans(K=k, distanceMeasure='pearson', maxIter=50, tol=0.0001)
    centroids = model.train(df)
    assignments = model.predict(df, centroids)
    summary = model.summarize(assignments)
    cost = model.computeCost(df, centroids)

    # Entropy and purity are both computed from the same three summary fields.
    summary_fields = (summary['count'], summary['sum_count'], summary['sum_set'])
    entropy = model.computeEntropy(*summary_fields)
    purity = model.computePurity(*summary_fields)

    result_stats.append({'k': k, 'Cost': cost, 'Entropy': entropy, 'Purity': purity})

    # Progress report: metrics for this k, then total time elapsed so far.
    print(progress_template.format(k, cost, entropy, purity))
    print('{} passed'.format(datetime.datetime.now() - start_time))

k = 2 ------- Cost: 32245.661974708484 ------- Entropy: 3.903965833880489 ------- Purity: 0.04728788993423664
0:00:55.001073 passed
k = 3 ------- Cost: 30755.910572598954 ------- Entropy: 3.754995169367837 ------- Purity: 0.06462564062937842
0:01:56.958123 passed
k = 4 ------- Cost: 29504.674719395563 ------- Entropy: 3.635405949684358 ------- Purity: 0.08314814196552214
0:03:02.887360 passed
k = 5 ------- Cost: 28306.591448482206 ------- Entropy: 3.5344844174595114 ------- Purity: 0.09978389332122171
0:04:40.421894 passed
k = 6 ------- Cost: 27659.540816127 ------- Entropy: 3.463105549572516 ------- Purity: 0.11166627015661391
0:06:24.192283 passed
k = 7 ------- Cost: 26645.498863668377 ------- Entropy: 3.378374480310452 ------- Purity: 0.12079996870115511
0:08:07.817082 passed
k = 8 ------- Cost: 25864.492027725846 ------- Entropy: 3.3171715133728186 ------- Purity: 0.13390926917051255
0:10:12.896524 passed
k = 9 ------- Cost: 25193.301412286368 ------- Entropy: 3.263071079005526 ---

k = 64 ------- Cost: 6814.821631196607 ------- Entropy: 2.1523411297682533 ------- Purity: 0.586074142958039
7:44:48.483095 passed
k = 65 ------- Cost: 6628.361148003603 ------- Entropy: 2.1452766521428877 ------- Purity: 0.5922637055449085
8:03:14.491314 passed
k = 66 ------- Cost: 7340.498421523024 ------- Entropy: 2.1657734688867594 ------- Purity: 0.5676115117532288
8:16:29.337098 passed
k = 67 ------- Cost: 7093.919558364134 ------- Entropy: 2.152887884500906 ------- Purity: 0.5749566132325522
8:29:58.437588 passed
k = 68 ------- Cost: 6763.742376860193 ------- Entropy: 2.136027885723799 ------- Purity: 0.5831825818108087
8:44:38.912223 passed
k = 69 ------- Cost: 6849.927719826392 ------- Entropy: 2.1550753174071313 ------- Purity: 0.5865286581034663
8:58:25.521627 passed
k = 70 ------- Cost: 6596.417889416586 ------- Entropy: 2.1436035455598845 ------- Purity: 0.5895047336381365
9:14:21.727509 passed
k = 71 ------- Cost: 6367.261499435592 ------- Entropy: 2.120849437320323 -----

In [4]:
# Persist the collected per-k statistics as CSV. The header is taken from the
# keys of the first result row (all rows share the same keys).
keys = list(result_stats[0].keys())

# newline='' lets the csv module control line endings itself; without it,
# platforms that translate '\n' on write end up with '\r\r\n' and therefore
# blank lines between rows.
with open('data/mykmeans-pearson-top100.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(result_stats)

In [5]:
# Report when the job finished and the total time elapsed since start_time.
end_time = datetime.datetime.now()
elapsed = end_time - start_time
print('Job done at: {} ---------- {} passed'.format(end_time, elapsed))

Job done at: 2019-06-25 07:21:29.696873 ---------- 19:01:27.255496 passed
