In [1]:
# import libs
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
import datetime
from pyspark.ml.feature import PCA
from pyspark.sql import Row
import csv
import MyKmeans

In [2]:
# Load the per-customer quantity table from CSV.
# The first line of the file supplies the column names, and rows that
# cannot be parsed are discarded (DROPMALFORMED) instead of raising.
csv_path = "data/cus-prod-type-totalqty.csv"
df = spark.read.csv(csv_path, mode="DROPMALFORMED", header=True)

# Show a single row as a quick sanity check of the layout.
df.show(1)

+---------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|customer_number| AA| AB| AC| AD| AE| AF| AG| AH| AI| AJ| AK| AL| AM| AN| AO| AP| AQ| AR| AS| AT| AU| AV| AW| AX| AY| AZ| BA| BB| BC| BD| BE| BF| BG| BH| BI| BJ| BK| BL| BM| BN| BO| BP| BQ| BR| BS| BT| BU| BV| BW| BX| BY| BZ| CA| CC| CD| CE| CF| CG| CH| CI| CJ| CK| CL| CM| CN|
+---------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|          10000|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

In [3]:
# Every column after the first is cast to int under its original name;
# the first column is selected through unchanged.
cols = df.columns[1:]
int_columns = [col(name).cast("int").alias(name) for name in cols]
df = df.select(df.columns[0], *int_columns)

In [4]:
# Pack all columns except the first into a single vector column named
# "features", which is the input column Spark ML estimators read by default.
feature_cols = df.columns[1:]
vecAssembler = VectorAssembler(outputCol="features", inputCols=feature_cols)
stream_df = vecAssembler.transform(df)

In [5]:
# The raw per-product columns are now duplicated inside "features";
# drop them so stream_df keeps only the remaining columns.
raw_product_cols = df.columns[1:]
stream_df = stream_df.drop(*raw_product_cols)

In [7]:
# Sweep k and compare the custom MyKmeans implementation with Spark ML's KMeans.
# For every k one record {'k', 'Spark', 'Self'} is appended per metric.
cost_stats, entropy_stats, purity_stats = [], [], []

# Reference timestamp for the elapsed-time line printed after each k
# (also read by the final "Job done" cell).
start_time = datetime.datetime.now()

# Hyperparameters shared by both implementations.
shared_params = dict(distanceMeasure='cosine', maxIter=50, tol=0.0001)
# Keys of the summarize() result that the entropy/purity helpers consume, in call order.
summary_keys = ('count', 'sum_count', 'sum_set')
# Column expressions used to project the joined Spark predictions; built once, reused per k.
int_feature_cols = [col(name).cast('int') for name in cols]

for k in range(2, 100):
    # ---- custom implementation -------------------------------------------
    kmeans_self = MyKmeans.MyKmeans(K=k, **shared_params)
    centers_self = kmeans_self.train(df)
    result_self = kmeans_self.predict(df, centers_self)
    summarize_self = kmeans_self.summarize(result_self)
    cost_self = kmeans_self.computeCost(df, centers_self)
    entropy_self = kmeans_self.computeEntropy(*[summarize_self[key] for key in summary_keys])
    purity_self = kmeans_self.computePurity(*[summarize_self[key] for key in summary_keys])

    # ---- Spark ML implementation -----------------------------------------
    kmeans_spark = KMeans(k=k, initMode='random', **shared_params)
    train_model = kmeans_spark.fit(stream_df)
    centers_spark = train_model.clusterCenters()
    predictions_spark = train_model.transform(stream_df)

    # Re-attach the raw columns to the predictions so the Spark result can be
    # passed through the same MyKmeans scoring helpers as the custom result.
    joined = df.join(predictions_spark, ['customer_number'])
    result_spark = joined.select('customer_number', 'prediction', *int_feature_cols)
    summarize_spark = kmeans_self.summarize(result_spark)
    cost_spark = kmeans_self.computeCost(df, centers_spark)
    entropy_spark = kmeans_self.computeEntropy(*[summarize_spark[key] for key in summary_keys])
    purity_spark = kmeans_self.computePurity(*[summarize_spark[key] for key in summary_keys])

    # ---- record ----------------------------------------------------------
    for stats, spark_value, self_value in (
        (cost_stats, cost_spark, cost_self),
        (entropy_stats, entropy_spark, entropy_self),
        (purity_stats, purity_spark, purity_self),
    ):
        stats.append({'k': k, 'Spark': spark_value, 'Self': self_value})

    # ---- report ----------------------------------------------------------
    print('k = {}'.format(k))
    for template, spark_value, self_value in (
        ('Cost------------ Spark: {} ------- Self: {}', cost_spark, cost_self),
        ('Entropy -------- Spark: {} ------- Self: {}', entropy_spark, entropy_self),
        ('Purity --------- Spark: {} ------- Self: {}', purity_spark, purity_self),
    ):
        print(template.format(spark_value, self_value))
    print('{} passed'.format(datetime.datetime.now() - start_time))

k = 2
Cost------------ Spark: 28811.907414950096 ------- Self: 30195.629265562016
Entropy -------- Spark: 4.071832950360835 ------- Self: 3.909885653810936
Purity --------- Spark: 0.050252076385573 ------- Self: 0.05073130763727764
0:00:32.787905 passed
k = 3
Cost------------ Spark: 28303.072262823738 ------- Self: 29140.22959901846
Entropy -------- Spark: 4.03076898640203 ------- Self: 3.761933378631347
Purity --------- Spark: 0.060439938828797175 ------- Self: 0.06500160739902049
0:01:25.992595 passed
k = 4
Cost------------ Spark: 27454.439219881453 ------- Self: 28142.087609022383
Entropy -------- Spark: 3.9307283699721998 ------- Self: 3.6466430470090563
Purity --------- Spark: 0.0732741604286933 ------- Self: 0.08434590874006519
0:02:15.499715 passed
k = 5
Cost------------ Spark: 26512.290093424428 ------- Self: 27197.576618483425
Entropy -------- Spark: 3.8190879053273625 ------- Self: 3.538522621502247
Purity --------- Spark: 0.08951435243827001 ------- Self: 0.09478590185808455

k = 34
Cost------------ Spark: 14365.924042147539 ------- Self: 12744.28286848896
Entropy -------- Spark: 2.860341927177348 ------- Self: 2.5167898651065492
Purity --------- Spark: 0.3579433552632856 ------- Self: 0.4157932306574547
0:58:57.920679 passed
k = 35
Cost------------ Spark: 13782.985713472555 ------- Self: 12276.756547966288
Entropy -------- Spark: 2.731908795798288 ------- Self: 2.4879728101665317
Purity --------- Spark: 0.3648811728733129 ------- Self: 0.43037284838195033
1:02:07.787047 passed
k = 36
Cost------------ Spark: 12938.589359023314 ------- Self: 11718.191212533036
Entropy -------- Spark: 2.674307572605902 ------- Self: 2.457987291087607
Purity --------- Spark: 0.39086851083064944 ------- Self: 0.44418167157130184
1:04:45.910995 passed
k = 37
Cost------------ Spark: 13259.757723711002 ------- Self: 11322.591077445466
Entropy -------- Spark: 2.7088262530159297 ------- Self: 2.4353148828643154
Purity --------- Spark: 0.38150743155818395 ------- Self: 0.454566178253

k = 67
Cost------------ Spark: 6143.494006194143 ------- Self: 6931.505140765464
Entropy -------- Spark: 2.13956099994057 ------- Self: 2.153051120269122
Purity --------- Spark: 0.5861731098425287 ------- Self: 0.5749671436565723
3:12:29.740962 passed
k = 68
Cost------------ Spark: 6000.122761249825 ------- Self: 6623.449358923116
Entropy -------- Spark: 2.1471259775384532 ------- Self: 2.13502002438733
Purity --------- Spark: 0.583347009009852 ------- Self: 0.5832906190304415
3:18:15.793235 passed
k = 69
Cost------------ Spark: 7436.699725683803 ------- Self: 7323.128897946229
Entropy -------- Spark: 2.2367063119814223 ------- Self: 2.1701925797273285
Purity --------- Spark: 0.5448576121377117 ------- Self: 0.5722116800524447
3:24:32.056350 passed
k = 70
Cost------------ Spark: 6956.727944368288 ------- Self: 6049.278787367688
Entropy -------- Spark: 2.2155411916408934 ------- Self: 2.104468497627032
Purity --------- Spark: 0.5571633562767445 ------- Self: 0.6047470779340333
3:30:23.2

In [8]:
# Persist the per-k cost comparison (Spark vs. Self) as CSV.
# The header row is taken from the keys of the first record.
keys_cost = cost_stats[0].keys()
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' on write (e.g. Windows) end up with a
# blank line after every row.
with open('data/cost-stats-top100.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys_cost)
    dict_writer.writeheader()
    dict_writer.writerows(cost_stats)

In [9]:
# Persist the per-k entropy comparison (Spark vs. Self) as CSV.
# The header row is taken from the keys of the first record.
keys_entropy = entropy_stats[0].keys()
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' on write (e.g. Windows) end up with a
# blank line after every row.
with open('data/entropy-stats-top100.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys_entropy)
    dict_writer.writeheader()
    dict_writer.writerows(entropy_stats)

In [10]:
# Persist the per-k purity comparison (Spark vs. Self) as CSV.
# The header row is taken from the keys of the first record.
keys_purity = purity_stats[0].keys()
# newline='' hands line-ending control to the csv module; without it,
# platforms that translate '\n' on write (e.g. Windows) end up with a
# blank line after every row.
with open('data/purity-stats-top100.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys_purity)
    dict_writer.writeheader()
    dict_writer.writerows(purity_stats)

In [11]:
# Report when the notebook finished and the total wall-clock time elapsed
# since start_time was taken at the beginning of the k-means sweep.
end_time = datetime.datetime.now()
total_elapsed = end_time - start_time
print('Job done at: {} ---------- {} passed'.format(end_time, total_elapsed))

Job done at: 2019-06-24 12:13:42.769682 ---------- 7:07:02.302905 passed
