In [1]:
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import mean
import os
import time
import pandas as pd

In [2]:
# Build sokulee.csv with one row per participant directory that exists:
#   sleep        - mean of summary.totalTimeInBed over the *_sleep.json records
#   steps        - mean of activities-steps[0].value over the *_steps.json records
#   wearing_time - mean of the summed minutes of heart-rate zones 0..3 over the
#                  *_heart.json records
# Relies on the `spark` session that the notebook kernel provides.
st = time.time()
directory_path = '/home/hadoop/pyspark/sokulee/'


def _csv_field(value):
    """Render one aggregate for the CSV; a null aggregate becomes an empty field.

    An empty field is read back as NaN by pandas, whereas the literal text
    'None' may be kept as a string and turn the whole column non-numeric.
    """
    return '' if value is None else str(value)


# The context manager guarantees the file is flushed and closed even when a
# Spark read or aggregation raises part-way through the loop.
with open('sokulee.csv', 'w') as f:
    f.write('id,sleep,steps,wearing_time\n')
    for n in range(1, 100):
        # NOTE(review): for n >= 10 this yields 'A010' ... 'A099' (not 'A10');
        # confirm that this matches the real directory naming.
        name = 'A0' + str(n)
        directory_name = directory_path + name

        # Participants without a data directory are skipped silently.
        if not os.path.exists(directory_name):
            continue

        df_sleep_temp = spark.read.json(os.path.join(directory_name, '*_sleep.json'))
        df_steps_temp = spark.read.json(os.path.join(directory_name, '*_steps.json'))
        df_heart_temp = spark.read.json(os.path.join(directory_name, '*_heart.json'))

        # Minutes spent in each of the first four heart-rate zones, summed per record.
        zones = df_heart_temp['activities-heart'][0]['value']['heartRateZones']
        worn_minutes = (zones[0]['minutes'] + zones[1]['minutes']
                        + zones[2]['minutes'] + zones[3]['minutes'])

        sleep_mean = df_sleep_temp.select(
            mean(df_sleep_temp['summary']['totalTimeInBed']).alias('sleep')
        ).collect()[0][0]
        steps_mean = df_steps_temp.select(
            mean(df_steps_temp['activities-steps'][0]['value']).alias('steps')
        ).collect()[0][0]
        wearing_mean = df_heart_temp.select(
            mean(worn_minutes).alias('time')
        ).collect()[0][0]

        f.write(','.join([name,
                          _csv_field(sleep_mean),
                          _csv_field(steps_mean),
                          _csv_field(wearing_mean)]) + '\n')

print(time.time()-st, 'sec')

275.8056843280792 sec


In [3]:
# Load the per-participant averages from sokulee.csv (relative to the kernel's
# working directory) into a pandas DataFrame and preview the first rows.
df_sokulee = pd.read_csv('sokulee.csv')
df_sokulee.head()

Unnamed: 0,id,sleep,steps,wearing_time
0,A01,245.306122,7030.36,956.439024
1,A02,206.387755,6357.46,938.333333
2,A03,396.44898,7826.04,1157.32
3,A04,358.22449,13340.86,1312.32
4,A05,64.632653,4116.3,944.071429


In [4]:
# Smallest and largest observed value of each feature column. The six
# module-level names are kept because the scaling step reads them.
sleep_min, sleep_max = df_sokulee['sleep'].min(), df_sokulee['sleep'].max()
steps_min, steps_max = df_sokulee['steps'].min(), df_sokulee['steps'].max()
wearing_time_min, wearing_time_max = (
    df_sokulee['wearing_time'].min(),
    df_sokulee['wearing_time'].max(),
)

# One "min max" line per feature, in column order.
for bounds in (
    (sleep_min, sleep_max),
    (steps_min, steps_max),
    (wearing_time_min, wearing_time_max),
):
    print(*bounds)

13.98 497.510204082
4116.3 21666.4
744.875 1402.24


In [5]:
# Min-max scale each feature: (x - min) / (max - min), using the bounds
# computed beforehand.
# The columns are overwritten in place, so running this cell a second time
# without reloading the frame rescales values that are already scaled.
for column, lower, upper in (
    ('sleep', sleep_min, sleep_max),
    ('steps', steps_min, steps_max),
    ('wearing_time', wearing_time_min, wearing_time_max),
):
    df_sokulee[column] = (df_sokulee[column] - lower) / (upper - lower)

df_sokulee.head()

Unnamed: 0,id,sleep,steps,wearing_time
0,A01,0.478411,0.166042,0.321836
1,A02,0.397923,0.127701,0.294294
2,A03,0.790993,0.21138,0.627422
3,A04,0.71194,0.525613,0.863211
4,A05,0.104756,0.0,0.303023


In [6]:
# Export the scaled features to test.txt in LIBSVM text format, one line per
# DataFrame row:  "<label> 1:<sleep> 2:<steps> 3:<wearing_time>".
# The label is only the running row number (0, 1, 2, ...).
#
# Rows are walked exactly once and columns are addressed by name. The earlier
# "for each id, filter the frame by that id" form was quadratic, wrote rows
# repeatedly whenever an id occurred more than once, and depended on the
# positional order of the columns.
feature_rows = zip(df_sokulee['sleep'], df_sokulee['steps'], df_sokulee['wearing_time'])

# The context manager closes the file even if a write fails.
with open('test.txt', 'w') as f:
    for label, (sleep, steps, wearing_time) in enumerate(feature_rows):
        # float() makes the written text a plain Python float repr regardless
        # of whether iteration yields NumPy scalars or Python floats.
        f.write('{0} 1:{1} 2:{2} 3:{3}\n'.format(
            label, float(sleep), float(steps), float(wearing_time)))

In [7]:
# Read the LIBSVM file back as a Spark DataFrame with `label` and `features`
# columns, then print the first rows.
# NOTE(review): the export step writes 'test.txt' relative to the working
# directory, while this loads an absolute path - presumably the kernel runs
# from /home/hadoop/pyspark; confirm.
dataset = (
    spark.read
    .format("libsvm")
    .load("/home/hadoop/pyspark/test.txt")
)
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(3,[0,1,2],[0.478...|
|  1.0|(3,[0,1,2],[0.397...|
|  2.0|(3,[0,1,2],[0.790...|
|  3.0|(3,[0,1,2],[0.711...|
|  4.0|(3,[0,2],[0.10475...|
|  5.0|(3,[0,1,2],[0.799...|
|  6.0|(3,[0,1,2],[0.872...|
|  7.0|(3,[0,1,2],[0.821...|
|  8.0|(3,[0,1,2],[0.814...|
|  9.0|(3,[0,1,2],[0.252...|
| 10.0|(3,[0,1,2],[0.827...|
| 11.0|(3,[0,1,2],[0.802...|
| 12.0|(3,[0,1,2],[0.801...|
| 13.0|(3,[0,1,2],[0.623...|
| 14.0|(3,[0,1,2],[0.867...|
| 15.0|(3,[0,1,2],[0.935...|
| 16.0|(3,[0,1,2],[0.454...|
| 17.0|(3,[0,1,2],[0.580...|
| 18.0|(3,[0,1,2],[0.741...|
| 19.0|(3,[0,1,2],[1.0,0...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# Configure a k-means estimator with 4 clusters and a fixed seed (so the
# initialisation is repeatable), then fit it on the LIBSVM dataset.
kmeans = KMeans(k=4, seed=1)
model = kmeans.fit(dataset)

In [9]:
# Within-set sum of squared errors of the fitted model on `dataset`.
# KMeansModel.computeCost was deprecated in Spark 2.4 and removed in 3.0, so
# it is used only where it still exists; otherwise the cost is taken from the
# training summary.
if hasattr(model, "computeCost"):
    wssse = model.computeCost(dataset)
else:
    # `dataset` is the frame the model was fitted on, so the training cost
    # reported by the summary is the same quantity.
    wssse = model.summary.trainingCost
print("Within Set Sum of Squared Errors = " + str(wssse))

Within Set Sum of Squared Errors = 3.5870179674240017


In [10]:
# Fetch the fitted centroids and print them one per line; each is a vector in
# the scaled (sleep, steps, wearing_time) feature space.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

Cluster Centers: 
[ 0.82739283  0.19190198  0.79634372]
[ 0.69639868  0.95365155  0.83189472]
[ 0.711156    0.54682726  0.78079728]
[ 0.39703327  0.22660912  0.37450526]
