# 1일 평균 수면시간, 1일 평균 발걸음수, 1일 평균 착용시간 데이터에 대해 K-means clustering 수행

In [1]:
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import mean
import os
import time
import pandas as pd

## 1일 평균 수면시간, 1일 평균 발걸음수, 1일 평균 착용시간 구하기

In [2]:
# Build 'sokulee.csv': one row per participant directory holding the mean of
# summary.totalTimeInBed, the mean of the first activities-steps value, and the
# mean of the summed `minutes` of the first four heartRateZones entries
# (written under the 'wearing_time' column).
st = time.time()
directory_path = '/home/hadoop/pyspark/sokulee/'

# `with` guarantees the CSV is flushed and closed even if a Spark read or
# aggregation raises part-way through the loop.
with open('sokulee.csv', 'w') as f:
    f.write('id,sleep,steps,wearing_time\n')
    for n in range(1, 100):
        # Directory names are 'A0' followed by the number (A01 ... A099).
        name = 'A0' + str(n)
        directory_name = directory_path + name

        # Skip participants that have no directory on disk.
        if not os.path.exists(directory_name):
            continue

        df_sleep_temp = spark.read.json(directory_name + '/*_sleep.json')
        df_steps_temp = spark.read.json(directory_name + '/*_steps.json')
        df_heart_temp = spark.read.json(directory_name + '/*_heart.json')

        a = df_sleep_temp.select(
            mean(df_sleep_temp['summary']['totalTimeInBed']).alias('sleep'))
        b = df_steps_temp.select(
            mean(df_steps_temp['activities-steps'][0]['value']).alias('steps'))

        # Name the shared column expression once instead of repeating the
        # full path for each of the four zones.
        zones = df_heart_temp['activities-heart'][0]['value']['heartRateZones']
        zone_minutes = (zones[0]['minutes']
                        + zones[1]['minutes']
                        + zones[2]['minutes']
                        + zones[3]['minutes'])
        c = df_heart_temp.select(mean(zone_minutes).alias('time'))

        # Each select() yields a single-row, single-column result.
        f.write(name + ',' + str(a.collect()[0][0])
                + ',' + str(b.collect()[0][0])
                + ',' + str(c.collect()[0][0]) + '\n')

print(time.time()-st, 'sec')

283.8226888179779 sec


In [3]:
# Load the per-participant averages from 'sokulee.csv' into a pandas DataFrame
# and display it as the cell output.
df_sokulee = pd.read_csv('sokulee.csv')
df_sokulee

Unnamed: 0,id,sleep,steps,wearing_time
0,A01,245.306122,7030.360000,956.439024
1,A02,206.387755,6357.460000,938.333333
2,A03,396.448980,7826.040000,1157.320000
3,A04,358.224490,13340.860000,1312.320000
4,A05,64.632653,4116.300000,944.071429
5,A06,400.367347,13777.280000,1318.660000
6,A07,435.795918,11689.240000,1383.880000
7,A08,411.142857,12586.897959,1338.880000
8,A010,407.795918,5334.500000,1166.520000
9,A016,136.244898,10710.520000,959.175000


## 데이터 정규화 (min-max 정규화)

In [4]:
# Record the smallest and largest observed value of each feature column;
# these bounds are what the min-max scaling step needs.
sleep_min, sleep_max = df_sokulee['sleep'].min(), df_sokulee['sleep'].max()
steps_min, steps_max = df_sokulee['steps'].min(), df_sokulee['steps'].max()
wearing_time_min, wearing_time_max = (
    df_sokulee['wearing_time'].min(),
    df_sokulee['wearing_time'].max(),
)

# Print one "min max" line per feature, in column order.
for low, high in (
    (sleep_min, sleep_max),
    (steps_min, steps_max),
    (wearing_time_min, wearing_time_max),
):
    print(low, high)

13.98 497.510204082
4116.3 21666.4
744.875 1402.24


In [5]:
# Min-max scale every feature column in place: (value - min) / (max - min),
# using the bounds computed beforehand for that column.
for column, low, high in (
    ('sleep', sleep_min, sleep_max),
    ('steps', steps_min, steps_max),
    ('wearing_time', wearing_time_min, wearing_time_max),
):
    df_sokulee[column] = (df_sokulee[column] - low) / (high - low)

# Display the scaled DataFrame as the cell output.
df_sokulee

Unnamed: 0,id,sleep,steps,wearing_time
0,A01,0.478411,0.166042,0.321836
1,A02,0.397923,0.127701,0.294294
2,A03,0.790993,0.211380,0.627422
3,A04,0.711940,0.525613,0.863211
4,A05,0.104756,0.000000,0.303023
5,A06,0.799097,0.550480,0.872856
6,A07,0.872367,0.431504,0.972070
7,A08,0.821382,0.482652,0.903615
8,A010,0.814460,0.069413,0.641417
9,A016,0.252859,0.375737,0.325998


In [6]:
# Export the features to 'test.txt' in LIBSVM text format, one line per
# DataFrame row: "<label> 1:<sleep> 2:<steps> 3:<wearing_time>".
# The label is just a running row number.
#
# Rows are walked once, in order, and columns are addressed by name. The
# previous nested scan over ids was quadratic, emitted a row several times
# when its id was duplicated, and relied on positional indexing of a
# label-indexed Series, which pandas has deprecated.
lable = 0

# `with` closes the file even if formatting or writing a row fails.
with open('test.txt', 'w') as f:
    feature_rows = zip(df_sokulee['sleep'],
                       df_sokulee['steps'],
                       df_sokulee['wearing_time'])
    for sleep, steps, wearing_time in feature_rows:
        f.write('{0} 1:{1} 2:{2} 3:{3}\n'.format(lable, sleep, steps, wearing_time))
        lable += 1

In [7]:
# Read the LIBSVM-formatted feature file into a Spark DataFrame and print a
# preview of its rows.
dataset = (
    spark.read
    .format("libsvm")
    .load("/home/hadoop/pyspark/test.txt")
)
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(3,[0,1,2],[0.478...|
|  1.0|(3,[0,1,2],[0.397...|
|  2.0|(3,[0,1,2],[0.790...|
|  3.0|(3,[0,1,2],[0.711...|
|  4.0|(3,[0,2],[0.10475...|
|  5.0|(3,[0,1,2],[0.799...|
|  6.0|(3,[0,1,2],[0.872...|
|  7.0|(3,[0,1,2],[0.821...|
|  8.0|(3,[0,1,2],[0.814...|
|  9.0|(3,[0,1,2],[0.252...|
| 10.0|(3,[0,1,2],[0.827...|
| 11.0|(3,[0,1,2],[0.802...|
| 12.0|(3,[0,1,2],[0.801...|
| 13.0|(3,[0,1,2],[0.623...|
| 14.0|(3,[0,1,2],[0.867...|
| 15.0|(3,[0,1,2],[0.935...|
| 16.0|(3,[0,1,2],[0.454...|
| 17.0|(3,[0,1,2],[0.580...|
| 18.0|(3,[0,1,2],[0.741...|
| 19.0|(3,[0,1,2],[1.0,0...|
+-----+--------------------+
only showing top 20 rows



In [8]:
# Configure a k-means estimator with 4 clusters and a fixed seed of 1,
# then fit it on `dataset`.
kmeans = KMeans(k=4, seed=1)
model = kmeans.fit(dataset)

In [9]:
# Evaluate clustering by computing the Within Set Sum of Squared Errors.
#
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in 3.0.
# When it is unavailable, fall back to the training summary's cost, which is
# the same quantity measured on the data the model was fitted on (here,
# `dataset`).
if hasattr(model, 'computeCost'):
    wssse = model.computeCost(dataset)
else:
    wssse = model.summary.trainingCost
print("Within Set Sum of Squared Errors = " + str(wssse))

Within Set Sum of Squared Errors = 3.5338372719222386


In [10]:
# Print the header followed by each fitted cluster center on its own line.
centers = model.clusterCenters()
print("Cluster Centers: ", *centers, sep="\n")

Cluster Centers: 
[ 0.80291837  0.24280891  0.79652982]
[ 0.37303319  0.23833184  0.49606394]
[ 0.71362475  0.66430593  0.80831609]
[ 0.5008186   0.25879653  0.20427213]
