## Imports and variable initialization

In [2]:
import matplotlib.pyplot as plt
import numpy as np
from numpy import array

from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, ArrayType,FloatType

## File split and mapping

In [4]:
# Location of the tab-separated input file.
fileName = "/FileStore/tables/all.tsv"


def _split_on_tabs(line):
    """Break one raw text line into its tab-delimited fields."""
    return line.split('\t')


# Read the file as an RDD of text lines, then turn every line into a list of
# string fields.
data = sc.textFile(fileName)
parsedData = data.map(_split_on_tabs)

## Features that we're using are:
`features['acousticness']`, `features['danceability']`, `features['duration_ms']`, `features['energy']`, `features['instrumentalness']`, `features['key']`,
`features['liveness']`, `features['loudness']`, `features['mode']`, `features['speechiness']`, `features['tempo']`, `features['time_signature']`, `features['valence']`

In [6]:
# Turn the parsed RDD into a DataFrame; only the leading column is given an
# explicit name ("ID").
df = sqlContext.createDataFrame(parsedData, ["ID"])

# Every column other than "ID" arrives as a string: cast each one to float
# (keeping its name) and leave "ID" out of the resulting frame.
float_columns = []
for column_name in df.columns:
    if column_name == "ID":
        continue
    float_columns.append(col(column_name).cast("float").alias(column_name))

# Cached because the frame is reused by the following cells.
new_df = df.select(*float_columns).cache()

In [7]:
# Assemble the numeric columns into one vector, standardize it, then fit k-means.
#
# k-means assigns points by Euclidean distance, so a column with a large
# numeric range outweighs every other column. Assuming new_df holds the audio
# features listed earlier in this notebook (duration_ms next to much
# smaller-valued columns -- TODO confirm), clustering on raw values would be
# driven almost entirely by that one column. Each feature is therefore scaled
# to unit standard deviation before fitting.
#
# NOTE: cluster centers reported by the fitted model are consequently expressed
# in standardized units, not in the original units of each column.

NUM_CLUSTERS = 7000  # number of k-means centroids to fit
KMEANS_SEED = 1      # fixed seed so centroid initialization is repeatable

# define the vector assembler for kmeans
vecAssembler = VectorAssembler(
    inputCols=[x for x in new_df.columns if x not in ["ID"]],
    outputCol='raw_features',
)
assembled_df = vecAssembler.transform(new_df)

# Scale every feature to unit variance. Mean-centering is skipped on purpose:
# shifting all points by the same vector leaves Euclidean distances unchanged.
# The scaled vector is written to 'features' so that code relying on the
# default features column (KMeans, ClusteringEvaluator) keeps working as-is.
scaler = StandardScaler(
    inputCol='raw_features',
    outputCol='features',
    withStd=True,
    withMean=False,
)
scaler_model = scaler.fit(assembled_df)
new_results_df = scaler_model.transform(assembled_df)

# run kmeans on the df
kmeans_results = KMeans(k=NUM_CLUSTERS, seed=KMEANS_SEED)
model_results = kmeans_results.fit(new_results_df.select('features'))

## Evaluation on kmeans

In [9]:
# Score the clustering with the Silhouette metric.
# The column names and metric are spelled out here; they are the evaluator's
# defaults.
evaluator = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="prediction",
    metricName="silhouette",
)

# Attach a cluster prediction to every row, then evaluate those assignments.
transformed = model_results.transform(new_results_df)
silhouette = evaluator.evaluate(transformed)

report = "Silhouette with squared euclidean distance = " + str(silhouette)
print(report)