In [5]:
import pyspark
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, ArrayType
from pyspark.sql.functions import col, split
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, Bucketizer
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, explode

In [6]:
conf = pyspark.SparkConf().setAppName("App")
conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '4G')
        .set('spark.driver.memory', '45G')
        .set('spark.driver.maxResultSize', '10G'))

In [9]:
sc = pyspark.SparkContext(appName="spectral analysis", conf=conf)
sqlContext = pyspark.SQLContext(sc)

In [10]:
df = sqlContext.read.parquet("spectra.parquet")

In [11]:
df.head(1)

[Row(spectraSetOID=u'ObjectId(5cdd4ba5874f2c5d00000000)', mjd=52381, plate=510, fiber=87, z=0.23860563337802887, zerr=2.5307052055723034e-05, ra=168.85673, dec=1.5685734, type=u'QSO   ', subtype=u'STARBURST BROADLINE  ', name=u'SDSS J111525.62+013406.9', size=3842, spectrum=[None, -8.226900844866626e-17, -8.321458136657122e-17, -8.160798587125998e-17, -7.630893998024958e-17, -7.640438104439415e-17, -7.763041341489164e-17, -7.987136168769509e-17, -8.190224416034274e-17, -8.37576876906602e-17, -8.564980068472348e-17, -8.4452429208208e-17, -8.783046634828902e-17, -8.903391163717456e-17, -8.9931038528419e-17, -8.732245046763821e-17, -8.189543111221757e-17, -7.936729825239656e-17, -7.789225080080788e-17, -7.626966292933288e-17, -7.646206423891568e-17, -7.56322974998535e-17, -7.51852885301249e-17, -7.46608162415893e-17, -7.944748771286817e-17, -8.020207665981437e-17, -7.791554102601063e-17, -7.893932174133904e-17, -8.045569393165042e-17, -7.604861044480555e-17, -7.56031785835002e-17, -7.5928

In [12]:
array_to_vector = udf(lambda vs: Vectors.dense([float(i) for i in vs]), VectorUDT())

In [13]:
null_remover = udf(lambda col: [0.0 if l is None else l for l in col], ArrayType(DoubleType()))

Substitute Nones with 0

In [14]:
subs_df = df.select('mjd', 'plate', 'fiber', 'z', 'type', null_remover(col('spectrum')).alias('spectrum'))

Start by selecting continuum, spectrum and type

In [15]:
selected_df = subs_df.select('mjd', 'plate', 'fiber', 'z', 'type', \
                            array_to_vector(subs_df['spectrum']).alias("spectrum"))

Combine mjd, plate, fiber, z, zerr, ra, dec, size, spectrum and continuum info "features" column.

In [16]:
assembler = VectorAssembler(inputCols=['mjd', 'plate', 'fiber', 'z', 'spectrum'], outputCol = "features")
assembled_df = assembler.transform(selected_df).select('features')

In [17]:
assembled_df.dtypes

[('features', 'vector')]

Predict the type.

# Bisecting K-Means

In [18]:
bkm = BisectingKMeans().setK(2).setSeed(1)
model = bkm.fit(assembled_df)

In [19]:
predictions = model.transform(assembled_df)

In [20]:
evaluator = ClusteringEvaluator()

In [21]:
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print("wtf")

Cluster Centers: 
wtf


In [None]:
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

In [None]:
sc.stop()