In [None]:
from pyspark.sql.types import LongType
from pyspark.sql.types import StringType
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import lit
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import PCA
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark.sql.functions import rand, randn

In [None]:
def setHadoopConfig(name, tenant=None, username=None, password=None):
    """Configure the Hadoop ``swift2d`` connector for the service ``name``.

    All settings are written under the ``fs.swift2d.service.<name>`` prefix of
    the Hadoop configuration obtained from the notebook's global SparkContext
    ``sc``.

    Credentials are not stored in the notebook. Each one is taken from the
    keyword argument or, when that is omitted, from an environment variable:

        tenant   -> SWIFT_TENANT
        username -> SWIFT_USERNAME
        password -> SWIFT_PASSWORD

    NOTE: credentials that were previously committed in this cell should be
    treated as exposed and rotated.

    Args:
        name: Service name appended to the configuration prefix.
        tenant: Tenant id; defaults to ``$SWIFT_TENANT``.
        username: User id; defaults to ``$SWIFT_USERNAME``.
        password: Password; defaults to ``$SWIFT_PASSWORD``.

    Raises:
        ValueError: If a credential is neither passed in nor set (non-empty)
            in the environment.
    """
    import os  # function-scope import keeps this cell self-contained

    def _credential(explicit, env_var):
        # An explicit argument wins; otherwise fall back to the environment.
        value = explicit if explicit is not None else os.environ.get(env_var)
        if not value:
            raise ValueError(
                "Missing Swift credential: pass it explicitly or set " + env_var)
        return value

    # Resolve every secret before touching the configuration so a missing
    # value cannot leave it half-written.
    tenant = _credential(tenant, "SWIFT_TENANT")
    username = _credential(username, "SWIFT_USERNAME")
    password = _credential(password, "SWIFT_PASSWORD")

    prefix = "fs.swift2d.service." + name
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set(prefix + '.auth.url', 'https://identity.open.softlayer.com'+'/v3/auth/tokens')
    hconf.set(prefix + '.auth.endpoint.prefix', 'endpoints')
    hconf.set(prefix + '.tenant', tenant)
    hconf.set(prefix + '.username', username)
    hconf.set(prefix + '.password', password)
    hconf.setInt(prefix + '.http.port', 8080)
    hconf.set(prefix + '.region', 'dallas')
    hconf.setBoolean(prefix + '.public', False)

# Register the object-store settings under the service name that every
# swift2d URI below refers to.
name = "keystone"
setHadoopConfig(name)

store_root = "swift2d://MGH." + name

# Parquet dataset read straight from the object store.
seven_cases = spark.read.parquet(store_root + "/tempParq/7cases.parquet")

# CSV reader options used for the case file.
csv_options = dict(
    header=False,
    inferSchema=True,
    nanValue=0,
    nullValue=0,
    negativeInf=0,
    positiveInf=0,
)

# Case 1 seeds the `cases` frame; each row gets a generated `timeId` and a
# constant `patId` of 1, both stored as longs.
case1_raw = spark.read.csv(store_root + "/csv/csv_case1.csv", **csv_options)
cases = (
    case1_raw
    .withColumn("timeId", monotonically_increasing_id().cast(LongType()))
    .withColumn("patId", lit(1).cast(LongType()))
)

In [None]:
# Remaining case files to append to `cases` (case 1 is already loaded).
caseNumbers = [2,8,9,10,11,12]
for num in caseNumbers:
    caseTemp = (
        spark.read.csv("swift2d://MGH." + name + "/csv/csv_case"+str(num)+".csv",
                       header=False,inferSchema=True,nanValue=0,nullValue=0,negativeInf=0,positiveInf=0)
        .withColumn("timeId", monotonically_increasing_id().cast(LongType()))
        # Tag rows with their own case number. The previous `lit(1)` was a
        # copy-paste from the case-1 load and labelled every case as patient 1,
        # which made `patId` constant across the whole frame.
        .withColumn("patId", lit(num).cast(LongType()))
    )
    # union() matches columns by position, so every case file is expected to
    # have the same column layout as case 1.
    cases = cases.union(caseTemp)

In [None]:
# Convert every column that was read as a string into a double, treating the
# textual markers "Inf" and "NaN" as 0.
for field in cases.schema:
    # Use isinstance rather than comparing str(dataType) with "StringType":
    # the textual form of a DataType is not stable across PySpark versions
    # (newer releases render it as "StringType()"), which would silently skip
    # this clean-up.
    if isinstance(field.dataType, StringType):
        tmp_name = field.name + "temp"
        cleaned = regexp_replace(
            regexp_replace(cases[field.name], "Inf", "0"),
            "NaN", "0")
        cases = cases.withColumn(tmp_name, cleaned.cast(DoubleType()))
        # Swap the numeric copy in under the original column name.
        cases = cases.drop(field.name).withColumnRenamed(tmp_name, field.name)

# Any remaining nulls (including values the cast could not parse) become 0.
cases = cases.na.fill(0)

In [None]:
# Every column except the two bookkeeping ids is a model input; pack them
# into a single vector column named "features".
id_cols = ("timeId", "patId")
feature_cols = [column for column in cases.columns if column not in id_cols]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
cases = assembler.transform(cases)

In [None]:
# Normalise each feature vector (Normalizer defaults) into "normFeatures".
normalizer = Normalizer(inputCol="features", outputCol="normFeatures")
cases = normalizer.transform(cases)

# Fit a 100-component PCA on the normalised vectors.
pca = PCA(k=100, inputCol="normFeatures", outputCol="pcaFeatures")
pcaModel = pca.fit(cases)

# Project onto the principal components and mark the result for caching,
# since later cells reuse it.
cases = pcaModel.transform(cases).cache()

In [None]:
# Cluster the PCA projections into 18 groups; transform() adds the cluster
# assignment column produced by the fitted model.
kmeans = KMeans(featuresCol="pcaFeatures", k=18)
kmeansModel = kmeans.fit(cases)
cases = kmeansModel.transform(cases)

In [None]:
# DataFrames are immutable: repartition() returns a NEW frame and leaves the
# receiver untouched, so the result has to be assigned back. Previously both
# calls were discarded and had no effect.
cases = cases.repartition(150)
seven_cases = seven_cases.repartition(150)
# Emit a blank line; the call form works on both Python 2 and Python 3.
print("")

In [None]:
# Print the first two rows of each frame as a quick sanity check.
for frame in (cases, seven_cases):
    print(frame.take(2))

In [None]:
# Synthetic frame with a single `id` column holding one billion consecutive ids.
df = spark.range(10 ** 9)

In [None]:
# Attach one uniform and one standard-normal random column, then keep only
# the rows whose normal draw is below 2.
uniform_col = rand().alias("uniform")
normal_col = randn().alias("normal")
df = df.select("id", uniform_col, normal_col).filter("normal < 2")

In [None]:
# Mark `df` for caching and run count() so the cache is actually populated;
# the row count is the cell's displayed result.
df.cache().count()

In [None]:
# Row count of the combined `cases` frame; count() is an action, so this
# forces the pipeline built in the cells above to execute.
cases.count()

In [None]:
# Print the leading rows of the synthetic frame as a text table.
df.show()

In [None]:
# Look up the executor memory setting of the running session.
spark.conf.get("spark.executor.memory")

In [None]:
# List every key/value pair of the SparkContext configuration.
# NOTE(review): `_conf` is a private attribute; `sc.getConf().getAll()` looks
# like the public equivalent -- confirm before relying on this.
sc._conf.getAll()

In [None]:
# Smoke test that the kernel is responsive. Uses the call form, which is valid
# on both Python 2 and Python 3 (the statement form `print 1` is a SyntaxError
# on Python 3) and matches the print(...) calls used elsewhere in the notebook.
print(1)

In [1]:
# Shell escape: long listing of the kernel's current working directory.
!ls -l

total 11967144
-rw------- 1 seff-34c2f0d3dcc620-a916a00b641d users 1478690037 Jun 27 15:58 Case18_seg1.mat
-rw------- 1 seff-34c2f0d3dcc620-a916a00b641d users  733859235 Jun 28 12:38 Case1_seg11.mat
-rw------- 1 seff-34c2f0d3dcc620-a916a00b641d users   57286534 Jun 29 15:28 Case1_seg12.mat
-rw------- 1 seff-34c2f0d3dcc620-a916a00b641d users  983503825 Jun 27 16:44 Case1_seg1.mat
-rw------- 1 seff-34c2f0d3dcc620-a916a00b641d users  826333198 Jun 27 16:19 Case5_seg1.mat
-rw------- 1 seff-34c2f0d3dcc620-a916a00b641d users       4329 Jun 28 13:18 compute_spectrogram_sunhaoqi.py
-rw------- 1 seff-34c2f0d3dcc620-a916a00b641d users       3569 Jun 28 13:58 compute_spectrogram_sunhaoqi.pyc
drwx------ 2 seff-34c2f0d3dcc620-a916a00b641d users       4096 Jun 29 12:42 debug
-rw------- 1 seff-34c2f0d3dcc620-a916a00b641d users        703 Jun 29 08:46 fcn_shannon_entro.py
-rw------- 1 seff-34c2f0d3dcc620-a916a00b641d users      15695 Jun 29 15:27 full_pipeline.py
-rw------- 1 seff-34c2f0d3d

In [7]:
# Shell escape: long listing of the TeraGen-1TB directory.
!ls -l TeraGen-1TB

total 2929694400
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:27 part-r-00000
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:25 part-r-00001
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:35 part-r-00002
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:31 part-r-00003
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:33 part-r-00004
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:26 part-r-00005
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:35 part-r-00006
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:37 part-r-00007
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:37 part-r-00008
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:37 part-r-00009
-rw-r--r-- 1 seff-34c2f0d3dcc620-a916a00b641d users 3333333300 Jun 29 13:22 part-r

In [5]:
# Read every file under TeraGen-1TB/ as text; the resulting frame is queried
# through its `value` column in the next cell.
teraFile = spark.read.text("TeraGen-1TB/")

In [8]:
# Disabled alternative: show a 30% sample (with replacement) of the lines.
#teraFile.sample(True,.3).show()
# Count occurrences of each distinct line and fetch 100 of the resulting rows.
# NOTE(review): the captured output contains \x00 bytes and \ufffd replacement
# characters, which suggests these files are binary rather than UTF-8 text --
# confirm that spark.read.text is the right reader for this data.
teraFile.groupBy("value").count().take(100)

[Row(value=u'\x00\x00\x00\x110000000000000000000000000047E02A\ufffd\ufffd\ufffd\ufffdBBBBEEEE4444FFFF7777BBBB6666222211111111DDDD7777\ufffd\ufffd\ufffd\ufffd\ufffdrZ\u0234\ufffd\ufffd\ufffd.\ufffd\x00\x110000000000000000000000000047E02B\ufffd\ufffd\ufffd\ufffd44448888555533335555FFFF0000777788888888FFFF4444\ufffd\ufffd\ufffd\ufffd\ufffd1\ufffd0!\ufffd\ufffdb?\ufffd\x00\x110000000000000000000000000047E02C\ufffd\ufffd\ufffd\ufffd2222FFFF4444AAAAAAAA99993333222288881111CCCC5555\ufffd\ufffd\ufffd\ufffd\x15s\ufffd\ufffd9\ufffd-\ufffd\ufffd\x00\x110000000000000000000000000047E02D\ufffd\ufffd\ufffd\ufffd1111CCCCAAAA3333000077776666CCCC666688881111AAAA\ufffd\ufffd\ufffd\ufffd=v\x13\ufffd[\ufffdp\ufffd\ufffd\ufffd\x00\x110000000000000000000000000047E02E\ufffd\ufffd\ufffd\ufffd111100006666666666667777111199992222BBBB00003333\ufffd\ufffd\ufffd\ufffd!5a\ufffd\ufffd3\ufffdu\ufffd\ufffd\x00\x110000000000000000000000000047E02F\ufffd\ufffd\ufffd\ufffd22226666BBBBDDDD3333CCCCAAAA111199999999DDDD0000\uf