In [1]:
import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SQLContext
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import ClusteringEvaluator
import plotly.plotly as py
import plotly.graph_objs as go
import itertools as it

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9,application_1542212552589_0010,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# Display the Spark version reported by the active session.
spark.version

'2.3.2'

In [3]:
%%info

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
9,application_1542212552589_0010,pyspark,idle,Link,Link,✔


In [4]:
path = 's3://gdc-emr/final_matrix.csv'
# Read the input CSV: the first row is the header, column types are inferred
# from the data, and rows that cannot be parsed are dropped.
df = (
    spark.read
    .option("header", True)
    .option("mode", "DROPMALFORMED")
    .option("inferSchema", True)
    .csv(path)
)

In [5]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- level_0: integer (nullable = true)
 |-- index: double (nullable = true)
 |-- hsa-let-7a-1: double (nullable = true)
 |-- hsa-let-7a-2: double (nullable = true)
 |-- hsa-let-7a-3: double (nullable = true)
 |-- hsa-let-7b: double (nullable = true)
 |-- hsa-let-7c: double (nullable = true)
 |-- hsa-let-7d: double (nullable = true)
 |-- hsa-let-7e: double (nullable = true)
 |-- hsa-let-7f-1: double (nullable = true)
 |-- hsa-let-7f-2: double (nullable = true)
 |-- hsa-let-7g: double (nullable = true)
 |-- hsa-let-7i: double (nullable = true)
 |-- hsa-mir-1-1: double (nullable = true)
 |-- hsa-mir-1-2: double (nullable = true)
 |-- hsa-mir-100: double (nullable = true)
 |-- hsa-mir-101-1: double (nullable = true)
 |-- hsa-mir-101-2: double (nullable = true)
 |-- hsa-mir-103a-1: double (nullable = true)
 |-- hsa-mir-103a-2: double (nullable = true)
 |-- hsa-mir-103b-1: double (nullable = true)
 |-- hsa-mir-103b-2: double (nullable = true)
 |-- hsa-mir-105-1: double (nullable = true

In [6]:
# Drop the two index columns, the three label columns and case_id from df.
# NOTE(review): the identical assignment is repeated in a later cell before
# df_feat is first read, so this cell looks redundant.
df_feat = df.drop('index','level_0','sample_type', 'disease_type', 'primary_diagnosis', 'case_id')

In [7]:
# String label columns, and the names of the numeric columns derived from them.
label_list = ['sample_type', 'disease_type', 'primary_diagnosis']
num_label_list = [s + '_index' for s in label_list]

# One StringIndexer per label column, fitted on df up front; the fitted models
# are then chained in a Pipeline that appends all three "<label>_index" columns.
indexers = [
    StringIndexer(inputCol=source_col, outputCol=source_col + "_index").fit(df)
    for source_col in label_list
]
pipeline = Pipeline(stages=indexers)
df_label = pipeline.fit(df).transform(df)

In [8]:
def _index_metadata(column_name):
    """Return the schema metadata of the first df_label field named column_name."""
    return [
        field.metadata
        for field in df_label.schema.fields
        if field.name == column_name
    ][0]


# Map each label name (the column name with its trailing "_index" removed) to the
# list stored under ['ml_attr']['vals'] in the metadata of its indexed column.
label_key_table = {
    indexed_name[:-6]: _index_metadata(indexed_name)['ml_attr']['vals']
    for indexed_name in num_label_list
}

In [9]:
# Print how many entries each label's value list holds, one count per line.
for label_name in ('sample_type', 'disease_type', 'primary_diagnosis'):
    print(len(label_key_table[label_name]))

6
32
135

In [10]:
# Display the full label-name -> value-list mapping.
label_key_table

{'disease_type': ['Breast Invasive Carcinoma', 'Kidney Renal Clear Cell Carcinoma', 'Thyroid Carcinoma', 'Lung Adenocarcinoma', 'Uterine Corpus Endometrial Carcinoma', 'Head and Neck Squamous Cell Carcinoma', 'Prostate Adenocarcinoma', 'Brain Lower Grade Glioma', 'Lung Squamous Cell Carcinoma', 'Ovarian Serous Cystadenocarcinoma', 'Stomach Adenocarcinoma', 'Colon Adenocarcinoma', 'Skin Cutaneous Melanoma', 'Bladder Urothelial Carcinoma', 'Liver Hepatocellular Carcinoma', 'Kidney Renal Papillary Cell Carcinoma', 'Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma', 'Sarcoma', 'Esophageal Carcinoma', 'Pheochromocytoma and Paraganglioma', 'Pancreatic Adenocarcinoma', 'Rectum Adenocarcinoma', 'Testicular Germ Cell Tumors', 'Thymoma', 'Kidney Chromophobe', 'Mesothelioma', 'Uveal Melanoma', 'Adrenocortical Carcinoma', 'Acute Myeloid Leukemia', 'Uterine Carcinosarcoma', 'Lymphoid Neoplasm Diffuse Large B-cell Lymphoma', 'Cholangiocarcinoma'], 'sample_type': ['Primary Tumor', 'So

In [11]:
# Preview the first 10 rows of the three numeric label columns.
df_label.select(num_label_list).show(10)

+-----------------+------------------+-----------------------+
|sample_type_index|disease_type_index|primary_diagnosis_index|
+-----------------+------------------+-----------------------+
|              0.0|               8.0|                    1.0|
|              0.0|               8.0|                    1.0|
|              0.0|               8.0|                   44.0|
|              0.0|               8.0|                    1.0|
|              0.0|               8.0|                   44.0|
|              0.0|               8.0|                    1.0|
|              0.0|               5.0|                    1.0|
|              0.0|               5.0|                    1.0|
|              1.0|               5.0|                    1.0|
|              0.0|               5.0|                    1.0|
+-----------------+------------------+-----------------------+
only showing top 10 rows

In [12]:
# Every column except the two index columns, the three label columns and
# case_id is used as a feature.
df_feat = df.drop(
    'index', 'level_0',
    'sample_type', 'disease_type', 'primary_diagnosis',
    'case_id',
)
FEATURE_NAMES = df_feat.columns
FEATURE_SIZE = len(FEATURE_NAMES)
print(FEATURE_SIZE)

1929

In [13]:
# Pack all feature columns into a single vector column named "features".
assembler = (
    VectorAssembler()
    .setInputCols(FEATURE_NAMES)
    .setOutputCol("features")
)
assembled_data = assembler.transform(df)

In [14]:
# Show the assembled feature vector of the first row without truncation.
assembled_data.select('features').show(1,truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
# feature scaling
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')
scaler_model = scaler.fit(assembled_data)
scaled_data = scaler_model.transform(assembled_data)

In [16]:
# Show the scaled feature vectors of the first two rows without truncation.
scaled_data.select('scaledFeatures').show(2,truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [17]:
from pyspark.ml.feature import PCA as PCAml
from pyspark.ml.linalg import Vectors  # Pre 2.0 pyspark.mllib.linalg

# Project the scaled features onto 50 principal components, stored in column "pca".
pca = (
    PCAml()
    .setK(50)
    .setInputCol("scaledFeatures")
    .setOutputCol("pca")
)
model = pca.fit(scaled_data)
transformed = model.transform(scaled_data)

In [48]:
# Collect the 'pca' column of every row into a local Python list of Rows.
pca_data=transformed.select('pca').collect()
type(pca_data)

<class 'list'>

In [49]:
# Stack the PCA vector of every collected Row into one 2-D NumPy array,
# one array row per collected record.
pca_array = np.array([record['pca'][:] for record in pca_data])

In [50]:
# Check the dimensions of the stacked PCA array.
pca_array.shape

(10718, 50)

In [19]:
from sklearn.manifold import TSNE
def tsneSelection(X_train, n, v, random_state=None):
    '''
    Embed a dataset into n dimensions with t-distributed Stochastic Neighbor
    Embedding (t-SNE).

    t-SNE builds new coordinates for every sample; it does not pick a subset
    of the original features.

    Input:
    X_train - dataset with dimension [P-samples x Q-features]
    n - number of dimensions of the embedding
    v - 1 for verbose, 0 for silence
    random_state - optional seed forwarded to TSNE. Pass an int to make the
                   embedding reproducible between runs; the default (None)
                   leaves the run unseeded, exactly as before.
    Output:
    X_train_new - embedded dataset with dimension [P-samples x n]
    '''
    tsne = TSNE(n_components=n, verbose=v, random_state=random_state)
    X_train_new = tsne.fit_transform(X_train)
    print("X_train size after tSNE: {}".format(X_train_new.shape))
    return X_train_new

In [51]:
# Embed the PCA output into 2 dimensions, with verbose t-SNE logging enabled.
tsne_data=tsneSelection(pca_array,2,1)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10718 samples in 0.019s...
[t-SNE] Computed neighbors for 10718 samples in 9.890s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10718
[t-SNE] Computed conditional probabilities for sample 2000 / 10718
[t-SNE] Computed conditional probabilities for sample 3000 / 10718
[t-SNE] Computed conditional probabilities for sample 4000 / 10718
[t-SNE] Computed conditional probabilities for sample 5000 / 10718
[t-SNE] Computed conditional probabilities for sample 6000 / 10718
[t-SNE] Computed conditional probabilities for sample 7000 / 10718
[t-SNE] Computed conditional probabilities for sample 8000 / 10718
[t-SNE] Computed conditional probabilities for sample 9000 / 10718
[t-SNE] Computed conditional probabilities for sample 10000 / 10718
[t-SNE] Computed conditional probabilities for sample 10718 / 10718
[t-SNE] Mean sigma: 2.089728
[t-SNE] KL divergence after 250 iterations with early exaggeration: 82.615601
[t-SNE] KL 

In [52]:
# Collect the three numeric label columns into a local list of Rows.
# NOTE(review): these rows are later joined by position with the t-SNE output,
# which comes from a separately collected DataFrame; this assumes both
# collect() calls return rows in the same order — confirm.
tsne_labels=df_label.select(num_label_list).collect()

In [53]:
# Inspect the sample_type index of the first collected row.
tsne_labels[0].sample_type_index

0.0

In [54]:
def _int_column(rows, column):
    """Return the values of `column` across `rows`, each cast to int."""
    return [int(row[column]) for row in rows]


# One integer list per label column, in the same row order as tsne_labels.
sample_type = _int_column(tsne_labels, 'sample_type_index')
disease_type = _int_column(tsne_labels, 'disease_type_index')
primary_diagnosis = _int_column(tsne_labels, 'primary_diagnosis_index')
label_vals = [sample_type, disease_type, primary_diagnosis]

In [55]:
# Stack the three label lists and transpose so that each row holds the
# (sample_type, disease_type, primary_diagnosis) indices of one sample.
# np.array is used explicitly: a bare `array` is never imported in this
# notebook and raises NameError on a fresh kernel.
label_array = np.array(label_vals).T
label_array.shape

(10718, 3)

In [57]:
# Check the dimensions of the t-SNE embedding.
tsne_data.shape

(10718, 2)

In [58]:
# Put the t-SNE coordinates and the label indices side by side, column-wise,
# so each row carries both the embedding and the labels of one sample.
arr = np.concatenate((tsne_data, label_array), axis=1)
arr.shape

(10718, 5)

In [59]:
# Column order follows arr: the two t-SNE coordinates, then the three label indices.
# NOTE(review): 'diagonosis_idx' looks like a misspelling of "diagnosis"; it is
# kept verbatim because it becomes a column name of dff.
tsne_columns = ['tsne_x','tsne_y','sample_type_idx','disease_type_idx','diagonosis_idx']
tsne_pdf = pd.DataFrame(arr, columns=tsne_columns)
dff = spark.createDataFrame(tsne_pdf)

In [60]:
# Preview the first two rows of the t-SNE / label table.
dff.show(2)

+-------------------+-----------------+---------------+----------------+--------------+
|             tsne_x|           tsne_y|sample_type_idx|disease_type_idx|diagonosis_idx|
+-------------------+-----------------+---------------+----------------+--------------+
|-11.228962898254395| 74.4780502319336|            0.0|             8.0|           1.0|
|-53.291316986083984|2.178873300552368|            0.0|             8.0|           1.0|
+-------------------+-----------------+---------------+----------------+--------------+
only showing top 2 rows

In [62]:
# Write the t-SNE table to S3 as comma-separated CSV with a header row, from a
# single partition, overwriting any existing output at that path.
# The plot itself is produced separately in Python 3.
(
    dff.repartition(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .option("sep", ",")
    .csv("s3://gdc-emr/tsne3")
)

#### Clustering

In [None]:
# norm_costs = {}
# raw_costs = {}
# centroids = {}
# # feel free to comment out some of the below if it runs for too long
# models = {'km_2':KMeans(featuresCol='scaledFeatures', k=2),
#           'km_32':KMeans(featuresCol='scaledFeatures', k=4),
#           'km_130':KMeans(featuresCol='scaledFeatures', k=10),
#           'bkm_2':BisectingKMeans().setK(2).setSeed(1),
#           'bkm_32':BisectingKMeans().setK(4).setSeed(1),
#           'bkm_130':BisectingKMeans().setK(10).setSeed(1),
# }

# for key in models:
#     # Trains a k-means model
#     model = models[key].fit(scaled_data)
#     # Make predictions
#     predictions = model.transform(scaled_data)
#     # Evaluate clustering
#     # sum of squared distances of points to their nearest center
#     cost = model.computeCost(scaled_data)
#     raw_costs[key] = cost
#     # Evaluate clustering by computing Silhouette score
#     # The Silhouette score (computed here with squared Euclidean distance) ranges between -1 and 1, where a value close to 1 means that the points in a cluster are close to the other points in the same cluster and far from the points of the other clusters.
#     evaluator = ClusteringEvaluator()
#     silhouette = evaluator.evaluate(predictions)
#     norm_costs[key] = silhouette
#     # Shows the result.
#     centroids[key] = model.clusterCenters()

In [None]:
# Define Graph layout
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
        )
)

In [None]:
# norm_costs

In [None]:
# cost = np.zeros(20)
# for k in range(2,10):
#     kmeans = BisectingKMeans().setK(k).setSeed(1).setFeaturesCol("scaledFeatures")
#     model = kmeans.fit(scaled_data)
#     cost[k] = model.computeCost(scaled_data) # requires Spark 2.0 or later

In [None]:
import os
# Display the working directory of the process that executes this cell.
cwd = os.getcwd()
cwd

In [None]:
import os
import plotly

# Credentials come from the environment so that no secret is stored in the
# notebook. The API key that used to be hard-coded in this cell must be treated
# as leaked and should be revoked and regenerated.
plotly_api_key = os.environ.get('PLOTLY_API_KEY')
if not plotly_api_key:
    raise RuntimeError(
        "Set the PLOTLY_API_KEY environment variable before running this cell."
    )
plotly.tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'ee542team'),
    api_key=plotly_api_key,
)

In [None]:
# Endless palette of 36 CSS colour names; each next(colors) yields the next one.
colors = it.cycle([
    "aquamarine", "crimson", "darkseagreen", "deeppink",
    "wheat", "violet", "fuchsia", "turquoise",
    "ivory", "honeydew", "rosybrown", "red",
    "lemonchiffon", "darkorchid", "mintcream", "papayawhip",
    "beige", "darkcyan", "firebrick", "deepskyblue",
    "seashell", "mediumpurple", "goldenrod", "lightcoral",
    "limegreen", "cadetblue", "darkmagenta", "ghostwhite",
    "gainsboro", "paleturquoise", "teal", "peru",
    "maroon", "olivedrab", "springgreen", "yellowgreen",
])

# Short display names for the 35 plotted classes.
# NOTE(review): these names and their order differ from the disease_type order
# held in label_key_table — confirm which label encoding this legend belongs to.
classes_labels = [
    'Normal', 'Breast', 'Uterine Corpus', 'Head', 'Kidney Renal Clear',
    'Lung Adenocarcinoma', 'Brain', 'Thyroid', 'Prostate', 'Ovarian',
    'Lung Squamous', 'Skin', 'Colon', 'Stomach', 'Bladder',
    'Liver', 'Cervical', 'Kidney Renal Papillary', 'Leukemia', 'Sarcoma',
    'Esophageal', 'Pheochromocytoma', 'Pancreatic', 'Rectum', 'Testicular',
    'Wilms', 'Thymoma', 'Mesothelioma', 'Adrenocortical', 'Uveal',
    'Kidney Chromophobe', 'Uterine Carcinosarcoma', 'Lymphoid', 'Rhabdoid',
    'Cholangiocarcinoma',
]

# Endless iterator over the same names; it cycles over its own copy of the list.
classes = it.cycle(list(classes_labels))

# x = tsne_data[:,0]
# y = tsne_data[:,1]
# for i in range(0,32):
#     idx=np.where(label_array[:,2]==i)
#     x_sel =[x[i] for i in idx]
#     y_sel =[y[i] for i in idx]

def scatter2D(X_train_2d,y_train,num_class):
    '''
    Build one scatter trace per class label for a 2D scatter plot.

    Args:
        X_train_2d: 2-feature data of dimension [?,2]; column 0 is plotted on
            x and column 1 on y.
        y_train: one integer label per row of X_train_2d (array or list).
        num_class: number of classes; labels 0 .. num_class-1 each get a trace.
    Return: list of scatter plot trace objects, one per label, in label order.

    Each trace takes the next value from the module-level `colors` and
    `classes` iterators, so repeated calls continue where the previous call
    stopped instead of restarting at the first colour/name.
    '''
    points = np.asarray(X_train_2d)
    # Convert labels to a flat array so the comparison below is element-wise
    # even when a plain Python list is passed.
    labels = np.asarray(y_train).ravel()
    data=[]
    for label in range(0,num_class):
        # A boolean mask yields 1-D x/y arrays. Indexing with the tuple
        # returned by np.where() produced 2-D arrays of shape (1, k) instead.
        mask = labels == label
        trace = go.Scatter(
            x=points[mask,0],
            y=points[mask,1],
            mode='markers',
            marker=dict(
                size=5,
                # NOTE(review): the palette colour is applied to the marker
                # outline only (width 0.1) — confirm this is intended.
                line=dict(
                    color=next(colors),
                    width=0.1
                    ),
                opacity=0.5
                ),
            name=next(classes)
            )
        data.append(trace)
    return data