In [1]:
import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer,IndexToString
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# from pyspark.ml.feature import IDF
# from pyspark.ml.feature import DCT
# from pyspark.ml.feature import PolynomialExpansion
# from pyspark.ml.feature import ChiSqSelector
from pyspark.ml import Pipeline
import itertools as it
import pyspark.sql.functions as f
import boto3

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1543238914088_0001,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
# TODO 1. rename column names containing '.' (GS1-279B7.1, GS1-600G8.3 CAND1.11, HY.1)
#      2. read csv sep = ',' for cpv final matrix

In [3]:
spark = SparkSession.builder.appName("DiseaseApp").getOrCreate()

In [4]:
spark

<pyspark.sql.session.SparkSession object at 0x7f145a3bb6d8>

In [5]:
%%info

ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1543238914088_0001,pyspark,idle,Link,Link,✔


In [6]:
SparkContext.setSystemProperty('spark.driver.memory', '15g')

In [7]:
seed = 0

In [8]:
SparkContext.setSystemProperty('spark.executor.memory', '15g')
#sc._conf.getAll()

In [9]:
# helper function to get all stored variables
def list_dataframes():
    from pyspark.sql import DataFrame
    return [k for (k, v) in globals().items() if isinstance(v, DataFrame)]

In [10]:
from botocore.exceptions import ClientError

def check(s3, bucket, key):
    try:
        s3.head_object(Bucket=bucket, Key=key)
    except ClientError as e:
        return int(e.response['Error']['Code']) != 404
    return True

In [11]:
def getTrainingMetrics(trainingSummary,printout=True):
    # for multiclass, we can inspect metrics on a per-label basis
#     print("False positive rate by label:")
#     for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
#         print("label %d: %s" % (i, rate))

#     print("True positive rate by label:")
#     for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
#         print("label %d: %s" % (i, rate))

#     print("Precision by label:")
#     for i, prec in enumerate(trainingSummary.precisionByLabel):
#         print("label %d: %s" % (i, prec))

#     print("Recall by label:")
#     for i, rec in enumerate(trainingSummary.recallByLabel):
#         print("label %d: %s" % (i, rec))

    print("F-measure by label:")
    for i, f in enumerate(trainingSummary.fMeasureByLabel()):
        print("label %d: %s" % (i, f))

    accuracy = trainingSummary.accuracy
    falsePositiveRate = trainingSummary.weightedFalsePositiveRate
    truePositiveRate = trainingSummary.weightedTruePositiveRate
    fMeasure = trainingSummary.weightedFMeasure()
    precision = trainingSummary.weightedPrecision
    recall = trainingSummary.weightedRecall
    if printout is True:
        print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
          % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
    return {"accuracy": accuracy, "fpr": falsePositiveRate, "tpr": truePositiveRate, "fmeasure": fMeasure, \
            "precision": precision, "recall": recall}

In [12]:
# read data file
mirna_path = 's3://gdc-emr0/mirna_filtered_matrix.csv'
cpv_path = 's3://gdc-emr0/cpv_filtered_matrix.csv'
mrna_path = 's3://gdc-emr0/mrna_filtered_matrix.csv'
# read mirna
df_mirna = spark.read.option("maxColumns", 22400).csv(
    mirna_path, header=True, sep = ',',mode="DROPMALFORMED",inferSchema=True)
# read cpv
df_cpv = spark.read.option("maxColumns", 22400).csv(
    cpv_path, header=True, sep = ',',mode="DROPMALFORMED",inferSchema=True)
# # read mrna
# df_mrna = spark.read.option("maxColumns", 22400).csv(
#     mrna_path, header=True, sep = ',',mode="DROPMALFORMED",inferSchema=True)

In [13]:
df_cpv = df_cpv.toDF(*(c.replace('.', '_') for c in df_cpv.columns))
# df_mrna = df_mrna.toDF(*(c.replace('.', '_') for c in df_mrna.columns))

In [14]:
# group columns by label, identifier and feature
label_columns = ['sample_type', 'disease_type', 'primary_diagnosis']
mirna_identifier_columns = ['sample_id','case_id']
mirna_feature_columns = [x for x in df_mirna.columns if x not in (label_columns+mirna_identifier_columns)]

In [15]:
# group columns by label, identifier and feature
cpv_identifier_columns= ['_c0','sample_id','case_id']
cpv_feature_columns = [x for x in df_cpv.columns if x not in (label_columns+cpv_identifier_columns)]
df_cpv = df_cpv.withColumnRenamed("sample_type", "sample_type_cpv").withColumnRenamed("disease_type", "disease_type_cpv").withColumnRenamed("primary_diagnosis","primary_diagnosis_cpv").withColumnRenamed("case_id", "case_id_cpv")
cpv_label_columns = ['sample_type_cpv', 'disease_type_cpv', 'primary_diagnosis_cpv']
cpv_identifier_columns=['_c0','sample_id','case_id_cpv']

In [None]:
# # group columns by label, identifier and feature
# # cpv_label_columns = ['sample_type', 'disease_type', 'primary_diagnosis']
# mrna_identifier_columns= ['_c0','sample_id','case_id']
# mrna_feature_columns = [x for x in df_mrna.columns if x not in (label_columns+mrna_identifier_columns)]

In [16]:
# convert features into (sparse) vectors
# mirna
assembler = VectorAssembler(inputCols=mirna_feature_columns, outputCol='features_mirna')
df_mirna = assembler.transform(df_mirna)
df_mirna=df_mirna.drop(*mirna_feature_columns)
# eventually we should store/load from HDFS

In [17]:
# cpv
assembler = VectorAssembler(inputCols=cpv_feature_columns, outputCol='features_cpv')
df_cpv = assembler.transform(df_cpv)
df_cpv = df_cpv.drop(*cpv_feature_columns)

In [18]:
df = df_cpv.join(df_mirna, on=['sample_id'], how='left_outer')

In [19]:
# convert string labels to numerical labels
# keep track of the column names of numerical labels
label_idx_columns = [s + '_idx' for s in cpv_label_columns]

# declare indexers for 3 columns
labelIndexer = [StringIndexer(inputCol=column, outputCol=column+'_idx',handleInvalid="error",
                              stringOrderType="frequencyDesc") for column in cpv_label_columns ]
# pipeline is needed to process a list of 3 labels
pipeline = Pipeline(stages=labelIndexer)
# transform 3 label columns from string to number catagoies 
df = pipeline.fit(df).transform(df)

# create dictionary containing 3 label lists
label_dict = {c.name: c.metadata["ml_attr"]["vals"]
for c in df.schema.fields if c.name.endswith("_idx")}

In [20]:
# full-feature
full_feature_columns=['features_mirna','features_cpv']
assembler = VectorAssembler(inputCols=full_feature_columns, outputCol='full_features')
df = assembler.transform(df)
# df = df.drop(*cpv_feature_columns)

In [21]:
# Standardization 
scaler = StandardScaler(inputCol='full_features', outputCol='scaledFeatures', withStd=True, withMean=True)

# # # Convert indexed labels back to original labels
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=label_dict['disease_type_cpv_idx'])

In [22]:
# Evaluators
f1_evaluator=MulticlassClassificationEvaluator(predictionCol="prediction",labelCol='disease_type_cpv_idx', metricName='f1')
acc_evaluator=MulticlassClassificationEvaluator(predictionCol="prediction",labelCol='disease_type_cpv_idx', metricName='accuracy')
precision_evaluator=MulticlassClassificationEvaluator(predictionCol="prediction",labelCol='disease_type_cpv_idx', metricName='weightedPrecision')
recall_evaluator=MulticlassClassificationEvaluator(predictionCol="prediction",labelCol='disease_type_cpv_idx', metricName='weightedRecall')


In [23]:
# test/train split 
Xtest,Xtrain = df.randomSplit([0.3, 0.7], seed)

In [24]:
s3 = boto3.client('s3')
stdmodelPath = 'full_std_model/data/_SUCCESS'
if check(s3, 'gdc-emr0', stdmodelPath) == False:
    print("saving StandardScalar model...")
#     stdmodel = scaler.fit(Xtrain)
    stdmodel.save('s3://gdc-emr0/full_std_model')
else:
    from pyspark.ml.feature import StandardScalerModel
    print("loading StandardScalar model...")
    stdmodel = StandardScalerModel.load("s3://gdc-emr0/full_std_model")

loading StandardScalar model...

In [25]:
Xtrain = stdmodel.transform(Xtrain)
Xtest = stdmodel.transform(Xtest)

In [26]:
from pyspark.ml.feature import IDF
idf = IDF(inputCol="scaledFeatures", outputCol="idffeatures")
s3 = boto3.client('s3')
idfmodelPath = 'full_idf_model/data/_SUCCESS'
if check(s3, 'gdc-emr0', idfmodelPath) == False:
    print("saving IDF model...")
    idfModel = idf.fit(Xtrain)
    idfModel.save('s3://gdc-emr0/full_idf_model')
else:
    from pyspark.ml.feature import IDFModel
    print("loading IDF model...")
    idfModel = IDFModel.load("s3://gdc-emr0/full_idf_model")
Xtrain = idfModel.transform(Xtrain)
Xtest = idfModel.transform(Xtest)

saving IDF model...

## Logistic Regression

In [29]:
# Logistic Regression model 
lr = LogisticRegression(aggregationDepth= 3,maxIter=1000, regParam=0.4, elasticNetParam=0.5,
                        featuresCol='idffeatures',labelCol ='disease_type_cpv_idx',
                       family='multinomial',tol=1e-06)

# Hyperparameters to test
paramGrid = ParamGridBuilder().addGrid(lr.elasticNetParam, [0.0,0.5,1.0])\
                            .addGrid(lr.regParam,[0.1,0.5,1.0])\
                            .build()

# K-fold cross validation 
crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=f1_evaluator,
                          numFolds=3,seed=seed)  # use 3+ folds in practice

# Put steps in a pipeline
pipeline = Pipeline(stages=[crossval])


In [None]:
# train model
pipModel = pipeline.fit(Xtrain)

In [None]:
# predict
predictions = pipModel.transform(Xtest)

In [None]:
# get hyperparameters for best model
cvModel = pipModel.stages[-1]
bestParams = cvModel.extractParamMap()
# print ('Best Param (regParam): ', bestModel._java_obj.getRegParam())
bestModel = cvModel.bestModel
# feature_importance = bestModel.featureImportances
# num_trees = bestModel.getNumTrees
# tree_weights = bestModel.treeWeights
# trees =  bestModel.trees
# # save model
# bestModel.save('cpv1600_rf')

In [None]:
s3 = boto3.client('s3')
modelPath = 'full_std_idf_logistic_model/data/_SUCCESS'
if check(s3, 'gdc-emr0', modelPath) == False:
    print("saving std-idf Logistic Regression model...")
    bestModel.save('s3://gdc-emr0/full_std_idf_logistic_model')
else:
    print(modelPath+" already exists...")

In [None]:
f1_score = f1_evaluator.evaluate(predictions)
acc_score = acc_evaluator.evaluate(predictions)
precision_score = precision_evaluator.evaluate(predictions)
recall_score = recall_evaluator.evaluate(predictions)

In [None]:
print(f1_score)
print(acc_score)
print(precision_score)
print(recall_score)

In [None]:
coeff = bestModel.coefficientMatrix
intercepts = bestModel.interceptVector

In [None]:
coeff_np = coeff.toArray()
intercepts_np = intercepts.values

In [None]:
full_feature_columns = mirna_feature_columns+cpv_feature_columns
coeff_df = pd.DataFrame(data=coeff_np,
                        index=label_dict['disease_type_cpv_idx'],
                        columns=full_feature_columns)  

In [None]:
coeff_df['intercept'] = pd.Series(intercepts_np, index=coeff_df.index)

In [107]:
from io import StringIO
import boto3

csv_buffer = StringIO()
coeff_df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object("gdc-emr0", 'full_logistic_coeff.csv').put(Body=csv_buffer.getvalue())

{'ETag': '"fa49bb0788163cadb8640fe560a4afd4"', 'ResponseMetadata': {'HTTPStatusCode': 200, 'RetryAttempts': 0, 'HostId': 'cWXl2nJRH29xqUvxQ4CC51Csjnlfa8XbEI9t4PYCSjBTgAnIZgVcZ8NJ4aFpCq36nZ+J7Kpw9jU=', 'HTTPHeaders': {'etag': '"fa49bb0788163cadb8640fe560a4afd4"', 'x-amz-id-2': 'cWXl2nJRH29xqUvxQ4CC51Csjnlfa8XbEI9t4PYCSjBTgAnIZgVcZ8NJ4aFpCq36nZ+J7Kpw9jU=', 'date': 'Mon, 26 Nov 2018 09:43:16 GMT', 'content-length': '0', 'x-amz-request-id': '4C96E9183A9FB755', 'server': 'AmazonS3'}, 'RequestId': '4C96E9183A9FB755'}}

In [None]:
df2.join(df1,predictions.col("predictedLabel")!=predictions.col("disease_type_cpv"),"left")

In [None]:
high_scores1['is_score_chased'] = np.where(predictions['predictedLabel']!=predictions['disease_type_cpv'])

In [None]:
# to save
# df.rdd.saveAsPickleFile(filename)
# to load
#pickleRdd = sc.pickleFile(filename).collect()
# df2 = spark.createDataFrame(pickleRdd)

### Random Forest 

In [43]:
from pyspark.ml.classification import RandomForestClassifier
# Random Forest Classifier
rf = RandomForestClassifier(cacheNodeIds=True, featuresCol='scaledFeatures',labelCol ='disease_type_cpv_idx',\
                           seed=seed,maxDepth=3)


# # Hyperparameters to test
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [200,500])\
            .build()

# # K-fold cross validation 
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=f1_evaluator,
                          numFolds=2,seed=seed)  # use 3+ folds in practice

# # Put steps in a pipeline
pipeline = Pipeline(stages=[crossval])

In [44]:
# train model
pipModel = pipeline.fit(Xtrain)

In [45]:
# predict
predictions = pipModel.transform(Xtest)

In [46]:
# get hyperparameters for best model
cvModel = pipModel.stages[-1]
bestParams = cvModel.extractParamMap()
# print ('Best Param (regParam): ', bestModel._java_obj.getRegParam())
bestModel = cvModel.bestModel
feature_importance = bestModel.featureImportances
num_trees = bestModel.getNumTrees
tree_weights = bestModel.treeWeights
trees =  bestModel.trees
# # save model
# bestModel.save('cpv1600_rf')

In [48]:
s3 = boto3.client('s3')
modelPath = 'full_rf_model/data/_SUCCESS'
if check(s3, 'gdc-emr0', modelPath) == False:
    print("saving Random Forest model...")
    bestModel.save('s3://gdc-emr0/full_rf_model')
else:
    print(modelPath+" already exists...")

saving Random Forest model...

In [50]:
f1_score = f1_evaluator.evaluate(predictions)
acc_score = acc_evaluator.evaluate(predictions)
precision_score = precision_evaluator.evaluate(predictions)
recall_score = recall_evaluator.evaluate(predictions)

In [51]:
print(f1_score)
print(acc_score)
print(precision_score)
print(recall_score)

0.31670977514005116
0.38067254325824357
0.48265841425755696
0.3806725432582435

In [67]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=label_dict['disease_type_cpv_idx'])
predictions = labelConverter.transform(predictions)

In [108]:
labelConverter.save('s3://gdc-emr0/disease_type_full_index2string')

In [69]:
predictions.select('predictedLabel','disease_type_cpv').show(100)


+--------------------+--------------------+
|      predictedLabel|    disease_type_cpv|
+--------------------+--------------------+
|              Normal|Kidney Renal Papi...|
|Colon Adenocarcinoma|Rectum Adenocarci...|
|Pancreatic Adenoc...|        Mesothelioma|
|Uterine Corpus En...|Uterine Corpus En...|
|Lung Squamous Cel...|Lung Squamous Cel...|
|Head and Neck Squ...|Head and Neck Squ...|
|Testicular Germ C...|Testicular Germ C...|
|Ovarian Serous Cy...|Ovarian Serous Cy...|
|Breast Invasive C...|Breast Invasive C...|
|Kidney Renal Clea...|Kidney Renal Clea...|
|Breast Invasive C...|Breast Invasive C...|
|Lung Squamous Cel...|Lung Squamous Cel...|
|Breast Invasive C...|Lung Squamous Cel...|
|Kidney Renal Clea...|Kidney Renal Clea...|
|   Thyroid Carcinoma|   Thyroid Carcinoma|
|Uterine Corpus En...|Cervical Squamous...|
|Lung Squamous Cel...|Lung Squamous Cel...|
|Bladder Urothelia...|Bladder Urothelia...|
|             Sarcoma|             Sarcoma|
|Head and Neck Squ...|Bladder Ur

In [49]:
feature_rd_df = pd.DataFrame(feature_importance, columns=['feature_importance'])
# cpv_feature_columns+mirna_feature_columns
from io import StringIO

csv_buffer = StringIO()
feature_rd_df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object('gdc-emr0', 'rf_feature_impt.csv').put(Body=csv_buffer.getvalue())

DataFrame constructor not properly called!
Traceback (most recent call last):
  File "/usr/local/lib64/python3.4/site-packages/pandas/core/frame.py", line 404, in __init__
    raise ValueError('DataFrame constructor not properly called!')
ValueError: DataFrame constructor not properly called!

