In [1]:
import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer,IndexToString
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# from pyspark.ml.feature import IDF
# from pyspark.ml.feature import DCT
# from pyspark.ml.feature import PolynomialExpansion
# from pyspark.ml.feature import ChiSqSelector
from pyspark.ml import Pipeline
import itertools as it
import pyspark.sql.functions as f

In [2]:
# TODO 1. rename column names containing '.' (GS1-279B7.1, GS1-600G8.3 CAND1.11, HY.1)
#      2. read csv sep = ',' for cpv final matrix

In [26]:
spark = SparkSession.builder.appName("VisualApp").getOrCreate()

In [27]:
spark

In [None]:
%%info

In [5]:
seed = 0

In [6]:
SparkContext.setSystemProperty('spark.executor.memory', '7g')
#sc._conf.getAll()

In [30]:
SparkContext

pyspark.context.SparkContext

In [7]:
# helper function to get all stored variables
def list_dataframes():
    from pyspark.sql import DataFrame
    return [k for (k, v) in globals().items() if isinstance(v, DataFrame)]

In [8]:
from botocore.exceptions import ClientError

def check(s3, bucket, key):
    try:
        s3.head_object(Bucket=bucket, Key=key)
    except ClientError as e:
        return int(e.response['Error']['Code']) != 404
    return True

ModuleNotFoundError: No module named 'botocore'

In [9]:
def getTrainingMetrics(trainingSummary,printout=True):
    # for multiclass, we can inspect metrics on a per-label basis
#     print("False positive rate by label:")
#     for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
#         print("label %d: %s" % (i, rate))

#     print("True positive rate by label:")
#     for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
#         print("label %d: %s" % (i, rate))

#     print("Precision by label:")
#     for i, prec in enumerate(trainingSummary.precisionByLabel):
#         print("label %d: %s" % (i, prec))

#     print("Recall by label:")
#     for i, rec in enumerate(trainingSummary.recallByLabel):
#         print("label %d: %s" % (i, rec))

    print("F-measure by label:")
    for i, f in enumerate(trainingSummary.fMeasureByLabel()):
        print("label %d: %s" % (i, f))

    accuracy = trainingSummary.accuracy
    falsePositiveRate = trainingSummary.weightedFalsePositiveRate
    truePositiveRate = trainingSummary.weightedTruePositiveRate
    fMeasure = trainingSummary.weightedFMeasure()
    precision = trainingSummary.weightedPrecision
    recall = trainingSummary.weightedRecall
    if printout is True:
        print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
          % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))
    return {"accuracy": accuracy, "fpr": falsePositiveRate, "tpr": truePositiveRate, "fmeasure": fMeasure, \
            "precision": precision, "recall": recall}

In [10]:
# read data file
mirna_path = 'gdc-emr0/mirna_filtered_matrix.csv'
cpv_path = 'gdc-emr0/cpv_filtered_matrix.csv'
mrna_path = 'gdc-emr0/mrna_filtered_matrix.csv'
# read mirna
df_mirna = spark.read.option("maxColumns", 22400).csv(
    mirna_path, header=True, sep = ',',mode="DROPMALFORMED",inferSchema=True)
# read cpv
df_cpv = spark.read.option("maxColumns", 22400).csv(
    cpv_path, header=True, sep = ',',mode="DROPMALFORMED",inferSchema=True)
# # read mrna
# df_mrna = spark.read.option("maxColumns", 22400).csv(
#     mrna_path, header=True, sep = ',',mode="DROPMALFORMED",inferSchema=True)

In [11]:
df_cpv = df_cpv.toDF(*(c.replace('.', '_') for c in df_cpv.columns))
# df_mrna = df_mrna.toDF(*(c.replace('.', '_') for c in df_mrna.columns))

In [12]:
# group columns by label, identifier and feature
label_columns = ['sample_type', 'disease_type', 'primary_diagnosis']
mirna_identifier_columns = ['sample_id','case_id']
mirna_feature_columns = [x for x in df_mirna.columns if x not in (label_columns+mirna_identifier_columns)]

In [13]:
# group columns by label, identifier and feature
cpv_identifier_columns= ['_c0','sample_id','case_id']
cpv_feature_columns = [x for x in df_cpv.columns if x not in (label_columns+cpv_identifier_columns)]
df_cpv = df_cpv.withColumnRenamed("sample_type", "sample_type_cpv").withColumnRenamed("disease_type", "disease_type_cpv").withColumnRenamed("primary_diagnosis","primary_diagnosis_cpv").withColumnRenamed("case_id", "case_id_cpv")
cpv_label_columns = ['sample_type_cpv', 'disease_type_cpv', 'primary_diagnosis_cpv']
cpv_identifier_columns=['_c0','sample_id','case_id_cpv']

In [None]:
# # group columns by label, identifier and feature
# # cpv_label_columns = ['sample_type', 'disease_type', 'primary_diagnosis']
# mrna_identifier_columns= ['_c0','sample_id','case_id']
# mrna_feature_columns = [x for x in df_mrna.columns if x not in (label_columns+mrna_identifier_columns)]

In [14]:
# convert features into (sparse) vectors
# mirna
assembler = VectorAssembler(inputCols=mirna_feature_columns, outputCol='features_mirna')
df_mirna = assembler.transform(df_mirna)
df_mirna=df_mirna.drop(*mirna_feature_columns)
# eventually we should store/load from HDFS

In [15]:
# cpv
assembler = VectorAssembler(inputCols=cpv_feature_columns, outputCol='features_cpv')
df_cpv = assembler.transform(df_cpv)
df_cpv = df_cpv.drop(*cpv_feature_columns)

In [16]:
df = df_cpv.join(df_mirna, on=['sample_id'], how='inner')

In [19]:
sc

''

In [31]:
from pyspark.mllib.stat import Statistics

seriesX = SparkContext.parallelize([1.0, 2.0, 3.0, 3.0, 5.0])  # a series
# seriesY must have the same number of partitions and cardinality as seriesX
seriesY = SparkContext.parallelize([11.0, 22.0, 33.0, 33.0, 555.0])

# # Compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
# # If a method is not specified, Pearson's method will be used by default.
# print("Correlation is: " + str(Statistics.corr(seriesX, seriesY, method="pearson")))

# data = sc.parallelize(
#     [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([5.0, 33.0, 366.0])]
# )  # an RDD of Vectors

# calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
# If a method is not specified, Pearson's method will be used by default.
# df_mirna= Statistics.corr(df_mirna.rdd, method="pearson")

TypeError: parallelize() missing 1 required positional argument: 'c'

In [29]:
spark

In [None]:
# convert string labels to numerical labels
# keep track of the column names of numerical labels
label_idx_columns = [s + '_idx' for s in cpv_label_columns]

# declare indexers for 3 columns
labelIndexer = [StringIndexer(inputCol=column, outputCol=column+'_idx',handleInvalid="error",
                              stringOrderType="frequencyDesc") for column in cpv_label_columns ]
# pipeline is needed to process a list of 3 labels
pipeline = Pipeline(stages=labelIndexer)
# transform 3 label columns from string to number catagoies 
df = pipeline.fit(df).transform(df)

# create dictionary containing 3 label lists
label_dict = {c.name: c.metadata["ml_attr"]["vals"]
for c in df.schema.fields if c.name.endswith("_idx")}

In [25]:
# full-feature
full_feature_columns=['features_mirna','features_cpv']
assembler = VectorAssembler(inputCols=full_feature_columns, outputCol='full_features')
df = assembler.transform(df)
# df = df.drop(*cpv_feature_columns)

In [None]:
import numpy as np

from pyspark.mllib.stat import Statistics

mat = sc.parallelize(
    [np.array([1.0, 10.0, 100.0]), np.array([2.0, 20.0, 200.0]), np.array([3.0, 30.0, 300.0])]
)  # an RDD of Vectors

# Compute column summary statistics.
summary = Statistics.colStats(mat)
print(summary.mean())  # a dense vector containing the mean value for each column
print(summary.variance())  # column-wise variance
print(summary.numNonzeros())  # number of nonzeros in each column

In [18]:
# Standardization 
scaler = StandardScaler(inputCol='full_features', outputCol='scaledFeatures', withStd=True, withMean=True)

# # # Convert indexed labels back to original labels
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=label_dict['disease_type_cpv_idx'])

In [19]:
# Evaluators
f1_evaluator=MulticlassClassificationEvaluator(predictionCol="prediction",labelCol='disease_type_cpv_idx', metricName='f1')
acc_evaluator=MulticlassClassificationEvaluator(predictionCol="prediction",labelCol='disease_type_cpv_idx', metricName='accuracy')
precision_evaluator=MulticlassClassificationEvaluator(predictionCol="prediction",labelCol='disease_type_cpv_idx', metricName='weightedPrecision')
recall_evaluator=MulticlassClassificationEvaluator(predictionCol="prediction",labelCol='disease_type_cpv_idx', metricName='weightedRecall')


In [20]:
# test/train split 
Xtest,Xtrain = df.randomSplit([0.3, 0.7], seed)

In [None]:
Xtest_sample_id = Xtest.select('sample_id').collect()

In [None]:
s3 = boto3.client('s3')
stdmodelPath = 'full_std_model'
    print("loading StandardScalar model...")
    stdmodel = StandardScalerModel.load(stdmodelPath)
# if check(s3, 'gdc-emr0', stdmodelPath) == False:
#     print("saving StandardScalar model...")
#     stdmodel = scaler.fit(Xtrain)
#     stdmodel.save('s3://gdc-emr0/full_std_model')
# else:
#     from pyspark.ml.feature import StandardScalerModel
#     print("loading StandardScalar model...")
#     stdmodel = StandardScalerModel.load(stdmodelPath)

In [None]:
Xtrain = stdmodel.transform(Xtrain)
Xtest = stdmodel.transform(Xtest)

In [None]:
# to save
# df.rdd.saveAsPickleFile(filename)
# to load
#pickleRdd = sc.pickleFile(filename).collect()
# df2 = spark.createDataFrame(pickleRdd)

### Random Forest 

In [None]:
from pyspark.ml.classification import RandomForestClassifier
# Random Forest Classifier
rf = RandomForestClassifier(cacheNodeIds=True, featuresCol='scaledFeatures',labelCol ='disease_type_cpv_idx', numTrees=1000,\
                           seed=seed)


# # Hyperparameters to test
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [5,7,9])\
            .build()

# # K-fold cross validation 
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=f1_evaluator,
                          numFolds=5,seed=seed)  # use 3+ folds in practice

# # Put steps in a pipeline
pipeline = Pipeline(stages=[crossval])

In [None]:
# train model
pipModel = pipeline.fit(Xtrain)

In [None]:
# predict
predictions = pipModel.transform(Xtest)

In [None]:
# get hyperparameters for best model
cvModel = pipModel.stages[-1]
bestParams = cvModel.extractParamMap()
# print ('Best Param (regParam): ', bestModel._java_obj.getRegParam())
bestModel = cvModel.bestModel
feature_importance = bestModel.featureImportances
num_trees = bestModel.getNumTrees
tree_weights = bestModel.treeWeights
trees =  bestModel.trees
# # save model
# bestModel.save('cpv1600_rf')

In [None]:
s3 = boto3.client('s3')
modelPath = 'full_rf_model/data/_SUCCESS'
if check(s3, 'gdc-emr0', stdmodelPath) == False:
    print("saving Random Forest model...")
    bestModel.save('s3://gdc-emr0/full_rf_model')
else:
    print(modelPath+" already exists...")

In [None]:
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=label_dict['disease_type_cpv_idx'])
predictions = labelConverter.transform(predictions)

In [None]:
feature_rd_df = pd.DataFrame(feature_importance, columns=['feature_importance'])
# cpv_feature_columns+mirna_feature_columns
from io import StringIO

csv_buffer = StringIO()
feature_rd_df.to_csv(csv_buffer)
s3_resource = boto3.resource('s3')
s3_resource.Object('gdc-emr0', 'rf_feature_impt.csv').put(Body=csv_buffer.getvalue())