In [1]:
import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark as spark
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import IDF
from pyspark.ml.feature import DCT
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml import Pipeline
import itertools as it
import pyspark.sql.functions as f
from pyspark.sql.types import IntegerType

In [2]:
# TODO 1. rename column names containing '.' (GS1-279B7.1, GS1-600G8.3 CAND1.11, HY.1)
#      2. read csv sep = ',' for cpv final matrix

In [3]:
# Build (or reuse) the notebook's SparkSession.
# spark.executor.memory is configured on the builder because, per the PySpark
# docs, SparkContext.setSystemProperty only takes effect when it is invoked
# before the SparkContext is instantiated.
# NOTE: this rebinds `spark`, shadowing the `import pyspark as spark` alias.
spark = (
    SparkSession.builder
    .appName("DiseaseApp")
    .config("spark.executor.memory", "1g")
    .getOrCreate()
)

In [4]:
spark

In [None]:
%%info

In [5]:
# Seed passed to the randomSplit calls further down so the train/test split is repeatable.
seed = 0

In [6]:
# Request 1g of memory per executor through a JVM system property.
# NOTE(review): the PySpark docs state that setSystemProperty must be invoked
# before a SparkContext is instantiated. The session is created in an earlier
# cell, so this call probably has no effect on the running context - confirm.
SparkContext.setSystemProperty('spark.executor.memory', '1g')
#sc._conf.getAll()

In [7]:
def list_dataframes():
    """Return the names of all notebook-global variables bound to a Spark DataFrame."""
    from pyspark.sql import DataFrame

    frame_names = []
    for name, value in globals().items():
        if isinstance(value, DataFrame):
            frame_names.append(name)
    return frame_names

In [8]:
# Load the matrix from a tab-separated file whose first row is the header.
# - maxColumns=22400 raises the CSV parser's column limit (presumably because
#   the matrix is very wide - confirm the real column count).
# - mode="DROPMALFORMED" drops rows that fail to parse.
# - inferSchema=True lets Spark infer each column's type from the data.
# NOTE(review): the TODO cell near the top mentions sep=',' for the final
# matrix - confirm which separator this particular file uses.
path = 'CPV/cpv_final_matrix_test.csv'
df = spark.read.option("maxColumns", 22400).csv(
    path, header=True, sep = '\t',mode="DROPMALFORMED",inferSchema=True)

In [None]:
df.printSchema()

In [9]:
# Columns used as prediction targets; they are string-indexed in a later cell.
label_columns = ['sample_type', 'disease_type', 'primary_diagnosis']
# Columns that identify a row; they are separated from the feature matrix below.
# ('_c0' is presumably the unnamed first CSV column - confirm.)
identifier_columns = ['_c0','sample_id','case_id']

In [10]:
# Split the raw frame three ways: label columns, identifier columns, and
# everything that remains (the feature columns).
# NOTE: `df` is rebound to the feature-only frame, so this cell cannot be
# re-run without reloading the data first.
df_labels = df.select(*label_columns)
df_idx = df.select(*identifier_columns)
df = df.drop(*(label_columns + identifier_columns))

In [11]:
# Encode the three string label columns as numeric category indices.
# One StringIndexer per label, chained in a Pipeline so that all of them are
# fitted and applied together; each output column is named '<label>_idx'.
label_idx_columns = [name + '_idx' for name in label_columns]
indexers = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in zip(label_columns, label_idx_columns)
]
pipeline = Pipeline(stages=indexers)
df_label = pipeline.fit(df_labels).transform(df_labels)

In [None]:
df_label

In [12]:
# Get the string <-> number correspondence for every indexed label column.
# The fitted indexer's vocabulary is read from the output column's metadata
# under ['ml_attr']['vals'].
# The comprehension variable is named `field` rather than `f` so it cannot
# shadow the `pyspark.sql.functions as f` alias (under Python 2 a list
# comprehension variable leaks into the enclosing scope and would break
# later `f.` calls).
idx_suffix = '_idx'
label_key_table = {}
for idx_column in label_idx_columns:
    column_metadata = [
        field.metadata for field in df_label.schema.fields if field.name == idx_column
    ]
    # strip the '_idx' suffix so the table is keyed by the original label name
    label_name = idx_column[:-len(idx_suffix)]
    label_key_table[label_name] = column_metadata[0]['ml_attr']['vals']

In [13]:
# Report how many distinct classes each label column has.
for label_name in ('sample_type', 'disease_type', 'primary_diagnosis'):
    print("{} num classes: {}".format(label_name, len(label_key_table[label_name])))

sample_type num classes: 2
disease_type num classes: 34
primary_diagnosis num classes: 77


In [14]:
# convert features into (sparse) vectors
# Every column still present in df is packed into one vector column named 'features'.
# NOTE(review): whether the assembled vectors end up sparse or dense is not
# visible from this code - confirm if sparsity matters downstream.
assembler = VectorAssembler(inputCols=df.columns, outputCol='features')
df = assembler.transform(df)

In [15]:
# Remember the names of the individual feature columns, then keep only the
# assembled vector column.
# df.columns at this point also contains the assembler's output column
# 'features'; it is excluded so that feat_names lists only real input features.
feat_names = [name for name in df.columns if name != 'features']
df = df.select('features')

In [16]:
# test/train split: 30% test / 70% train, using the notebook-wide seed.
# NOTE(review): features, labels and identifiers are split by three independent
# randomSplit calls on three different DataFrames. Nothing in this cell ties a
# row of Xtrain to the matching row of ytrain / idx_train, and reusing the seed
# alone may not select the same rows from each frame. Confirm the alignment, or
# split one combined DataFrame and project the columns afterwards.
Xtest,Xtrain = df.randomSplit([0.3, 0.7], seed)
ytest,ytrain = df_label.randomSplit([0.3, 0.7], seed)
idx_test,idx_train = df_idx.randomSplit([0.3, 0.7], seed)

### Feature Transformation 

In [17]:
# standardization
# Configures a scaler that centres each feature (withMean=True) and scales it
# to unit standard deviation (withStd=True), reading 'features' and writing
# 'scaledFeatures'. This only builds the estimator; nothing is fitted here.
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures', withStd=True, withMean=True)

# # TF-IDF (Term frequency-inverse document frequency)
# idf = IDF(inputCol="features", outputCol="idfFeatures")

# # DCT 
# dct = DCT(inverse=False, inputCol="features", outputCol="dctFeatures")

# # Polynomial Expansion 
# polyExpansion = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")

# # Combos
# # std -> idf
# std_idf = IDF(inputCol=scaler.getOutputCol(), outputCol="stdidfFeatures")
# std_idfer = Pipeline(stages=[scaler,std_idf]) 

# # idf -> std
# idf_std = StandardScaler(inputCol=idf.getOutputCol(), outputCol='idfstdFeatures', withStd=True, withMean=True)
# idf_stder = Pipeline(stages=[idf,idf_std]) 


In [18]:
# # data-transfomer pipeline
# dct_Xtrain = dct.transform(Xtrain)

# model-fitter pipeline
# The scaler statistics are learned from the training split only.
std_model = scaler.fit(Xtrain)
# idf_model = idf.fit(Xtrain)
# std_idf_model = std_idfer.fit(Xtrain)
# idf_std_model = idf_stder.fit(Xtrain)

# model-transformer pipeline
# The train-fitted scaler is applied to both splits, adding 'scaledFeatures'.
# NOTE: Xtrain / Xtest are rebound to the transformed frames here.
Xtrain =  std_model.transform(Xtrain)
Xtest = std_model.transform(Xtest)
# interaction (can use RFormula: y ~ a + b + a:b - 1)


#### combine dataframes

In [19]:
# combine dataframes
# Features and labels share no key column, so a synthetic 'row_index' is added
# to each side, used as the join key, and dropped again afterwards.
# NOTE(review): monotonically_increasing_id() is documented as unique and
# increasing but not consecutive, and its values depend on partitioning. The
# join pairs the right rows only if both frames have identical partitioning
# and row order - confirm. Ids present on one side only are silently dropped
# by the inner join.
# NOTE: `df` and `label_df` are rebound here (train first, then test), so `df`
# no longer refers to the feature frame after this cell.
df=Xtrain.withColumn('row_index', f.monotonically_increasing_id())
label_df=ytrain.withColumn('row_index', f.monotonically_increasing_id())
XyTrain = df.join(label_df, on=["row_index"]).sort("row_index").drop("row_index")

df=Xtest.withColumn('row_index', f.monotonically_increasing_id())
label_df=ytest.withColumn('row_index', f.monotonically_increasing_id())
XyTest = df.join(label_df, on=["row_index"]).sort("row_index").drop("row_index")

In [20]:
# Column subsets: the scaled feature vector paired with one indexed label each.
sample_label_cols = ['scaledFeatures', 'sample_type_idx']
disease_label_cols = ['scaledFeatures', 'disease_type_idx']
tumor_label_cols = ['scaledFeatures', 'primary_diagnosis_idx']

# Training frames, one per prediction task.
XyTrain_sample = XyTrain.select(*sample_label_cols)
XyTrain_disease = XyTrain.select(*disease_label_cols)
XyTrain_tumor = XyTrain.select(*tumor_label_cols)

# Matching test frames.
XyTest_sample = XyTest.select(*sample_label_cols)
XyTest_disease = XyTest.select(*disease_label_cols)
XyTest_tumor = XyTest.select(*tumor_label_cols)

In [21]:
# Mark the sample-type frames for caching; XyTrain_sample is fitted on by
# several estimators in the cells below.
XyTrain_sample.cache()
XyTest_sample.cache()

DataFrame[scaledFeatures: vector, sample_type_idx: double]

### Feature Selection

In [31]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Random-forest regressor on the indexed sample-type label.
# The notebook-wide `seed` is passed so that tree bootstrapping is repeatable.
rfr = RandomForestRegressor(featuresCol='scaledFeatures', labelCol='sample_type_idx', seed=seed)

# Hyper-parameter grid: 3 depths x 4 bin counts x 1 impurity = 12 settings.
estimatorParam = (
    ParamGridBuilder()
    .addGrid(rfr.maxDepth, [4, 6, 8])
    .addGrid(rfr.maxBins, [5, 10, 20, 40])
    .addGrid(rfr.impurity, ["variance"])
    .build()
)

# Candidate models are ranked by R^2 on the held-out fold.
evaluator = RegressionEvaluator(labelCol="sample_type_idx", predictionCol="prediction", metricName="r2")

# 12 settings x 3 folds = 36 forest fits per crossval.fit() call.
# `seed` also fixes the fold assignment so results are comparable across runs.
crossval = CrossValidator(estimator=rfr,
                          estimatorParamMaps=estimatorParam,
                          evaluator=evaluator,
                          numFolds=3,
                          seed=seed)

In [32]:
XyTrain_sample

DataFrame[scaledFeatures: vector, sample_type_idx: double]

In [33]:
# Run the cross-validated grid search on the sample-type training frame.
# NOTE: the recorded output below shows this cell ending in KeyboardInterrupt.
cvmodel = crossval.fit(XyTrain_sample)

KeyboardInterrupt: 

In [None]:
# XyTrain_sample only carries 'scaledFeatures' and 'sample_type_idx', so both
# selectors read 'scaledFeatures' (the original featuresCol="features" names a
# column that does not exist in this frame and makes fit() fail).
# NOTE(review): ChiSqSelector is documented for categorical features - confirm
# that applying it to standardized continuous values is intended.

# feature selection by top percentile (keeps the top 10% of features)
p_selector = ChiSqSelector(selectorType = 'percentile', percentile=0.1, outputCol="percentFeatures",featuresCol="scaledFeatures",labelCol="sample_type_idx")
p_selector_model = p_selector.fit(XyTrain_sample)

# feature selection by false-positive-rate threshold (p-value below 0.2)
f_selector = ChiSqSelector(selectorType = 'fpr', fpr=0.2, outputCol="fprFeatures",featuresCol="scaledFeatures",labelCol="sample_type_idx")
f_selector_model = f_selector.fit(XyTrain_sample)

In [None]:
# Report how many features each fitted selector actually kept.
# getNumTopFeatures() only returns the numTopFeatures parameter, which is not
# used by the 'percentile' / 'fpr' selector types; the real count is the length
# of the fitted model's selectedFeatures.
print("Percent Selecter:", len(p_selector_model.selectedFeatures))
print("FPR Selecter:", len(f_selector_model.selectedFeatures))

In [None]:
# Number of feature indices chosen by both the percentile and the FPR selector.
len(set(p_selector_model.selectedFeatures).intersection(f_selector_model.selectedFeatures))

In [None]:
def save(fpath,fname,obj,overwrite=False):
    """Persist a selector or model under ``fpath + '/' + fname``.

    Parameters
    ----------
    fpath : str
        Directory (or URI prefix) to save into.
    fname : str
        Name of the saved artifact inside ``fpath``.
    obj : object
        Anything exposing ``save(path)``; when ``overwrite`` is True it must
        also expose ``write()`` returning a writer with ``overwrite()`` and
        ``save(path)``.
    overwrite : bool, optional
        When False (default) ``obj.save(path)`` is called exactly as before.
        When True the artifact is written through
        ``obj.write().overwrite().save(path)`` so that re-running the notebook
        replaces an existing artifact instead of failing on it.
    """
    fullpath = fpath + '/'+ fname
    if overwrite:
        obj.write().overwrite().save(fullpath)
    else:
        obj.save(fullpath)

# def load(fpath):
#     # selector loader
#     loadedSelector = ChiSqSelector.load(chiSqSelectorPath)
#     loadedSelector.getNumTopFeatures() == selector.getNumTopFeatures()
#     # model loader
#     loadedModel = ChiSqSelectorModel.load(modelPath)
#     loadedModel.selectedFeatures == model.selectedFeatures

In [None]:
# `os` is not imported anywhere else in the notebook, so it is imported here;
# without it os.getcwd() raises NameError.
import os

# Save both selectors (estimator configuration) and both fitted models into
# the current working directory.
cwd = os.getcwd()
save(cwd,'cpv_toy_10percent_chi_selector',p_selector)
save(cwd,'cpv_toy_10percent_chi_model',p_selector_model)
save(cwd,'cpv_toy_2fpr_chi_selector',f_selector)
save(cwd,'cpv_toy_2fpr_model',f_selector_model)

In [None]:
XyTest_sample

### Logistic Regression for Cancer/No Cancer

In [None]:
from pyspark.ml.classification import LogisticRegression

# Configure an elastic-net regularised logistic regression for the binary
# sample-type label (elasticNetParam=0.8 mixes L1 and L2, weighted towards L1).
lr = LogisticRegression(
    featuresCol='scaledFeatures',
    labelCol='sample_type_idx',
    maxIter=1000,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit on the sample-type training frame.
lrModel = lr.fit(XyTrain_sample)

# Show the learned coefficients and intercept.
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))


In [None]:
# Training-set summary of the binary logistic-regression model fitted above.
trainingSummary = lrModel.summary

# Objective value after each optimiser iteration (useful to check convergence).
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Receiver-operating characteristic (as a DataFrame) and area under it,
# both computed on the training data.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Pick the decision threshold with the highest training F-Measure.
# fMeasure has one row per candidate threshold; the max is found first, then
# the row carrying that max is looked up to read its threshold.
# NOTE(review): setThreshold is called on the estimator `lr`, not on the
# already-fitted `lrModel`; presumably it only affects models fitted
# afterwards - confirm this is intended.
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']).select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)
print("setting model to best threshold:"+ str(bestThreshold))

### Logistic Regression for Multi-class

In [None]:
# Multinomial (softmax) logistic regression for the multi-class disease-type label.
mlr = LogisticRegression(maxIter=1000, regParam=0.3, elasticNetParam=0.8, family="multinomial",featuresCol='scaledFeatures',labelCol ='disease_type_idx')

# Fit the model
mlrModel = mlr.fit(XyTrain_disease)

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

# Training summary of the multinomial model itself. The original cell read
# lrModel.summary here, i.e. it re-reported the binary sample-type model.
# NOTE: a summary for multinomial models requires Spark >= 2.3.
trainingSummary = mlrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# ROC, areaUnderROC and threshold tuning exist only on the binary summary, so
# multiclass metrics are reported instead, and the binary estimator `lr` is
# left untouched.
print("F-measure by label:")
for label_index, label_f_measure in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (label_index, label_f_measure))

print("Accuracy: " + str(trainingSummary.accuracy))
print("Weighted precision: " + str(trainingSummary.weightedPrecision))
print("Weighted recall: " + str(trainingSummary.weightedRecall))
print("Weighted F-measure: " + str(trainingSummary.weightedFMeasure()))

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out sample-type split with the binary logistic-regression
# model; `predictions` was never defined in the original cell.
predictions = lrModel.transform(XyTest_sample)

# The label column in this notebook is 'sample_type_idx'; no 'label' column exists.
# NOTE: `evaluator` is rebound here, replacing the RegressionEvaluator defined
# in the feature-selection section.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="sample_type_idx", metricName="areaUnderROC")

# NOTE: despite its name, `accuracy` holds the area under the ROC curve, which
# is what BinaryClassificationEvaluator computes by default. The variable name
# is kept so that existing references keep working.
accuracy = evaluator.evaluate(predictions)

### Random Forest

In [None]:



# We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
# This will allow us to jointly choose parameters for all Pipeline stages.
# A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
# paramGrid = ParamGridBuilder() \
# #     .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
#     .addGrid(lr.regParam, [0.1, 0.01]) \
#     .build()





# crossval = CrossValidator(estimator=pipeline,
#                           estimatorParamMaps=paramGrid,
#                           evaluator=BinaryClassificationEvaluator(),
#                           numFolds=2)  # use 3+ folds in practice

# # Run cross-validation, and choose the best set of parameters.
# cvModel = crossval.fit(training)

# # Prepare test documents, which are unlabeled.
# test = spark.createDataFrame([
#     (4, "spark i j k"),
#     (5, "l m n"),
#     (6, "mapreduce spark"),
#     (7, "apache hadoop")
# ], ["id", "text"])

# # Make predictions on test documents. cvModel uses the best model found (lrModel).
# prediction = cvModel.transform(test)
# selected = prediction.select("id", "text", "probability", "prediction")
# for row in selected.collect():
#     print(row)

In [None]:
# The original cell used LinearRegression and TrainValidationSplit without
# importing them, referenced undefined `train` / `test` frames, and relied on
# the default 'features' / 'label' column names, which this notebook does not use.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

# NOTE(review): wired to the sample-type frames and columns by analogy with the
# random-forest regressor cell - confirm this is the intended target.
# NOTE: this rebinds `lr`, which previously held the LogisticRegression estimator.
lr = LinearRegression(maxIter=10, featuresCol='scaledFeatures', labelCol='sample_type_idx')

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values (2 x 2 x 3 = 12)
# and determine the best model using the evaluator.
paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .addGrid(lr.fitIntercept, [False, True])\
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\
    .build()

# In this case the estimator is simply the linear regression.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
tvs = TrainValidationSplit(estimator=lr,
                           estimatorParamMaps=paramGrid,
                           evaluator=RegressionEvaluator(labelCol='sample_type_idx'),
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(XyTrain_sample)

# Make predictions on test data. model is the model with combination of parameters
# that performed best.
model.transform(XyTest_sample)\
    .select("scaledFeatures", "sample_type_idx", "prediction")\
    .show()

In [None]:
list_dataframes()
# spark.catalog.clearCache() # Removes all cached tables from the in-memory cache.
# spark.catalog.uncacheTable('df_feat') # Removes the specified table from the in-memory cache.