In [51]:
import itertools as it
import os

import numpy as np
import pandas as pd
import pyspark as spark
import pyspark.sql.functions as f
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import StringIndexer,IndexToString
from pyspark.ml.feature import VectorAssembler
# from pyspark.ml.feature import IDF
# from pyspark.ml.feature import DCT
# from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession

In [2]:
# TODO 1. rename column names containing '.' (GS1-279B7.1, GS1-600G8.3, CAND1.11, HY.1)
#      2. read csv with sep = ',' for the cpv final matrix

In [3]:
# Create (or reuse) the notebook's SparkSession. This rebinds the name `spark`,
# which the import cell had bound to the pyspark package.
# Executor memory is requested on the builder because Spark only honours
# spark.executor.memory when it is set before the SparkContext is instantiated.
spark = (
    SparkSession.builder
    .appName("SimpleApp")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

In [4]:
# echo the session object as this cell's output
spark

In [None]:
%%info

In [5]:
# seed passed to randomSplit, CrossValidator and RandomForestClassifier in later cells
seed = 0

In [6]:
# Request 2g of executor memory through a JVM system property.
# NOTE(review): the SparkSession is created in an earlier cell; Spark's documentation says
# this property must be set before the SparkContext is instantiated, so this call may have
# no effect here — TODO confirm and move it ahead of session creation if so.
SparkContext.setSystemProperty('spark.executor.memory', '2g')
#sc._conf.getAll()

In [7]:
# helper: names of every module-level variable currently bound to a Spark DataFrame
def list_dataframes():
    """Return the names of all global variables whose value is a pyspark DataFrame."""
    from pyspark.sql import DataFrame

    frame_names = []
    for var_name, value in globals().items():
        if isinstance(value, DataFrame):
            frame_names.append(var_name)
    return frame_names

In [8]:
# load the tab-separated matrix (header row, DROPMALFORMED mode, inferred column types)
path = 'CPV/cpv_final_matrix_test.csv'
df = (
    spark.read
    .options(maxColumns=22400, header=True, sep='\t', mode="DROPMALFORMED", inferSchema=True)
    .csv(path)
)

In [None]:
# print the inferred schema of the loaded matrix
# NOTE(review): the reader allows up to 22400 columns, so this output can be very long
df.printSchema()

In [15]:
# partition the columns into labels, identifiers and (everything else) features
label_columns = ['sample_type', 'disease_type', 'primary_diagnosis']
identifier_columns = ['_c0','sample_id','case_id']
feature_columns = list(filter(
    lambda column: column not in label_columns and column not in identifier_columns,
    df.columns,
))

In [22]:
# pack every feature column into a single vector column named 'features',
# then discard the individual source columns
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)
df = assembler.transform(df).drop(*feature_columns)

In [28]:
# map the string label columns onto numeric index columns
# names of the generated index columns: '<label>_idx'
label_idx_columns = ['{}_idx'.format(name) for name in label_columns]

# one StringIndexer per label column, ordering labels by descending frequency
labelIndexer = [
    StringIndexer(inputCol=source, outputCol=target,
                  handleInvalid="error", stringOrderType="frequencyDesc")
    for source, target in zip(label_columns, label_idx_columns)
]
# a Pipeline lets the three indexers be fitted and applied together
pipeline = Pipeline(stages=labelIndexer)
# replace df with a copy that carries the three numeric '<label>_idx' columns
df = pipeline.fit(df).transform(df)

# index column name -> list of original string labels (position == numeric index)
label_dict = dict(
    (field.name, field.metadata["ml_attr"]["vals"])
    for field in df.schema.fields
    if field.name.endswith("_idx")
)

### Binary Logistic Regression - cancer/no-cancer classification 

In [139]:
# random split: weight 0.3 for the test set, 0.7 for the training set
Xtest, Xtrain = df.randomSplit(weights=[0.3, 0.7], seed=seed)

In [140]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression

# centre and scale every feature before fitting
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=True,
    withStd=True,
)

# elastic-net regularised binomial logistic regression on the sample_type label
lr = LogisticRegression(
    featuresCol='scaledFeatures',
    labelCol='sample_type_idx',
    family='binomial',
    maxIter=100,
    regParam=0.4,
    elasticNetParam=0.5,
    tol=1e-06,
    aggregationDepth=3,
)

# candidate regularisation strengths
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.4]).build()

# cross validation over the grid, scored by area under the ROC curve
crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(
        rawPredictionCol="rawPrediction",
        labelCol="sample_type_idx",
        metricName="areaUnderROC",
    ),
    numFolds=2,  # use 3+ folds in practice
    seed=seed,
)

# scaling followed by the cross-validated classifier
pipeline = Pipeline(stages=[scaler, crossval])


In [141]:
# fit the pipeline (scaler, then cross-validated logistic regression) on the training split
pipModel = pipeline.fit(Xtrain)

In [164]:
# inspect the winning model from cross validation (last stage of the fitted pipeline)
cvModel = pipModel.stages[-1]
bestModel = cvModel.bestModel
# cvModel.extractParamMap() would return the CrossValidatorModel's own parameters, not
# the winning hyperparameters. avgMetrics[i] is the mean cross-validation metric for
# paramGrid[i]; the metric is areaUnderROC (larger is better), so the best entry is the argmax.
bestParams = paramGrid[int(np.argmax(cvModel.avgMetrics))]
# print ('Best Param (regParam): ', bestParams[lr.regParam])
coeff = bestModel.coefficients
intercept = bestModel.intercept
# save model
# bestModel.save('cpv1600_logistic')

NameError: name 'pipModel' is not defined

In [145]:
# apply the fitted pipeline to the held-out test split
predictions = pipModel.transform(Xtest)

In [146]:
# translate the numeric 'prediction' column back into the original sample_type strings
labelConverter = IndexToString(
    labels=label_dict['sample_type_idx'],
    inputCol="prediction",
    outputCol="predictedLabel",
)
predictions = labelConverter.transform(predictions)

In [147]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# area under the ROC curve of the test-set predictions
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="sample_type_idx",
    rawPredictionCol="rawPrediction",
)
roc = evaluator.evaluate(predictions)
print(roc)

0.851932584269663


In [55]:
# predictions.select("sample_type","predictedLabel","sample_type_idx","prediction","rawPrediction","probability").show(2)

DataFrame[_c0: string, sample_id: string, sample_type: string, disease_type: string, primary_diagnosis: string, case_id: string, features: vector, sample_type_idx: double, disease_type_idx: double, primary_diagnosis_idx: double, scaledFeatures: vector, rawPrediction: vector, probability: vector, prediction: double, predictedLabel: string]

### Random Forest 

In [171]:
# random split: weight 0.3 for the test set, 0.7 for the training set
Xtest, Xtrain = df.randomSplit(weights=[0.3, 0.7], seed=seed)

In [170]:
from pyspark.ml.classification import RandomForestClassifier
# centre and scale every feature before fitting
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=True,
    withStd=True,
)

# random forest of 200 trees predicting the sample_type label
rf = RandomForestClassifier(
    featuresCol='scaledFeatures',
    labelCol='sample_type_idx',
    numTrees=200,
    cacheNodeIds=True,
    seed=seed,
)

# candidate maximum tree depths
paramGrid = ParamGridBuilder().addGrid(rf.maxDepth, [10, 20]).build()

# cross validation over the grid, scored by area under the ROC curve
crossval = CrossValidator(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(
        rawPredictionCol="rawPrediction",
        labelCol="sample_type_idx",
        metricName="areaUnderROC",
    ),
    numFolds=2,  # use 3+ folds in practice
    seed=seed,
)

# scaling followed by the cross-validated classifier
pipeline = Pipeline(stages=[scaler, crossval])

# maps numeric predictions back to the original sample_type strings
labelConverter = IndexToString(
    labels=label_dict['sample_type_idx'],
    inputCol="prediction",
    outputCol="predictedLabel",
)


In [172]:
# fit the pipeline (scaler, then cross-validated random forest) on the training split
pipModel = pipeline.fit(Xtrain)

In [173]:
# inspect the winning random forest from cross validation (last stage of the fitted pipeline)
cvModel = pipModel.stages[-1]
bestModel = cvModel.bestModel
# cvModel.extractParamMap() would return the CrossValidatorModel's own parameters, not
# the winning hyperparameters. avgMetrics[i] is the mean cross-validation metric for
# paramGrid[i]; the metric is areaUnderROC (larger is better), so the best entry is the argmax.
bestParams = paramGrid[int(np.argmax(cvModel.avgMetrics))]
# print ('Best Param (maxDepth): ', bestParams[rf.maxDepth])
feature_importance = bestModel.featureImportances
tree_weights = bestModel.treeWeights
trees = bestModel.trees
# save model
# bestModel.save('cpv1600_random_forest')

In [178]:
# apply the fitted pipeline to the held-out test split
predictions = pipModel.transform(Xtest)

In [179]:
# translate the numeric 'prediction' column back into the original sample_type strings
labelConverter = IndexToString(
    labels=label_dict['sample_type_idx'],
    inputCol="prediction",
    outputCol="predictedLabel",
)
predictions = labelConverter.transform(predictions)

In [180]:
# area under the ROC curve of the test-set predictions
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="sample_type_idx",
    rawPredictionCol="rawPrediction",
)
roc = evaluator.evaluate(predictions)
print(roc)

0.8918651685393256


### Feature Selection

In [None]:
# Both selectors are fitted on Xtrain, the training split produced by randomSplit.

# feature selection by top percentile (percentile=0.1)
p_selector = ChiSqSelector(selectorType='percentile', percentile=0.1, outputCol="percentFeatures",
                           featuresCol="features", labelCol="sample_type_idx")
p_selector_model = p_selector.fit(Xtrain)

# feature selection by false-positive-rate threshold (fpr=0.2)
f_selector = ChiSqSelector(selectorType='fpr', fpr=0.2, outputCol="fprFeatures",
                           featuresCol="features", labelCol="sample_type_idx")
f_selector_model = f_selector.fit(Xtrain)

In [None]:
# getNumTopFeatures() only reports the numTopFeatures parameter, which the 'percentile'
# and 'fpr' selector types do not use; count the features each fitted model actually kept.
print("Percent Selecter:", len(p_selector_model.selectedFeatures))
print("FPR Selecter:", len(f_selector_model.selectedFeatures))

In [None]:
# number of feature indices chosen by both selectors
len(set(p_selector_model.selectedFeatures).intersection(f_selector_model.selectedFeatures))

In [None]:
def save(fpath,fname,obj):
    """Persist ``obj`` at ``fpath``/``fname`` by calling its ``save`` method.

    fpath: directory part of the destination (string).
    fname: name appended to ``fpath`` after a '/' separator (string).
    obj:   any object exposing ``save(path)``.
    """
    # selector saver
    destination = '/'.join((fpath, fname))
    obj.save(destination)

# def load(fpath):
#     # selector loader
#     loadedSelector = ChiSqSelector.load(chiSqSelectorPath)
#     loadedSelector.getNumTopFeatures() == selector.getNumTopFeatures()
#     # model loader
#     loadedModel = ChiSqSelectorModel.load(modelPath)
#     loadedModel.selectedFeatures == model.selectedFeatures

In [None]:
# persist both selectors and their fitted models under the current working directory
# (needs the `os` module, which the notebook's import cell must provide)
cwd = os.getcwd()
for artefact_name, artefact in [
    ('cpv_toy_10percent_chi_selector', p_selector),
    ('cpv_toy_10percent_chi_model', p_selector_model),
    ('cpv_toy_2fpr_chi_selector', f_selector),
    ('cpv_toy_2fpr_model', f_selector_model),
]:
    save(cwd, artefact_name, artefact)

### Logistic Regression for Cancer/No Cancer

In [None]:
# Training summary for the binary (cancer / no-cancer) logistic regression.
# No earlier cell defines lrModel, so fit the configured scaler + logistic regression
# on the training split here and take the fitted classifier (last pipeline stage).
lrModel = Pipeline(stages=[scaler, lr]).fit(Xtrain).stages[-1]
trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the estimator's threshold to the one that maximizes F-Measure on the training data
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']).select('threshold').head()['threshold']
lr.setThreshold(bestThreshold)
print("setting model to best threshold:"+ str(bestThreshold))

### Logistic Regression for Multi-class

In [None]:
# Multinomial logistic regression predicting the multi-class disease_type label
mlr = LogisticRegression(maxIter=1000, regParam=0.3, elasticNetParam=0.8, family="multinomial",
                         featuresCol='scaledFeatures', labelCol='disease_type_idx')

# Fit on the training split. The scaler stage supplies the 'scaledFeatures' column that
# mlr reads; the fitted classifier is the last stage of the fitted pipeline.
mlrModel = Pipeline(stages=[scaler, mlr]).fit(Xtrain).stages[-1]

# Print the coefficients and intercepts for logistic regression with multinomial family
print("Multinomial coefficients: " + str(mlrModel.coefficientMatrix))
print("Multinomial intercepts: " + str(mlrModel.interceptVector))

# Extract the training summary of the multinomial model fitted above
trainingSummary = mlrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# ROC, areaUnderROC and threshold tuning exist only on binary summaries, so report
# the multi-class training metrics instead.
print("accuracy: " + str(trainingSummary.accuracy))
print("weighted F-Measure: " + str(trainingSummary.weightedFMeasure()))