### Current Process
1. Read in data --> Done

2. Custom Imputation --> Done

3. Add Binary Class --> Done, Should Add Binary Class Later

4. Summary Statistics Features --> Done

5. Wrapper Functions --> Done, Need to Test Though

6. Sklearn Pipeline Categorical Features --> One Hot Encoding Done

7. Sklearn Pipeline Numerical Features --> StandardScaler Done

8. Create Lagged Features --> Done

9. Modeling --> Currently XGBoost (Maybe Try: TensorFlow Decision Tree, TensorFlow Probability Model)

10. Model Evaluation --> Accuracy, Precision, Recall, F1, Confusion Matrix (Need to add Variable Importance Based on Variance)

11. PySpark: XGBoost Classification Feature Importance

In [None]:
# # Need to Run These in Notebook Version For Pandas UDF
! pip install pyarrow
! pip install pandas
! pip install scikit-learn
! pip install pyspark
! pip install xgboost
! pip install kaleido
! pip install EntropyHub

In [1]:
from Input_Variables.read_vars import train_data_storage, validation_data_storage, test_data_storage, \
                                      one_hot_encoding_data, \
                                      analysis_group, \
                                      daily_stats_features_lower, daily_stats_features_upper, \
                                      model_storage_location, random_seed, \
                                      time_series_lag_values_created, \
                                      evaluation_metrics_output_storage, \
                                      feature_importance_storage_location, \
                                      overall_feature_importance_plot_location

from Data_Schema.schema import Pandas_UDF_Data_Schema
from Read_In_Data.read_data import Reading_Data
from Data_Pipeline.imputation_pipeline import Date_And_Value_Imputation


from Feature_Generation.create_binary_labels import Create_Binary_Labels
from Feature_Generation.summary_stats import Summary_Stats_Features
from Feature_Generation.lag_features import Create_Lagged_Features
from Feature_Generation.time_series_feature_creation import TS_Features
from Feature_Generation.difference_features import Difference_Features

from Data_Pipeline.encoding_scaling_pipeline import Feature_Transformations

from Model_Creation.pyspark_xgboost import Create_PySpark_XGBoost

from Model_Predictions.pyspark_model_preds import Model_Predictions

from Model_Evaluation.pyspark_model_eval import Evaluate_Model

from Feature_Importance.model_feature_importance import Feature_Importance

from Model_Plots.xgboost_classification_plots import XGBoost_Classification_Plot

# General Modules

In [2]:
# Setup cell: instantiate every project helper used by the sections below.
# The statement order is kept as-is because `reading_data` must exist before
# the imputation helper is built from its `spark` attribute.

# PySpark UDF Schema Activation
pandas_udf_data_schema=Pandas_UDF_Data_Schema()

# Data Location
reading_data=Reading_Data()

# Create Binary y Variables
create_binary_labels=Create_Binary_Labels()

# Imputation (receives the `spark` attribute owned by `reading_data`)
date_and_value_imputation=Date_And_Value_Imputation(reading_data.spark)

# Features Daily Stats Module
summary_stats_features=Summary_Stats_Features()

# Features Complex
ts_features=TS_Features()

# Features Lagged Value
create_lag_features=Create_Lagged_Features()

# Features Differences
difference_features=Difference_Features()

# PySpark XGBoost Model Module
create_pyspark_xgboost=Create_PySpark_XGBoost()

# Classification Evaluation
evaluate_model=Evaluate_Model()

# Model Plots Feature Importance
xgboost_classification_plot=XGBoost_Classification_Plot()

# Feature Transformations
feature_transformations=Feature_Transformations()

# Output schema passed to the custom imputation pipeline cells
pyspark_custom_imputation_schema=pandas_udf_data_schema.custom_imputation_pyspark_schema()

# Model Predictions
model_predictions=Model_Predictions()

# Feature Importance
feature_importance=Feature_Importance()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/08 19:20:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/08 19:20:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# PySpark

### 1. PySpark: Reading In Data

#### Training

In [3]:
# Read the training split from its configured storage location and preview it.
training_df = reading_data.read_in_pyspark_data(
    data_location=train_data_storage,
)
training_df.show()



+-----+--------------------+-----+-------------------+---------------------+------------------+
|NumId|           PatientId|Value| GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|
+-----+--------------------+-----+-------------------+---------------------+------------------+
| 2159|+/WgTKHs7vvb24NIL...|121.0|2022-02-04 14:50:00| 2022-02-04T14:50:...|        2022-02-04|
| 2159|+/WgTKHs7vvb24NIL...|115.0|2022-02-04 14:55:00| 2022-02-04T14:55:...|        2022-02-04|
| 2159|+/WgTKHs7vvb24NIL...|111.0|2022-02-04 15:00:00| 2022-02-04T15:00:...|        2022-02-04|
| 2159|+/WgTKHs7vvb24NIL...| 98.0|2022-02-04 15:05:00| 2022-02-04T15:05:...|        2022-02-04|
| 2159|+/WgTKHs7vvb24NIL...| 99.0|2022-02-04 15:10:00| 2022-02-04T15:10:...|        2022-02-04|
| 2159|+/WgTKHs7vvb24NIL...|110.0|2022-02-04 15:15:00| 2022-02-04T15:15:...|        2022-02-04|
| 2159|+/WgTKHs7vvb24NIL...|111.0|2022-02-04 15:20:00| 2022-02-04T15:20:...|        2022-02-04|
| 2159|+/WgTKHs7vvb24NIL...|109.0|2022-0

                                                                                

#### Validation

In [None]:
# Read the validation split from its configured storage location and preview it.
validation_df = reading_data.read_in_pyspark_data(
    data_location=validation_data_storage,
)
validation_df.show()

#### Testing

In [None]:
# Read the testing split from its configured storage location and preview it.
testing_df = reading_data.read_in_pyspark_data(
    data_location=test_data_storage,
)
testing_df.show()

### 2. PySpark: Custom Imputation Pipeline

#### Training

In [4]:
# Output schema for the imputation pipeline on the training split.
training_custom_imputation_schema = pandas_udf_data_schema.custom_imputation_pyspark_schema()

# Run the custom imputation pipeline on the training data.
# The schema built just above is passed in (it was previously created but never
# used), so this cell no longer depends on a schema variable from another cell.
training_custom_imputation_pipeline = date_and_value_imputation.pyspark_custom_imputation_pipeline(
    df=training_df,
    output_schema=training_custom_imputation_schema,
    analysis_group=analysis_group,
)

training_custom_imputation_pipeline.show()

[Stage 8:>                                                          (0 + 1) / 1]

+-------------------+--------------------+-----+
| GlucoseDisplayTime|           PatientId|Value|
+-------------------+--------------------+-----+
|2022-04-11 13:15:00|+BPY2YsPzI4b+DwiN...|102.0|
|2022-04-11 13:20:00|+BPY2YsPzI4b+DwiN...| 93.0|
|2022-04-11 13:25:00|+BPY2YsPzI4b+DwiN...| 89.0|
|2022-04-11 13:30:00|+BPY2YsPzI4b+DwiN...| 88.0|
|2022-04-11 13:35:00|+BPY2YsPzI4b+DwiN...| 94.0|
|2022-04-11 13:40:00|+BPY2YsPzI4b+DwiN...| 99.0|
|2022-04-11 13:45:00|+BPY2YsPzI4b+DwiN...| 98.0|
|2022-04-11 13:50:00|+BPY2YsPzI4b+DwiN...| 98.0|
|2022-04-11 13:55:00|+BPY2YsPzI4b+DwiN...| 95.0|
|2022-04-11 14:00:00|+BPY2YsPzI4b+DwiN...| 87.0|
|2022-04-11 14:05:00|+BPY2YsPzI4b+DwiN...| 84.0|
|2022-04-11 14:10:00|+BPY2YsPzI4b+DwiN...| 84.0|
|2022-04-11 14:15:00|+BPY2YsPzI4b+DwiN...| 91.0|
|2022-04-11 14:20:00|+BPY2YsPzI4b+DwiN...| 99.0|
|2022-04-11 14:25:00|+BPY2YsPzI4b+DwiN...|106.0|
|2022-04-11 14:30:00|+BPY2YsPzI4b+DwiN...|103.0|
|2022-04-11 14:35:00|+BPY2YsPzI4b+DwiN...|113.0|
|2022-04-11 14:40:00

                                                                                

#### Testing

In [None]:
# Output schema for the imputation pipeline on the testing split.
testing_custom_imputation_schema = pandas_udf_data_schema.custom_imputation_pyspark_schema()

# Run the custom imputation pipeline on the testing data.
# The schema built just above is passed in (it was previously created but never
# used), so this cell no longer depends on a schema variable from another cell.
testing_custom_imputation_pipeline = date_and_value_imputation.pyspark_custom_imputation_pipeline(
    df=testing_df,
    output_schema=testing_custom_imputation_schema,
    analysis_group=analysis_group,
)

testing_custom_imputation_pipeline.show(1)

### 3. PySpark: Adding Binary Labels

#### Training

In [5]:
# Run the binary-label step on the imputed training data and preview one row.
# NOTE(review): the recorded output of this cell is a
# `java.lang.OutOfMemoryError: Java heap space` raised while calling
# `approxQuantile`. The Spark driver presumably needs more heap (or the quantile
# step a cheaper setting) before this cell can complete — confirm.
training_df_added_binary_labels=create_binary_labels.pyspark_binary_labels(df=training_custom_imputation_pipeline)

training_df_added_binary_labels.show(1)

[Stage 15:>                                                        (0 + 4) / 19]

23/05/08 19:24:47 WARN ArrowPythonRunner: Detected deadlock while completing task 2.0 in stage 15 (TID 310): Attempting to kill Python Worker
23/05/08 19:24:47 ERROR Executor: Exception in task 2.0 in stage 15.0 (TID 310)
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.lang.reflect.Array.newInstance(Array.java:78)
	at scala.reflect.ClassTag$GenericClassTag.newArray(ClassTag.scala:171)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:341)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractTraversable.toArray(Traversable.scala:108)
	at scala.collection.generic.TraversableForwarder.toArray(TraversableForwarder.scala:67)
	at scala.collection.generic.TraversableForwarder.toArray$(TraversableForwarder.scala:67)
	at scala.collection.mutable.ListBuffer.toArray(ListBuffer.scala:47)
	at org.apache.spark.sql.catalyst.util.QuantileSummaries$.org$apache$spark$sql$catalyst$util$QuantileSummaries$$compressImmut(Quanti

Py4JJavaError: An error occurred while calling o76.approxQuantile.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 15.0 failed 1 times, most recent failure: Lost task 2.0 in stage 15.0 (TID 310) (jupyter-cmonsivais-40ucsd-2eedu executor driver): java.lang.OutOfMemoryError: Java heap space
	at java.base/java.lang.reflect.Array.newInstance(Array.java:78)
	at scala.reflect.ClassTag$GenericClassTag.newArray(ClassTag.scala:171)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:341)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractTraversable.toArray(Traversable.scala:108)
	at scala.collection.generic.TraversableForwarder.toArray(TraversableForwarder.scala:67)
	at scala.collection.generic.TraversableForwarder.toArray$(TraversableForwarder.scala:67)
	at scala.collection.mutable.ListBuffer.toArray(ListBuffer.scala:47)
	at org.apache.spark.sql.catalyst.util.QuantileSummaries$.org$apache$spark$sql$catalyst$util$QuantileSummaries$$compressImmut(QuantileSummaries.scala:388)
	at org.apache.spark.sql.catalyst.util.QuantileSummaries.compress(QuantileSummaries.scala:140)
	at org.apache.spark.sql.catalyst.util.QuantileSummaries.insert(QuantileSummaries.scala:69)
	at org.apache.spark.sql.execution.stat.StatFunctions$.apply$1(StatFunctions.scala:91)
	at org.apache.spark.sql.execution.stat.StatFunctions$.$anonfun$multipleApproxQuantiles$6(StatFunctions.scala:103)
	at org.apache.spark.sql.execution.stat.StatFunctions$$$Lambda$3923/0x0000000801cf74b0.apply(Unknown Source)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$4(RDD.scala:1236)
	at org.apache.spark.rdd.RDD$$Lambda$3925/0x0000000801cf7a68.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$6(RDD.scala:1237)
	at org.apache.spark.rdd.RDD$$Lambda$3926/0x0000000801cf85b8.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.RDD$$Lambda$1637/0x00000008014ebc98.apply(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2323)
	at org.apache.spark.rdd.RDD.$anonfun$fold$1(RDD.scala:1174)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.fold(RDD.scala:1168)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$2(RDD.scala:1267)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1228)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$1(RDD.scala:1214)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.treeAggregate(RDD.scala:1214)
	at org.apache.spark.sql.execution.stat.StatFunctions$.multipleApproxQuantiles(StatFunctions.scala:103)
	at org.apache.spark.sql.DataFrameStatFunctions.approxQuantile(DataFrameStatFunctions.scala:104)
	at org.apache.spark.sql.DataFrameStatFunctions.approxQuantile(DataFrameStatFunctions.scala:115)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at java.base/java.lang.reflect.Array.newInstance(Array.java:78)
	at scala.reflect.ClassTag$GenericClassTag.newArray(ClassTag.scala:171)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:341)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at scala.collection.AbstractTraversable.toArray(Traversable.scala:108)
	at scala.collection.generic.TraversableForwarder.toArray(TraversableForwarder.scala:67)
	at scala.collection.generic.TraversableForwarder.toArray$(TraversableForwarder.scala:67)
	at scala.collection.mutable.ListBuffer.toArray(ListBuffer.scala:47)
	at org.apache.spark.sql.catalyst.util.QuantileSummaries$.org$apache$spark$sql$catalyst$util$QuantileSummaries$$compressImmut(QuantileSummaries.scala:388)
	at org.apache.spark.sql.catalyst.util.QuantileSummaries.compress(QuantileSummaries.scala:140)
	at org.apache.spark.sql.catalyst.util.QuantileSummaries.insert(QuantileSummaries.scala:69)
	at org.apache.spark.sql.execution.stat.StatFunctions$.apply$1(StatFunctions.scala:91)
	at org.apache.spark.sql.execution.stat.StatFunctions$.$anonfun$multipleApproxQuantiles$6(StatFunctions.scala:103)
	at org.apache.spark.sql.execution.stat.StatFunctions$$$Lambda$3923/0x0000000801cf74b0.apply(Unknown Source)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:196)
	at scala.collection.TraversableOnce$folder$1.apply(TraversableOnce.scala:194)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at scala.collection.TraversableOnce.foldLeft(TraversableOnce.scala:199)
	at scala.collection.TraversableOnce.foldLeft$(TraversableOnce.scala:192)
	at scala.collection.AbstractIterator.foldLeft(Iterator.scala:1431)
	at scala.collection.TraversableOnce.aggregate(TraversableOnce.scala:260)
	at scala.collection.TraversableOnce.aggregate$(TraversableOnce.scala:260)
	at scala.collection.AbstractIterator.aggregate(Iterator.scala:1431)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$4(RDD.scala:1236)
	at org.apache.spark.rdd.RDD$$Lambda$3925/0x0000000801cf7a68.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$treeAggregate$6(RDD.scala:1237)
	at org.apache.spark.rdd.RDD$$Lambda$3926/0x0000000801cf85b8.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:855)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:855)
	at org.apache.spark.rdd.RDD$$Lambda$1637/0x00000008014ebc98.apply(Unknown Source)


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 44230)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/home/jovyan/glucose-data-analysis/glucose_venv/lib/python3.10/site-packages/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/home/jovyan/glucose-data-analysis/glucose_venv/lib/python3.10/site-packages/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/home/jovyan/glucose-data-analysis/glucose

#### Testing

In [None]:
# Run the binary-label step on the imputed testing data and preview one
# untruncated row.
testing_df_added_binary_labels = create_binary_labels.pyspark_binary_labels(
    df=testing_custom_imputation_pipeline,
)

testing_df_added_binary_labels.show(1, truncate=False)

### 4. PySpark: Feature Creation

#### Training

##### Complex Features

In [None]:
# Add the difference features to the labelled training data and preview five rows.
training_df_differences = difference_features.add_difference_features(training_df_added_binary_labels)
training_df_differences.show(5)

In [None]:
# Add the chunk column used for per-chunk grouping downstream.
# NOTE(review): 288 presumably corresponds to one day of 5-minute readings
# (24 * 12) — confirm and consider moving it to the input variables.
training_df_chunks = summary_stats_features.create_chunk_col(
    training_df_differences,
    chunk_val=288,
)
training_df_chunks.show(5)

In [None]:
# training_df_poincare = training_df_chunks.groupby(['PatientId', 'Chunk']).apply(ts_features.poincare)
# training_df_poincare.show(5)

# training_df_entropy = training_df_chunks.groupby(['PatientId', 'Chunk']).apply(ts_features.entropy)
# training_df_entropy.show(5)

In [None]:
# training_df_complex_features = training_df_poincare.join(training_df_entropy,['PatientId', 'Chunk'])
# training_df_complex_features.show()

In [None]:
# training_df_sleep = ts_features.process_for_sleep(df=training_df_added_binary_labels)
# training_df_sleep.show(5)

##### Statistical Features

In [None]:
# Compute the summary-statistics features from the chunked training data.
training_features_summary_stats = summary_stats_features.pyspark_summary_statistics(
    df=training_df_chunks,
)

# TODO:
#   - merge complex features, summary stats, demographics and sleep features
#   - merge in the one-hot encoded cohort file (demographics):
#       '/cephfs/data/cohort_encoded.parquet' (gender, treatment, age category)
#   - group by PatientId and Chunk

training_features_summary_stats.show(3)

In [None]:
# #add target variable
# training_features_final_summary = summary_stats_features\
#                                     .add_lag_out_of_range(df=training_features_summary_stats, chunk_lag=1)

# training_features_final_summary.show(4)

In [None]:
# Merge these together
# training_features_summary_stats
# training_df_complex_features
# one-hot-encoding 

#### Testing

##### Complex Features

In [None]:
# Add the difference features to the labelled testing data and preview five rows.
testing_df_differences = difference_features.add_difference_features(testing_df_added_binary_labels)
testing_df_differences.show(5)

In [None]:
# Add the chunk column used for per-chunk grouping downstream.
# NOTE(review): 288 presumably corresponds to one day of 5-minute readings
# (24 * 12) — confirm and consider moving it to the input variables.
testing_df_chunks = summary_stats_features.create_chunk_col(
    testing_df_differences,
    chunk_val=288,
)
testing_df_chunks.show(5)

# testing_df_poincare = testing_df_chunks.groupby(['PatientId', 'Chunk']).apply(ts_features.poincare)
# testing_df_poincare.show(5)

# testing_df_entropy = testing_df_chunks.groupby(['PatientId', 'Chunk']).apply(ts_features.entropy)
# testing_df_entropy.show(5)

In [None]:
# testing_df_complex_features = testing_df_poincare.join(testing_df_entropy,['PatientId', 'Chunk'])
# testing_df_complex_features.show()

In [None]:
# testing_df_sleep = ts_features.process_for_sleep(df=testing_df_added_binary_labels)
# testing_df_sleep.show(5)

##### Statistical Features

In [None]:
# Compute the summary-statistics features from the chunked testing data.
testing_features_summary_stats = summary_stats_features.pyspark_summary_statistics(
    df=testing_df_chunks,
)

# TODO:
#   - merge complex features, summary stats, demographics and sleep features
#   - merge in the one-hot encoded cohort file (demographics):
#       '/cephfs/data/cohort_encoded.parquet' (gender, treatment, age category)
#   - group by PatientId and Chunk

testing_features_summary_stats.show(3)

In [None]:
# Merge these together
# testing_features_summary_stats
# testing_df_complex_features
# one-hot-encoding 

### 7. PySpark: Sklearn Regression Pipeline in PySpark

In [None]:
# Load the one-hot encoded data and keep only the join key and the three
# encoded columns used as model features.
one_hot_encoded_df = (
    reading_data
    .read_in_one_hot_encoded_data(one_hot_encoding_location=one_hot_encoding_data)
    .select('UserId', 'Sex_Encoded', 'Treatment_Encoded', 'AgeGroup_Encoded')
)

#### Training

In [None]:
# Left-join the encoded columns onto the training features
# (PatientId on the left matches UserId on the right).
training_encoded = training_features_summary_stats.join(
    one_hot_encoded_df,
    training_features_summary_stats.PatientId == one_hot_encoded_df.UserId,
    "left",
)


In [None]:
# Preview four rows of the joined training features.
training_encoded.show(4)

In [None]:
# TODO: merge training_features_summary_stats with the remaining feature sets (still to be specified)

#### Testing

In [None]:
# Left-join the encoded columns onto the testing features
# (PatientId on the left matches UserId on the right).
testing_encoded = testing_features_summary_stats.join(
    one_hot_encoded_df,
    testing_features_summary_stats.PatientId == one_hot_encoded_df.UserId,
    "left",
)

In [None]:
# Preview four rows of the joined testing features.
testing_encoded.show(4)

### 8. PySpark: Sklearn Numerical Pipeline in PySpark

In [None]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql.types import DoubleType, FloatType, LongType
from pyspark.ml import Pipeline

In [None]:
# Show one untruncated PatientId value from the joined training features.
training_encoded.select('PatientId').show(1, truncate=False)

In [None]:
# Restrict the joined training features to a single patient.
# NOTE(review): the patient identifier is hard-coded in the notebook; consider
# moving it to the input variables and confirm it may be committed.
patient_1=training_encoded.filter(training_encoded.PatientId=='8W/rpnb48OMm47W2x4FSkc7+9u2mol061DQuJoMdiK0=')

In [None]:
# Build the Spark ML stages that standard-scale every numerical column and
# assemble the scaled columns plus the encoded columns into one "features" vector.

# Column names grouped by Spark numeric type, read from the schema of `patient_1`.
double_cols = [f.name for f in patient_1.schema.fields if isinstance(f.dataType, DoubleType)]
float_cols = [f.name for f in patient_1.schema.fields if isinstance(f.dataType, FloatType)]
long_cols = [f.name for f in patient_1.schema.fields if isinstance(f.dataType, LongType)]

# All numerical feature columns, in schema order, without the label column.
# Schema order (instead of `list(set(...))`) keeps the position of each feature in
# the assembled vector identical across kernel sessions, and filtering out
# 'target' (instead of `.remove`) does not raise when that column is missing.
all_numerical = [
    f.name
    for f in patient_1.schema.fields
    if isinstance(f.dataType, (DoubleType, FloatType, LongType)) and f.name != 'target'
]

# Inputs of the final assembler: scaled numerical columns, then the encoded columns.
featureArr = ['scaled_' + f for f in all_numerical] + ['Sex_Encoded', 'Treatment_Encoded', 'AgeGroup_Encoded']

# StandardScaler operates on vector columns, so each numerical column is first
# wrapped into its own single-element vector column.
va1 = [VectorAssembler(inputCols=[f], outputCol='vec_' + f) for f in all_numerical]
ss = [
    StandardScaler(inputCol='vec_' + f, outputCol='scaled_' + f, withMean=True, withStd=True)
    for f in all_numerical
]

va2 = VectorAssembler(inputCols=featureArr, outputCol="features")

# Order matters: per-column vectors, then scalers, then the final assembler.
stages = va1 + ss + [va2]

In [None]:
# Fit the scaling/assembly stages on the full joined training features.
pipeline=Pipeline(stages=stages)

model=pipeline.fit(training_encoded)

In [None]:
# Preview three assembled feature vectors from the fitted pipeline.
model.transform(training_encoded).select('features').show(3, truncate=False)

In [None]:
# NOTE(review): this cell repeats the same transform-and-preview statement as the
# cell directly before it; one of the two can presumably be deleted — confirm.
model.transform(training_encoded).select('features').show(3, truncate=False)

In [None]:
# Fit the same stages on the single-patient subset.
# NOTE(review): this rebinds `pipeline` and `model`, replacing the objects fitted
# on `training_encoded`; which fit `model` refers to afterwards depends on the
# order in which cells were run.
pipeline=Pipeline(stages=stages)

model=pipeline.fit(patient_1)

In [None]:
# Preview three assembled feature vectors for the single-patient subset.
model.transform(patient_1).select('features').show(3, truncate=False)

In [None]:
# Build the numerical scaling stages from the summary-statistics features.
# Alternative input (joined with the encoded columns):
# training_numerical_stages=feature_transformations.numerical_scaling(df=training_encoded)
training_numerical_stages = feature_transformations.numerical_scaling(
    df=training_features_summary_stats,
)




### 9. PySpark: XGBoost Model

In [None]:
# Fit the XGBoost classifier on the summary-statistics features, using the
# scaling stages, storage location and seed configured above.
# Alternative input (joined with the encoded columns):
# xgboost_model=create_pyspark_xgboost.xgboost_classifier(ml_df=training_encoded,
#                                                         stages=training_numerical_stages,
#                                                         model_storage_location=model_storage_location,
#                                                         random_seed=random_seed)
xgboost_model = create_pyspark_xgboost.xgboost_classifier(
    ml_df=training_features_summary_stats,
    stages=training_numerical_stages,
    model_storage_location=model_storage_location,
    random_seed=random_seed,
)

### 10. PySpark: Cross Validation

### 11. PySpark: Model Predictions

In [None]:
# Score the held-out testing features with the fitted model.
# The model is fit on `training_features_summary_stats`, so predictions are made
# on the matching testing frame; scoring the training frame here would make the
# evaluation below measure fit on data the model has already seen.
# Alternative input (joined with the encoded columns):
# testing_predictions=model_predictions.create_predictions_with_model(test_df=testing_encoded,
#                                                                     model=xgboost_model)
testing_predictions = model_predictions.create_predictions_with_model(
    test_df=testing_features_summary_stats,
    model=xgboost_model,
)
testing_predictions.show(10)

### 12. PySpark: Model Evaluation

In [None]:
# Evaluate the predictions and write the metrics to the configured location.
# NOTE(review): the method is named `regression_evaluation` although the model is
# built with `xgboost_classifier` — confirm this is the intended evaluator.
model_evaluation = evaluate_model.regression_evaluation(
    testing_predictions=testing_predictions,
    eval_csv_location=evaluation_metrics_output_storage,
)

In [None]:
# Display the first rows of the evaluation result.
model_evaluation.head()

### 13. PySpark: XGBoost Classification Feature Importance

In [None]:
# Compute the feature-importance table for the fitted model, using the
# configured storage location.
feature_importance_df = feature_importance.feature_importance_accuracy_gain(
    xgboost_model=xgboost_model,
    feature_importance_storage_location=feature_importance_storage_location,
)


In [None]:
# Display the first ten rows of the feature-importance table.
feature_importance_df.head(10)

### 14. PySpark: Feature Importance Plotting

In [None]:
# Build the overall feature-importance plot from the importance table, using
# the configured plot location.
overall_feature_plot = xgboost_classification_plot.feature_overall_importance_plot(
    feature_importance_df=feature_importance_df,
    overall_importance_plot_location=overall_feature_importance_plot_location,
)


In [None]:
# Render the overall feature-importance plot.
overall_feature_plot.show()

### 15. PySpark: Local-Level Feature Importance --> SHAP Pandas UDF

In [None]:
# Add to reqs if this works
! pip install shap

In [None]:
# Inspect the last stage of the fitted pipeline model.
xgboost_model.stages[-1]

In [None]:
import shap

In [None]:
# Build a SHAP tree explainer from the last stage of the fitted pipeline model.
# NOTE(review): the last stage is presumably the Spark XGBoost model wrapper;
# `shap.TreeExplainer` may need the underlying native booster instead — confirm.
explainer = shap.TreeExplainer(xgboost_model.stages[-1])