### Current Process
1. Read in data --> Done

2. Custom Imputation --> Done

3. Add Binary Class --> Done, Should Add Binary Class Later

4. Summary Statistics Features --> Done

5. Wrapper Functions --> Done, Need to Test Though

6. Sklearn Pipeline Categorical Features --> One Hot Encoding Done

7. Sklearn Pipeline Numerical Features --> StandardScaler Done

8. Create Lagged Features --> Done

9. Modeling --> Currently XgBoost, (Maybe Try: TensorFlow Decision Tree, TensorFlow Probability Model)

10. Model Evaluation --> Accuracy, Precision, Recall, F1, Confusion Matrix (Need to add Variable Importance Based on Variance)

11. PySpark: XGBoost Classification Feature Importance

In [1]:
# # Need to Run These in Notebook Version For Pandas UDF
! pip install pyarrow
! pip install pandas
! pip install scikit-learn
! pip install pyspark
! pip install xgboost
! pip install kaleido

Collecting pyspark
  Using cached pyspark-3.4.0-py2.py3-none-any.whl
[31mERROR: Wheel 'pyspark' located at /home/jovyan/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327/pyspark-3.4.0-py2.py3-none-any.whl is invalid.[0m[31m


In [24]:
from Input_Variables.read_vars import train_data_storage, validation_data_storage, test_data_storage, \
                                      analysis_group, \
                                      daily_stats_features_lower, daily_stats_features_upper, \
                                      model_storage_location, random_seed, \
                                      time_series_lag_values_created, \
                                      evaluation_metrics_output_storage, \
                                      feature_importance_storage_location, \
                                      overall_feature_importance_plot_location

from Data_Schema.schema import Pandas_UDF_Data_Schema
from Read_In_Data.read_data import Reading_Data
from Data_Pipeline.imputation_pipeline import Date_And_Value_Imputation

from Feature_Generation.create_binary_labels import Create_Binary_Labels
from Feature_Generation.summary_stats import Summary_Stats_Features
from Feature_Generation.lag_features import Create_Lagged_Features
from Feature_Generation.time_series_feature_creation import TS_Features
from Feature_Generation.difference_features import Difference_Features

from Data_Pipeline.encoding_scaling_pipeline import Feature_Transformations

from Model_Creation.pyspark_xgboost import Create_PySpark_XGBoost

from Model_Predictions.pyspark_model_preds import Model_Predictions

from Model_Evaluation.pyspark_model_eval import Evaluate_Model

from Feature_Importance.model_feature_importance import Feature_Importance

from Model_Plots.xgboost_classification_plots import XGBoost_Classification_Plot

# General Modules

In [25]:
# PySpark UDF Schema Activation
pandas_udf_data_schema=Pandas_UDF_Data_Schema()

# Data Location
reading_data=Reading_Data()

# Create Binary y Variables
create_binary_labels=Create_Binary_Labels()

# Imputation
date_and_value_imputation=Date_And_Value_Imputation()

# Features Daily Stats Module
summary_stats_features=Summary_Stats_Features()

# Features Complex
ts_features=TS_Features()

# Features Lagged Value
create_lag_features=Create_Lagged_Features()

# Features Differences
difference_features=Difference_Features()

# PySpark XGBoost Model Module
create_pyspark_xgboost=Create_PySpark_XGBoost()

# Classification Evaluation
evaluate_model=Evaluate_Model()

# Model Plots Feature Importance
xgboost_classification_plot=XGBoost_Classification_Plot()

# Feature Transformations
feature_transformations=Feature_Transformations()


pyspark_custom_imputation_schema=pandas_udf_data_schema.custom_imputation_pyspark_schema()


model_predictions=Model_Predictions()

# Feature Importance
feature_importance=Feature_Importance()

# PySpark

### 1. PySpark: Reading In Data

#### Training

In [26]:
training_df=reading_data.read_in_pyspark_training(training_data_location=train_data_storage)
training_df.show()

+--------------------+-----+-------------------+---------------------+------------------+
|           PatientId|Value| GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|
+--------------------+-----+-------------------+---------------------+------------------+
|8W/rpnb48OMm47W2x...|328.0|2022-01-31 17:38:00| 2022-01-31T17:38:...|        2022-01-31|
|8W/rpnb48OMm47W2x...|331.0|2022-01-31 17:43:00| 2022-01-31T17:43:...|        2022-01-31|
|8W/rpnb48OMm47W2x...|329.0|2022-01-31 17:48:00| 2022-01-31T17:48:...|        2022-01-31|
|8W/rpnb48OMm47W2x...|321.0|2022-01-31 17:53:00| 2022-01-31T17:53:...|        2022-01-31|
|8W/rpnb48OMm47W2x...|315.0|2022-01-31 17:58:00| 2022-01-31T17:58:...|        2022-01-31|
|8W/rpnb48OMm47W2x...|313.0|2022-01-31 18:03:00| 2022-01-31T18:03:...|        2022-01-31|
|8W/rpnb48OMm47W2x...|304.0|2022-01-31 18:08:00| 2022-01-31T18:08:...|        2022-01-31|
|8W/rpnb48OMm47W2x...|298.0|2022-01-31 18:13:00| 2022-01-31T18:13:...|        2022-01-31|
|8W/rpnb48

                                                                                

#### Testing

In [27]:
testing_df=reading_data.read_in_pyspark_testing(testing_data_location=test_data_storage)
testing_df.show()

+--------------------+-----+-------------------+---------------------+------------------+
|           PatientId|Value| GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|
+--------------------+-----+-------------------+---------------------+------------------+
|8W/rpnb48OMm47W2x...|  0.0|2022-02-08 16:59:00| 2022-02-08T16:59:...|        2022-02-08|
|8W/rpnb48OMm47W2x...|  0.0|2022-02-08 17:04:00| 2022-02-08T17:04:...|        2022-02-08|
|8W/rpnb48OMm47W2x...|  0.0|2022-02-08 17:09:00| 2022-02-08T17:09:...|        2022-02-08|
|8W/rpnb48OMm47W2x...|  0.0|2022-02-08 17:14:00| 2022-02-08T17:14:...|        2022-02-08|
|8W/rpnb48OMm47W2x...|  0.0|2022-02-08 17:19:00| 2022-02-08T17:19:...|        2022-02-08|
|8W/rpnb48OMm47W2x...|  0.0|2022-02-08 17:24:00| 2022-02-08T17:24:...|        2022-02-08|
|8W/rpnb48OMm47W2x...|277.0|2022-02-08 17:29:00| 2022-02-08T17:29:...|        2022-02-08|
|8W/rpnb48OMm47W2x...|270.0|2022-02-08 17:34:00| 2022-02-08T17:34:...|        2022-02-08|
|8W/rpnb48

### 2. PySpark: Custom Imputation Pipeline

#### Training

In [28]:
training_custom_imputation_schema=pandas_udf_data_schema.custom_imputation_pyspark_schema()
training_custom_imputation_pipeline=date_and_value_imputation.\
                                        pyspark_custom_imputation_pipeline(df=training_df, 
                                                                           output_schema=pyspark_custom_imputation_schema,
                                                                           analysis_group=analysis_group)

training_custom_imputation_pipeline.show(1)

[Stage 437:>                                                        (0 + 1) / 1]

+-------------------+--------------------+-----+
| GlucoseDisplayTime|           PatientId|Value|
+-------------------+--------------------+-----+
|2022-01-31 17:35:00|8W/rpnb48OMm47W2x...|328.0|
+-------------------+--------------------+-----+
only showing top 1 row



                                                                                

#### Testing

In [29]:
testing_custom_imputation_schema=pandas_udf_data_schema.custom_imputation_pyspark_schema()
testing_custom_imputation_pipeline=date_and_value_imputation.\
                                        pyspark_custom_imputation_pipeline(df=testing_df, 
                                                                           output_schema=pyspark_custom_imputation_schema,
                                                                           analysis_group=analysis_group)

testing_custom_imputation_pipeline.show(1)

[Stage 449:>                                                        (0 + 1) / 1]

+-------------------+--------------------+---------+
| GlucoseDisplayTime|           PatientId|    Value|
+-------------------+--------------------+---------+
|2022-02-08 16:55:00|8W/rpnb48OMm47W2x...|164.20488|
+-------------------+--------------------+---------+
only showing top 1 row



                                                                                

### 3. PySpark: Adding Binary Labels

#### Training

In [60]:
training_df_added_binary_labels=create_binary_labels.pyspark_binary_labels(df=training_custom_imputation_pipeline, 
                                                                          lower=daily_stats_features_lower, 
                                                                          upper=daily_stats_features_upper)

training_df_added_binary_labels.show(1)

[Stage 1272:>                                                       (0 + 1) / 1]

+-------------------+--------------------+-----+--------+
| GlucoseDisplayTime|           PatientId|Value|y_Binary|
+-------------------+--------------------+-----+--------+
|2022-01-31 17:35:00|8W/rpnb48OMm47W2x...|328.0|       1|
+-------------------+--------------------+-----+--------+
only showing top 1 row



                                                                                

#### Testing

In [61]:
testing_df_added_binary_labels=create_binary_labels.pyspark_binary_labels(df=testing_custom_imputation_pipeline, 
                                                                          lower=daily_stats_features_lower, 
                                                                          upper=daily_stats_features_upper)

testing_df_added_binary_labels.show(1, truncate=False)

[Stage 1284:>                                                       (0 + 1) / 1]

+-------------------+--------------------------------------------+---------+--------+
|GlucoseDisplayTime |PatientId                                   |Value    |y_Binary|
+-------------------+--------------------------------------------+---------+--------+
|2022-02-08 16:55:00|8W/rpnb48OMm47W2x4FSkc7+9u2mol061DQuJoMdiK0=|164.20488|0       |
+-------------------+--------------------------------------------+---------+--------+
only showing top 1 row



                                                                                

### 4. PySpark: Feature Creation

#### Training

##### Complex Features

In [62]:
training_df_differences = difference_features.add_difference_features(training_df_added_binary_labels)
training_df_differences.show(5)

[Stage 1296:>                                                       (0 + 1) / 1]

+-------------------+--------------------+-----+--------+---------+-------+
| GlucoseDisplayTime|           PatientId|Value|y_Binary|FirstDiff|SecDiff|
+-------------------+--------------------+-----+--------+---------+-------+
|2022-01-31 17:35:00|8W/rpnb48OMm47W2x...|328.0|       1|      0.0|    0.0|
|2022-01-31 17:40:00|8W/rpnb48OMm47W2x...|331.0|       1|      3.0|    3.0|
|2022-01-31 17:45:00|8W/rpnb48OMm47W2x...|329.0|       1|     -2.0|   -5.0|
|2022-01-31 17:50:00|8W/rpnb48OMm47W2x...|321.0|       1|     -8.0|   -6.0|
|2022-01-31 17:55:00|8W/rpnb48OMm47W2x...|315.0|       1|     -6.0|    2.0|
+-------------------+--------------------+-----+--------+---------+-------+
only showing top 5 rows



                                                                                

In [63]:
training_df_chunks = summary_stats_features.create_chunk_col(training_df_differences, chunk_val = 288)
training_df_chunks.show(5)

training_df_poincare = training_df_chunks.groupby(['PatientId', 'Chunk']).apply(ts_features.poincare)
training_df_poincare.show(5)

training_df_entropy = training_df_chunks.groupby(['PatientId', 'Chunk']).apply(ts_features.entropy)
training_df_entropy.show(5)

+-------------------+--------------------+-----+--------+---------+-------+-----+-----+
| GlucoseDisplayTime|           PatientId|Value|y_Binary|FirstDiff|SecDiff|index|Chunk|
+-------------------+--------------------+-----+--------+---------+-------+-----+-----+
|2022-01-31 17:35:00|8W/rpnb48OMm47W2x...|328.0|       1|      0.0|    0.0|    1|    0|
|2022-01-31 17:40:00|8W/rpnb48OMm47W2x...|331.0|       1|      3.0|    3.0|    2|    0|
|2022-01-31 17:45:00|8W/rpnb48OMm47W2x...|329.0|       1|     -2.0|   -5.0|    3|    0|
|2022-01-31 17:50:00|8W/rpnb48OMm47W2x...|321.0|       1|     -8.0|   -6.0|    4|    0|
|2022-01-31 17:55:00|8W/rpnb48OMm47W2x...|315.0|       1|     -6.0|    2.0|    5|    0|
+-------------------+--------------------+-----+--------+---------+-------+-----+-----+
only showing top 5 rows



  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  ratio = round(short_term_variation / long_term_variation, 3)


+--------------------+-----+-----------------+----------------+-------------+
|           PatientId|Chunk|ShortTermVariance|LongTermVariance|VarianceRatio|
+--------------------+-----+-----------------+----------------+-------------+
|8W/rpnb48OMm47W2x...|    0|            3.619|           4.698|         0.77|
|8W/rpnb48OMm47W2x...|    1|            3.856|           7.476|        0.516|
|8W/rpnb48OMm47W2x...|    2|            2.693|           5.503|        0.489|
|8W/rpnb48OMm47W2x...|    3|            4.064|             6.8|        0.598|
|8W/rpnb48OMm47W2x...|    4|            4.215|           6.338|        0.665|
+--------------------+-----+-----------------+----------------+-------------+
only showing top 5 rows



[Stage 1352:>                                                       (0 + 1) / 1]

+--------------------+-----+----------+
|           PatientId|Chunk|   Entropy|
+--------------------+-----+----------+
|8W/rpnb48OMm47W2x...|    0|0.06102413|
|8W/rpnb48OMm47W2x...|    1|0.16895121|
|8W/rpnb48OMm47W2x...|    2|0.21363738|
|8W/rpnb48OMm47W2x...|    3|0.21762376|
|8W/rpnb48OMm47W2x...|    4|0.11467516|
+--------------------+-----+----------+
only showing top 5 rows



                                                                                

In [64]:
training_df_complex_features = training_df_poincare.join(training_df_entropy,['PatientId', 'Chunk'])
training_df_complex_features.show()

[Stage 1369:>                                                       (0 + 1) / 1]

23/05/03 02:08:02 ERROR Executor: Exception in task 0.0 in stage 1369.0 (TID 498)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/jovyan/glucose-data-analysis/Feature_Generation/time_series_feature_creation.py", line 18, in entropy
    entropy = eH.SampEn(df.Value.values, m=4)[0][-1]
  File "/opt/conda/lib/python3.10/site-packages/EntropyHub/_SampEn.py", line 43, in SampEn
    assert N>10 and Sig.ndim == 1,  "Sig:   must be a numpy vector"
AssertionError: Sig:   must be a numpy vector

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:101)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:85)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:50)
	at org.apache.spark.api.python.BasePythonRunner$

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/jovyan/glucose-data-analysis/Feature_Generation/time_series_feature_creation.py", line 18, in entropy
    entropy = eH.SampEn(df.Value.values, m=4)[0][-1]
  File "/opt/conda/lib/python3.10/site-packages/EntropyHub/_SampEn.py", line 43, in SampEn
    assert N>10 and Sig.ndim == 1,  "Sig:   must be a numpy vector"
AssertionError: Sig:   must be a numpy vector


##### Statistical Features

In [None]:
training_features_summary_stats=summary_stats_features.pyspark_summary_statistics(df=training_df_chunks,
                                                                                 daily_stats_features_lower=daily_stats_features_lower,
                                                                                 daily_stats_features_upper=daily_stats_features_upper)
# merge complex features and summary stats and demographics
# merge in one hot encoded cohort file info demographics
    # '/cephfs/data/cohort_encoded.parquet' (gender, treatment, age category)
    # groupby patientId and chunk

training_features_summary_stats.show(1)

#### Testing

##### Complex Features

In [65]:
testing_df_differences = difference_features.add_difference_features(testing_df_added_binary_labels)
testing_df_differences.show(5)

[Stage 1381:>                                                       (0 + 1) / 1]

+-------------------+--------------------+---------+--------+---------+-------+
| GlucoseDisplayTime|           PatientId|    Value|y_Binary|FirstDiff|SecDiff|
+-------------------+--------------------+---------+--------+---------+-------+
|2022-02-08 16:55:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|
|2022-02-08 17:00:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|
|2022-02-08 17:05:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|
|2022-02-08 17:10:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|
|2022-02-08 17:15:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|
+-------------------+--------------------+---------+--------+---------+-------+
only showing top 5 rows



                                                                                

In [66]:
testing_df_chunks = summary_stats_features.create_chunk_col(testing_df_differences, chunk_val = 288)
testing_df_chunks.show(5)

testing_df_poincare = testing_df_chunks.groupby(['PatientId', 'Chunk']).apply(ts_features.poincare)
testing_df_poincare.show(5)

testing_df_entropy = testing_df_chunks.groupby(['PatientId', 'Chunk']).apply(ts_features.entropy)
testing_df_entropy.show(5)

+-------------------+--------------------+---------+--------+---------+-------+-----+-----+
| GlucoseDisplayTime|           PatientId|    Value|y_Binary|FirstDiff|SecDiff|index|Chunk|
+-------------------+--------------------+---------+--------+---------+-------+-----+-----+
|2022-02-08 16:55:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|    1|    0|
|2022-02-08 17:00:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|    2|    0|
|2022-02-08 17:05:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|    3|    0|
|2022-02-08 17:10:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|    4|    0|
|2022-02-08 17:15:00|8W/rpnb48OMm47W2x...|164.20488|       0|      0.0|    0.0|    5|    0|
+-------------------+--------------------+---------+--------+---------+-------+-----+-----+
only showing top 5 rows



  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


+--------------------+-----+-----------------+----------------+-------------+
|           PatientId|Chunk|ShortTermVariance|LongTermVariance|VarianceRatio|
+--------------------+-----+-----------------+----------------+-------------+
|8W/rpnb48OMm47W2x...|    0|           13.264|          14.547|        0.912|
|8W/rpnb48OMm47W2x...|    1|            8.753|           8.528|        1.026|
|8W/rpnb48OMm47W2x...|    2|            1.732|           4.848|        0.357|
|CzndP9OQqEYW/LY7h...|    0|            4.418|           8.068|        0.548|
|CzndP9OQqEYW/LY7h...|    1|            4.176|           7.156|        0.584|
+--------------------+-----+-----------------+----------------+-------------+
only showing top 5 rows



[Stage 1437:>                                                       (0 + 1) / 1]

23/05/03 02:13:44 ERROR Executor: Exception in task 0.0 in stage 1437.0 (TID 522)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/jovyan/glucose-data-analysis/Feature_Generation/time_series_feature_creation.py", line 18, in entropy
    entropy = eH.SampEn(df.Value.values, m=4)[0][-1]
  File "/opt/conda/lib/python3.10/site-packages/EntropyHub/_SampEn.py", line 43, in SampEn
    assert N>10 and Sig.ndim == 1,  "Sig:   must be a numpy vector"
AssertionError: Sig:   must be a numpy vector

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:101)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:85)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:50)
	at org.apache.spark.api.python.BasePythonRunner$

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/jovyan/glucose-data-analysis/Feature_Generation/time_series_feature_creation.py", line 18, in entropy
    entropy = eH.SampEn(df.Value.values, m=4)[0][-1]
  File "/opt/conda/lib/python3.10/site-packages/EntropyHub/_SampEn.py", line 43, in SampEn
    assert N>10 and Sig.ndim == 1,  "Sig:   must be a numpy vector"
AssertionError: Sig:   must be a numpy vector


In [64]:
testing_df_complex_features = testing_df_poincare.join(testing_df_entropy,['PatientId', 'Chunk'])
testing_df_complex_features.show()

[Stage 1369:>                                                       (0 + 1) / 1]

23/05/03 02:08:02 ERROR Executor: Exception in task 0.0 in stage 1369.0 (TID 498)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/jovyan/glucose-data-analysis/Feature_Generation/time_series_feature_creation.py", line 18, in entropy
    entropy = eH.SampEn(df.Value.values, m=4)[0][-1]
  File "/opt/conda/lib/python3.10/site-packages/EntropyHub/_SampEn.py", line 43, in SampEn
    assert N>10 and Sig.ndim == 1,  "Sig:   must be a numpy vector"
AssertionError: Sig:   must be a numpy vector

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:101)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:85)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:50)
	at org.apache.spark.api.python.BasePythonRunner$

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/jovyan/glucose-data-analysis/Feature_Generation/time_series_feature_creation.py", line 18, in entropy
    entropy = eH.SampEn(df.Value.values, m=4)[0][-1]
  File "/opt/conda/lib/python3.10/site-packages/EntropyHub/_SampEn.py", line 43, in SampEn
    assert N>10 and Sig.ndim == 1,  "Sig:   must be a numpy vector"
AssertionError: Sig:   must be a numpy vector


##### Statistical Features

In [None]:
testing_features_summary_stats=summary_stats_features.pyspark_summary_statistics(df=testing_df_added_binary_labels,
                                                                                 daily_stats_features_lower=daily_stats_features_lower,
                                                                                 daily_stats_features_upper=daily_stats_features_upper)

# merge complex features and summary stats and demographics
# merge in one hot encoded cohort file info demographics
    # '/cephfs/data/cohort_encoded.parquet' (gender, treatment, age category)
    # groupby patientId and chunk

testing_features_summary_stats.show(1)

### 5. PySpark: Lag Features

#### Training

In [None]:
training_lag_features_creation=create_lag_features.pyspark_lag_features(df=training_features_summary_stats,
                                                                       time_series_lag_values_created=time_series_lag_values_created)
training_lag_features_creation.show(5)

#### Testing

In [None]:
testing_lag_features_creation=create_lag_features.pyspark_lag_features(df=testing_features_summary_stats,
                                                                       time_series_lag_values_created=time_series_lag_values_created)
testing_lag_features_creation.show(5)

### 7. PySpark: Sklearn Categorical Pipeline in PySpark

### 8. PySpark: Sklearn Numerical Pipeline in PySpark

In [None]:
training_numerical_stages=feature_transformations.numerical_scaling(df=training_lag_features_creation)

### 9. PySpark: XGBoost Model

In [None]:
xgboost_model=create_pyspark_xgboost.xgboost_classifier(ml_df=training_lag_features_creation,
                                                        stages=training_numerical_stages,
                                                        model_storage_location=model_storage_location,
                                                        random_seed=random_seed)

### 10. PySpark: Cross Validation

### 11. PySpark: Model Predictions

In [None]:
testing_predictions=model_predictions.create_predictions_with_model(test_df=testing_lag_features_creation, 
                                                                    model=xgboost_model)
testing_predictions.show(10)

### 12. PySpark: Model Evaluation

In [None]:
model_evaluation=evaluate_model.classification_evaluation(testing_predictions=testing_predictions, 
                                                          eval_csv_location=evaluation_metrics_output_storage)

In [None]:
model_evaluation.head()

### 13. PySpark: XGBoost Classification Feature Importance

In [None]:
feature_importance_df=feature_importance.\
                        feature_importance_accuracy_gain(xgboost_model=xgboost_model, 
                                                         feature_importance_storage_location=feature_importance_storage_location)


In [None]:
feature_importance_df.head(10)

### 14. PySpark: Feature Importance Plotting

In [None]:
overall_feature_plot=xgboost_classification_plot.feature_overall_importance_plot(feature_importance_df=feature_importance_df,
                                                                                 overall_importance_plot_location=overall_feature_importance_plot_location)


In [None]:
overall_feature_plot.show()

### 15.PySpark: Local Level Feature Importance --> Shap Pandas UDF

In [None]:
# Add to reqs if this works
! pip install shap

In [None]:
xgboost_model.stages[-1]

In [None]:
import shap

In [None]:
explainer = shap.TreeExplainer(xgboost_model.stages[-1])