### Current Process
1. Read in data --> Done

2. Custom Imputation --> Done

3. Add Binary Class --> Done, Should Add Binary Class Later

4. Summary Statistics Features --> Done

5. Wrapper Functions --> Done, Need to Test Though

6. Sklearn Pipeline Categorical Features --> One Hot Encoding Done

7. Sklearn Pipeline Numerical Features --> StandardScaler Done

8. Create Lagged Features --> Done

9. Modeling --> Currently XGBoost (Maybe Try: TensorFlow Decision Tree, TensorFlow Probability Model)

10. Model Evaluation --> Accuracy, Precision, Recall, F1, Confusion Matrix (Need to add Variable Importance Based on Variance)

11. PySpark: XGBoost Classification Feature Importance

In [1]:
# # Need to Run These in Notebook Version For Pandas UDF
! pip install pyarrow
! pip install pandas
! pip install scikit-learn
! pip install pyspark
! pip install xgboost



In [2]:
from Input_Variables.read_vars import raw_data_storage, \
                                      analysis_group, \
                                      daily_stats_features_lower, daily_stats_features_upper, \
                                      ml_models_train_split, ml_models_test_split, model_storage_location, \
                                      time_series_lag_values_created

from Data_Schema.schema import Pandas_UDF_Data_Schema
from Read_In_Data.read_data import Reading_Data
from Data_Pipeline.sklearn_pipeline import Sklearn_Pipeline
from Feature_Generation.create_binary_labels import Create_Binary_Labels
from Feature_Generation.summary_stats import Summary_Stats_Features
from Feature_Generation.lag_features import Create_Lagged_Features
from Model_Evaluation.classification_evaluation import Classification_Evalaution_Metrics
from Model_Plots.xgboost_classification_plots import XGBoost_Classification_Plot

from Data_Pipeline.encoding_scaling_pipeline import Feature_Transformations

from Model_Creation.pyspark_xgboost import Create_PySpark_XGBoost

# General Modules

In [3]:
# --- Helper objects used throughout the notebook (one instance each) ---

# Schemas for the PySpark pandas-UDF steps
pandas_udf_data_schema = Pandas_UDF_Data_Schema()

# Data reader, pointed at the configured raw data location
reading_data = Reading_Data(data_location=raw_data_storage)

# Binary y-variable creation
create_binary_labels = Create_Binary_Labels()

# Sklearn pipeline wrapper
pandas_sklearn_pipeline = Sklearn_Pipeline()

# Daily summary-statistics features
summary_stats_features = Summary_Stats_Features()

# Lagged-value features
create_lag_features = Create_Lagged_Features()

# PySpark XGBoost model
create_pyspark_xgboost = Create_PySpark_XGBoost()

# Classification evaluation metrics
classification_evalaution_metrics = Classification_Evalaution_Metrics()

# Feature-importance plots for the model
xgboost_classification_plot = XGBoost_Classification_Plot()

# Feature transformations
feature_transformations = Feature_Transformations()

# PySpark

### 1. PySpark: Reading In Data

In [4]:
####### PySpark
# Load the raw data through the reader configured above.
pyspark_df = reading_data.read_in_pyspark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/20 18:04:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Shape check: (row count, column count) right after loading.
print(
    (
        pyspark_df.count(),
        len(pyspark_df.columns),
    )
)

(5734, 5)


In [6]:
from pyspark.ml.functions import vector_to_array

In [7]:
from pyspark.sql.functions import date_trunc, col, udf

# Truncate every reading's GlucoseDisplayTime to the minute.
pyspark_df = pyspark_df.withColumn(
    "GlucoseDisplayTime",
    date_trunc("minute", col("GlucoseDisplayTime")),
)

In [8]:
# Keep only distinct rows (timestamps were truncated in the previous cell).
pyspark_df = pyspark_df.distinct()

In [9]:
# Shape check again after de-duplication: (row count, column count).
print(
    (
        pyspark_df.count(),
        len(pyspark_df.columns),
    )
)

(5734, 5)


In [10]:
# Preview a single row of the cleaned frame.
pyspark_df.show(n=1)

+--------------------+-----+-------------------+---------------------+------------------+
|           PatientId|Value| GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|
+--------------------+-----+-------------------+---------------------+------------------+
|vH4j/sVPDk4luo9wf...|157.0|2022-12-28 02:52:00| 2022-12-28T02:52:...|        2022-12-28|
+--------------------+-----+-------------------+---------------------+------------------+
only showing top 1 row



### 2. PySpark: Custom Imputation Pipeline

In [11]:
# Sort ascending by patient, then by reading time, before imputation.
pyspark_df = pyspark_df.orderBy(["PatientId", "GlucoseDisplayTime"], ascending=True)

In [12]:
####### PySpark
# Output schema passed to the imputation step.
pyspark_custom_imputation_schema = pandas_udf_data_schema.custom_imputation_pyspark_schema()

# Run the custom imputation pipeline over the sorted frame.
pyspark_custom_imputation_pipeline = pandas_sklearn_pipeline.pyspark_custom_imputation_pipeline(
    df=pyspark_df,
    output_schema=pyspark_custom_imputation_schema,
    analysis_group=analysis_group,
)




In [13]:
# Preview a single imputed row.
pyspark_custom_imputation_pipeline.show(n=1)

[Stage 24:>                                                         (0 + 1) / 1]

+-------------------+--------------------+-----+
| GlucoseDisplayTime|           PatientId|Value|
+-------------------+--------------------+-----+
|2022-02-18 17:05:00|Zw997clFRcTAHrWiO...|260.0|
+-------------------+--------------------+-----+
only showing top 1 row



                                                                                

### 3. PySpark: Adding Binary Labels

In [14]:
# Add the binary label column using the configured lower/upper bounds.
pyspark_df_added_binary_labels = create_binary_labels.pyspark_binary_labels(
    df=pyspark_custom_imputation_pipeline,
    lower=daily_stats_features_lower,
    upper=daily_stats_features_upper,
)

pyspark_df_added_binary_labels.show(n=1)

+-------------------+--------------------+-----+--------+
| GlucoseDisplayTime|           PatientId|Value|y_Binary|
+-------------------+--------------------+-----+--------+
|2022-02-18 17:05:00|Zw997clFRcTAHrWiO...|260.0|       1|
+-------------------+--------------------+-----+--------+
only showing top 1 row



                                                                                

### 4. PySpark: Features: Summary Statistics

In [15]:
####### PySpark
# Build the summary-statistics features from the labelled frame.
pyspark_features_summary_stats = summary_stats_features.pyspark_summary_statistics(
    df=pyspark_df_added_binary_labels,
    daily_stats_features_lower=daily_stats_features_lower,
    daily_stats_features_upper=daily_stats_features_upper,
)



In [16]:
# Preview a single row of the summary-statistics features.
pyspark_features_summary_stats.show(n=1)

[Stage 58:>                                                         (0 + 1) / 1]

+--------------------+-----+-------------------+-----+--------+-----+----------------+------------------+-----------------+------+-----+-----+----------+----------+---------------+------------------+
|           PatientId|Chunk| GlucoseDisplayTime|Value|y_Binary|index|y_summary_binary|              Mean|          Std Dev|Median|  Min|  Max|CountBelow|CountAbove|PercentageBelow|   PercentageAbove|
+--------------------+-----+-------------------+-----+--------+-----+----------------+------------------+-----------------+------+-----+-----+----------+----------+---------------+------------------+
|Zw997clFRcTAHrWiO...|    0|2022-02-18 17:55:00|271.0|       1|   11|               1|273.45454545454544|8.017027333914188| 272.0|260.0|284.0|         0|        11|            0.0|0.9166666666666666|
+--------------------+-----+-------------------+-----+--------+-----+----------------+------------------+-----------------+------+-----+-----+----------+----------+---------------+------------------+


                                                                                

### 5. PySpark: Wrapper Functions

### 6. PySpark: Lag Features

In [17]:
####### PySpark
# Append lagged copies of the value and summary-statistic columns.
pyspark_lag_features_creation = create_lag_features.pyspark_lag_features(
    df=pyspark_features_summary_stats,
    time_series_lag_values_created=time_series_lag_values_created,
)
pyspark_lag_features_creation.show(n=5)

23/04/20 18:04:30 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 80:>                                                         (0 + 1) / 1]

+--------------------+-----+-------------------+-----+--------+-----+----------------+------------------+-----------------+------+-----+-----+----------+----------+---------------+------------------+-----------+------------------+-----------------+---------+---------+---------+-------------+-------------+---------------+------------------+-----------+------------------+-----------------+---------+---------+---------+-------------+-------------+---------------+------------------+
|           PatientId|Chunk| GlucoseDisplayTime|Value|y_Binary|index|y_summary_binary|              Mean|          Std Dev|Median|  Min|  Max|CountBelow|CountAbove|PercentageBelow|   PercentageAbove|value_lag_1|        mean_lag_1|    std_dev_lag_1|med_lag_1|min_lag_1|max_lag_1|cnt_bel_lag_1|cnt_abv_lag_1|perc_belw_lag_1|    perc_abv_lag_1|value_lag_2|        mean_lag_2|    std_dev_lag_2|med_lag_2|min_lag_2|max_lag_2|cnt_bel_lag_2|cnt_abv_lag_2|perc_belw_lag_2|    perc_abv_lag_2|
+--------------------+-----+----

                                                                                

### 7. PySpark: Sklearn Categorical Pipeline in PySpark

### 8. PySpark: Sklearn Numerical Pipeline in PySpark

In [18]:
####### PySpark
# Apply numerical scaling to the lag-feature frame.
pyspark_numerical_features = feature_transformations.numerical_scaling(
    df=pyspark_lag_features_creation,
)
pyspark_numerical_features.show(n=1)

                                                                                

+--------------------+-----+-------------------+-----+--------+-----+----------------+------------------+-----------------+------+-----+-----+----------+----------+---------------+------------------+-----------+------------------+-----------------+---------+---------+---------+-------------+-------------+---------------+------------------+-----------+------------------+-----------------+---------+---------+---------+-------------+-------------+---------------+------------------+-------------------+--------------------+-------------+-------------+--------------------+-------------------+-------------------+-------------+-------------+--------------------+-------------+--------------------+---------------+-------------------+-------------+---------------+--------------------+---------------------+--------------------+--------------------+---------------------+----------------------+--------------------+-------------------+-------------------+--------------------+--------------------+----

### 9. PySpark: XGBoost Model

In [19]:
# Fit the PySpark XGBoost classifier on the scaled features; the configured
# model storage location is passed through to the helper.
xgboost_model_creation = create_pyspark_xgboost.xgboost_classifier(
    ml_df=pyspark_numerical_features,
    model_storage_location=model_storage_location,
)

You enabled use_gpu in spark local mode. Please make sure your local node has at least 2 GPUs
[18:05:16] task 1 got new rank 0                                    (0 + 2) / 2]
[18:05:16] task 0 got new rank 1
You enabled use_gpu in spark local mode. Please make sure your local node has at least 2 GPUs


  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



In [23]:
# Inspect the first two values of the model's prediction column.
xgboost_model_creation.select('prediction').show(n=2)

                                                                                

23/04/20 18:14:58 WARN DAGScheduler: Broadcasting large task binary with size 1016.1 KiB


  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

[Stage 685:>                                                        (0 + 1) / 1]

+------------------+
|        prediction|
+------------------+
|1.0005996227264404|
|1.0005996227264404|
+------------------+
only showing top 2 rows



                                                                                

In [22]:
# Display the scored frame (20 rows, the default for `show`).
xgboost_model_creation.show(n=20)

                                                                                

23/04/20 18:14:31 WARN DAGScheduler: Broadcasting large task binary with size 1008.0 KiB


  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



+--------------------+-----+-------------------+-----+--------+-----+----------------+------------------+------------------+------+-----+-----+----------+----------+---------------+------------------+-----------+------------------+------------------+---------+---------+---------+-------------+-------------+---------------+------------------+-----------+------------------+------------------+---------+---------+---------+-------------+-------------+---------------+------------------+--------------------+--------------------+-------------+-------------+--------------------+-------------------+--------------------+-------------+-------------+--------------------+-------------+--------------------+---------------+-------------------+-------------+---------------+--------------------+---------------------+--------------------+--------------------+---------------------+----------------------+--------------------+-------------------+-------------------+--------------------+--------------------

                                                                                

### 10. PySpark: Model Evaluation

In [None]:
####### PySpark
# Output schema passed to the evaluation step.
pyspark_classification_metric_schema = pandas_udf_data_schema.classification_metric_schema()

# Evaluate the scored frame produced in section 9. This cell previously passed
# `classification_model_outputs`, which is never assigned in this notebook and
# raised a NameError on a fresh kernel.
# NOTE(review): assumes the evaluator expects the frame returned by
# `xgboost_classifier` (it carries the `prediction` column) — confirm.
classification_metric_df = classification_evalaution_metrics.pyspark_classification_model_evaluation_metrics(
    df=xgboost_model_creation,
    output_schema=pyspark_classification_metric_schema,
)
classification_metric_df.show()

### 11. PySpark: XGBoost Classification Feature Importance

In [None]:
# Feature-importance plot, given the configured model storage location.
xgboost_classification_plot.read_model_plot_variance(
    model_storage_location=model_storage_location,
)

# Pandas

### Pandas: Reading in Data

In [None]:
####### Pandas
# Load the same raw data as a pandas DataFrame and preview one row.
pandas_df = reading_data.read_in_pandas()
pandas_df.head(n=1)

### Pandas: Custom Imputation Pipeline

In [None]:
# Rows for one specific patient. NOTE(review): `test_pat` is not referenced by
# any later cell in this notebook — confirm whether it is still needed.
test_pat = pandas_df.loc[pandas_df['PatientId'] == 'tHu8WPnIffml5CL+AbOBkXcbFApQnP06KdrHbjinta4=']

In [None]:
# Run the pandas custom imputation pipeline on the full frame and preview one row.
pandas_custom_imputation_pipeline = pandas_sklearn_pipeline.pandas_custom_imputation_pipeline(
    df=pandas_df,
)
pandas_custom_imputation_pipeline.head(n=1)

In [None]:
####### Pandas
# NOTE(review): this repeats the call made in the preceding cell with the same
# input; one of the two cells can probably be dropped.
pandas_custom_imputation_pipeline = pandas_sklearn_pipeline.pandas_custom_imputation_pipeline(
    df=pandas_df,
)
pandas_custom_imputation_pipeline.head(n=1)

### Pandas: Aggregate Data at Level

### Pandas: Adding Binary Labels

In [None]:
# Add the binary label column using the configured lower/upper bounds.
pandas_df_added_binary_labels = create_binary_labels.pandas_binary_labels(
    df=pandas_custom_imputation_pipeline,
    lower=daily_stats_features_lower,
    upper=daily_stats_features_upper,
)
pandas_df_added_binary_labels.head(n=1)

### Pandas: Features: Summary Statistics

In [None]:
# Build the summary-statistics features from the labelled pandas frame.
pandas_features_summary_stats = summary_stats_features.pandas_compressDailyValues(
    data=pandas_df_added_binary_labels,
    lower=daily_stats_features_lower,
    upper=daily_stats_features_upper,
)
pandas_features_summary_stats.head(n=1)

### Pandas: Wrapper Functions

### Pandas: Sklearn Categorical Pipeline in Pandas

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Columns carried into the categorical-encoding cells below.
df = pandas_features_summary_stats[
    [
        'PatientId',
        'Value',
        'GlucoseDisplayTime',
        'GlucoseDisplayDate',
        'inserted',
        'missing',
        'y_Binary',
        'Median',
        'Mean',
        'Std Dev',
        'Max',
        'Min',
        'AreaBelow',
        'AreaAbove',
    ]
]

In [None]:
# Preview the first five rows (the default for `head`).
df.head(n=5)

In [None]:
# One-hot encode the two categorical flag columns and pass every other column
# through unchanged. The pipeline is fitted on the whole of `df`.
# This used to sit inside `for patient_id in df['PatientId'].unique():`, but the
# loop variable was never used, so every iteration recomputed the same result.
# It now runs once.
categorical_features = ['inserted', 'missing']
categorical_transformer = Pipeline([
    # Constant-fill with NaN, as configured originally.
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value=np.nan)),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor_2 = ColumnTransformer(
    [('categorical', categorical_transformer, categorical_features)],
    remainder='passthrough',
)

cat_pipe_pipeline = Pipeline([('preprocessing_2', preprocessor_2)])

transformed_data1 = cat_pipe_pipeline.fit_transform(df)
transformed_data_df = pd.DataFrame(transformed_data1)

# Positions 0-3 are treated as the one-hot outputs: 0-1 for 'inserted' and
# 2-3 for 'missing' (assumes exactly two categories per feature — TODO confirm).
# Each pair is collapsed into a single list-valued column.
transformed_data_df['combine_inserted'] = transformed_data_df[[0, 1]].values.tolist()
transformed_data_df['combine_missing'] = transformed_data_df[[2, 3]].values.tolist()

# Drop the four raw one-hot columns now that they are combined.
transformed_data_df = transformed_data_df.drop(columns=[0, 1, 2, 3])

# Restore readable names: the passthrough columns in their original order,
# followed by the two combined columns.
transformed_data_df.columns = [
    'PatientId', 'Value', 'GlucoseDisplayTime', 'GlucoseDisplayDate',
    'y_Binary', 'Median', 'Mean', 'Std Dev', 'Max', 'Min', 'AreaBelow',
    'AreaAbove', 'inserted', 'missing',
]

In [None]:
# Display the column labels of the transformed frame.
transformed_data_df.columns

In [None]:
####### Pandas
# Run the packaged categorical transformation on the summary-statistics frame.
pandas_custom_categorical_pipeline = pandas_sklearn_pipeline.pandas_transform_categorical_features(
    df=pandas_features_summary_stats,
)
pandas_custom_categorical_pipeline.head(n=1)

### Pandas: Sklearn Numerical Pipeline in Pandas

In [None]:
####### Pandas
# Run the packaged numerical transformation on the categorical-pipeline output.
pandas_custom_numerical_pipeline = pandas_sklearn_pipeline.pandas_transform_numerical_features(
    df=pandas_custom_categorical_pipeline,
)
pandas_custom_numerical_pipeline.head(n=1)

### Pandas: Models