### Install

In [1]:
# Uncomment and run the line below if EntropyHub is not installed yet; otherwise leave it commented out
# ! pip install EntropyHub

### Libraries

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, TimestampType, ArrayType, IntegerType
from pyspark.sql.functions import pandas_udf, PandasUDFType, lit

import pandas as pd
import numpy as np
import EntropyHub as EH
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

### Reading Data

In [3]:
class Reading_Data:
    """
    A class used to read in data from a location into a PySpark DataFrame

    ...

    Attributes
    ----------
    data_location : str
        string location of where data csv is stored
    master : str
        Spark master URL handed to the session builder (defaults to 'local')
    app_name : str
        Spark application name handed to the session builder

    Methods
    -------
    data_schema()
        Returns the PySpark schema (StructType) applied to the input csv
    read_in_data()
        Returns a PySpark DataFrame read from data_location using that schema
    """

    # (column name, type tag) pairs, in the order the columns are declared for the csv.
    # Type tags are resolved to PySpark types inside data_schema().
    # NOTE(review): 'Stram' looks like a typo for 'Stream'; it is kept as-is because the
    # resulting DataFrame exposes exactly these column names to downstream code.
    _SCHEMA_COLUMNS = (
        ('PostDate', 'timestamp'),
        ('IngestionDate', 'timestamp'),
        ('PostID', 'string'),
        ('PostTime', 'timestamp'),
        ('PatientID', 'string'),
        ('Stram', 'string'),
        ('SequenceNumber', 'string'),
        ('TransmitterNumber', 'string'),
        ('ReceiverNumber', 'string'),
        ('RecordedSystemTime', 'timestamp'),
        ('RecordedDisplayTime', 'timestamp'),
        ('RecordedDisplayTimeRaw', 'timestamp'),
        ('TransmitterId', 'string'),
        ('TransmitterTime', 'string'),
        ('GlucoseSystemTime', 'timestamp'),
        ('GlucoseDisplayTime', 'timestamp'),
        ('GlucoseDisplayTimeRaw', 'timestamp'),
        ('Value', 'float'),
        ('Status', 'string'),
        ('TrendArrow', 'string'),
        ('TrendRate', 'float'),
        ('IsBackFilled', 'string'),
        ('InternalStatus', 'string'),
        ('SessionStartTime', 'string'),
    )


    def __init__(self, data_location, master='local', app_name='Resd_Glucose_Data'):
        """
        Parameters
        ----------
        data_location : str
            string location of where data csv is stored
        master : str, optional
            Spark master URL used when building the session (default 'local',
            the value that was previously hard-coded)
        app_name : str, optional
            Spark application name used when building the session (default is
            the previously hard-coded name, kept unchanged for compatibility)
        """

        self.data_location=data_location
        self.master=master
        self.app_name=app_name


    def data_schema(self):
        """
        Returns the data schema we will use on the input csv

        Parameters
        ----------
        None

        Returns
        ------
        StructType
            This is a PySpark data schema; every field is declared nullable
        """

        spark_types={'timestamp': TimestampType,
                     'string': StringType,
                     'float': FloatType}

        return StructType([StructField(column_name, spark_types[type_tag](), True)
                           for column_name, type_tag in self._SCHEMA_COLUMNS])


    def read_in_data(self):
        """
        Returns the PySpark dataframe that will be used throughout the project

        Parameters
        ----------
        None

        Returns
        ------
        pyspark.sql.dataframe.DataFrame
            This is a PySpark Dataframe created based on the data schema and input file specified
        """

        # getOrCreate() returns the already-running session when there is one
        spark=(SparkSession.builder
               .master(self.master)
               .appName(self.app_name)
               .getOrCreate())

        # The first csv line is a header row; column names and types come from data_schema()
        return spark.read.csv(self.data_location,
                              header=True,
                              sep=',',
                              schema=self.data_schema())
        
    

### Wrapper Functions

In [4]:
# TODO: this class can be converted into a PySpark process like the Data_Transformations class below.
# Doing so requires the packages to be installed directly in the console (this will be handled through the
# requirements file), because they must be located in the same place where the udf() runs.
class Statistical_Time_Series_Methods:
    """
    A class used to calculate entropy, function2, function3, function4

    ...

    Attributes
    ----------
    glucose_data : pandas.DataFrame
        Pandas dataframe with the glucose data

    Methods
    -------
    entropy_calculation()
        Returns a pandas dataframe with the entropy value calculated based on the input data.
    calculation2(), calculation3(), calculation4()
        Placeholders for further calculations; each currently returns None.
    """

    def __init__(self, glucose_data):
        """
        Parameters
        ----------
        glucose_data : pandas.DataFrame
            Pandas dataframe with the glucose data
        """

        self.glucose_data = glucose_data


    def entropy_calculation(self):
        """
        Returns a pandas dataframe with the entropy calculation value

        The 'Value' column is passed to EntropyHub's SampEn with m=4; the last
        entry of the first returned element is stored as the entropy.

        Parameters
        ----------
        None

        Returns
        ------
        pandas.DataFrame
            A one-row pandas dataframe with a single 'Entropy' column
        """

        signal = self.glucose_data['Value'].values
        sampen_output = EH.SampEn(signal, m=4)
        last_estimate = sampen_output[0][-1]

        return pd.DataFrame({'Entropy': [last_estimate]})


    ### Add other functions here; once they are added a dataframe will be created out of them. They will also be
    ### wrapped in a PySpark pandas_udf wrapper so the whole thing runs in PySpark instead of switching back and
    ### forth between Pandas and PySpark.

    def calculation2(self):
        """Placeholder for a future calculation; currently returns None."""

        return None


    def calculation3(self):
        """Placeholder for a future calculation; currently returns None."""

        return None


    def calculation4(self):
        """Placeholder for a future calculation; currently returns None."""

        return None

### Data Transformation PySpark --> Sklearn --> PySpark

In [5]:
class Data_Transformations:
    """
    A class used to create the sklearn data pipeline transformation in PySpark while still using a non Pyspark
    library (sklearn), we are completing this process fully in PySpark by using the @pandas_udf() PySpark
    wrapper. This will be useful when we are able to complete this process using groups such as gender, sex,
    etc. However, for now a dummy group is assigned to let the process run.

    ...

    Attributes
    ----------
    glucose_data : pyspark.sql.dataframe.DataFrame
        PySpark dataframe with our glucose data from the Reading_Data class above

    transform_schema : StructType
        The data schema that will be used on the @pandas_udf() wrapper function when outputting our PySpark
        data from the transformed variables pipeline

    Methods
    -------
    sklearn_pipeline()
        Returns a PySpark dataframe with transformed values using the sklearn library, however process is distributed
        in PySpark resources.
    """

    def __init__(self, glucose_data):
        """
        Parameters
        ----------
        glucose_data : pyspark.sql.dataframe.DataFrame
            PySpark dataframe with our glucose data from the Reading_Data class above
        """

        self.glucose_data=glucose_data

        # Output schema of the grouped-map UDF: field names match the columns returned by transform_features
        self.transform_schema=StructType([StructField('Value', FloatType(),True),
                                          StructField('TrendRate', FloatType(),True),
                                          StructField('PatientID', StringType(),True),
                                          StructField('GlucoseDisplayTimeRaw', TimestampType(),True),
                                          StructField('TrendArrow', ArrayType(IntegerType()),True)])

    def sklearn_pipeline(self):
        """
        Returns a PySpark dataframe with the transformed values. The values are transformed in sklearn by using
        OneHotEncoding, Imputations, and StandardScaler methods in sklearn, however we are applying this to a
        PySpark dataframe without having to convert it into a pandas dataframe. We are doing this by creating a
        dummy group, this group function will come into use when we are grouping our data by sex, gender, etc.

        The pipeline is fitted separately on each group. The length of the one-hot 'TrendArrow' vector is taken
        from the fitted encoder, so it equals the number of distinct TrendArrow categories seen in that group.

        Parameters
        ----------
        None

        Returns
        ------
        pyspark.sql.dataframe.DataFrame
            This is a PySpark Dataframe with the transformed values (columns as in transform_schema)
        """
        # The UDF must not reference `self`: only the schema is read here, outside the function body.
        @pandas_udf(self.transform_schema, PandasUDFType.GROUPED_MAP)
        def transform_features(pdf):
            df=pdf[['PatientID','Value','GlucoseDisplayTimeRaw','TrendArrow','TrendRate']]

            categorical_features=['TrendArrow']
            # NOTE(review): a constant imputer with fill_value=np.nan fills missing values with NaN,
            # i.e. it leaves them missing -- confirm this is intended.
            categorical_transformer=Pipeline([('imputer_cat', SimpleImputer(strategy='constant', fill_value=np.nan)),
                                              ('onehot', OneHotEncoder(handle_unknown='ignore'))])

            numeric_features=['Value', 'TrendRate']
            numeric_transformer=Pipeline([('imputer_num', SimpleImputer(strategy='median')),
                                          ('scaler', StandardScaler())])

            # Output column order: one-hot columns, then numeric columns, then the passthrough
            # columns (PatientID, GlucoseDisplayTimeRaw) in their order within `df`.
            # sparse_threshold=0 always yields a dense array; the passthrough columns are not
            # numeric, so a sparse result could not hold them.
            preprocessor=ColumnTransformer([('categorical', categorical_transformer, categorical_features),
                                            ('numerical', numeric_transformer, numeric_features)],
                                            remainder = 'passthrough',
                                            sparse_threshold=0)

            pipeline=Pipeline([('preprocessing', preprocessor)])

            transformed_data_df=pd.DataFrame(pipeline.fit_transform(df))

            # Derive the number of one-hot columns from the fitted encoder instead of assuming 7,
            # so groups with a different number of TrendArrow categories are handled correctly.
            fitted_preprocessor=pipeline.named_steps['preprocessing']
            onehot_encoder=fitted_preprocessor.named_transformers_['categorical'].named_steps['onehot']
            n_onehot=sum(len(categories) for categories in onehot_encoder.categories_)
            onehot_columns=list(range(n_onehot))

            output_df=transformed_data_df.drop(columns=onehot_columns)
            output_df.columns=['Value', 'TrendRate', 'PatientID', 'GlucoseDisplayTimeRaw']

            # Collapse the one-hot block into one list per row; cast to int to match
            # the ArrayType(IntegerType()) field declared in the output schema.
            output_df['TrendArrow']=transformed_data_df[onehot_columns].astype(int).values.tolist()

            return output_df

        # Constant dummy group so every row goes through a single call of the UDF. A local variable
        # is used so the instance's DataFrame is not altered by calling this method.
        grouped_input=self.glucose_data.withColumn('Group', lit(1))
        transformed_data=grouped_input.groupby('Group').apply(transform_features)

        return transformed_data

### Run Modules

In [6]:
# Reading_Data class: point the reader at the sample csv (gs:// bucket path) and load it as a PySpark DataFrame
data_bucket_loc='gs://glucose_data_dse/ahr414_glucose_sample - ahr414_glucose_sample.csv'
reading_data=Reading_Data(data_location=data_bucket_loc)
glucose_df=reading_data.read_in_data()

# Preview the first 3 rows
glucose_df.show(3)

                                                                                

+-------------------+-------------------+--------------------+-------------------+--------------------+------+--------------+--------------------+--------------+-------------------+-------------------+----------------------+--------------------+---------------+-------------------+-------------------+---------------------+-----+------+----------+---------+------------+--------------+----------------+
|           PostDate|      IngestionDate|              PostID|           PostTime|           PatientID| Stram|SequenceNumber|   TransmitterNumber|ReceiverNumber| RecordedSystemTime|RecordedDisplayTime|RecordedDisplayTimeRaw|       TransmitterId|TransmitterTime|  GlucoseSystemTime| GlucoseDisplayTime|GlucoseDisplayTimeRaw|Value|Status|TrendArrow|TrendRate|IsBackFilled|InternalStatus|SessionStartTime|
+-------------------+-------------------+--------------------+-------------------+--------------------+------+--------------+--------------------+--------------+-------------------+-------------

In [7]:
# Statistical_Time_Series_Methods class: convert the PySpark DataFrame to pandas (toPandas) and compute the entropy
statistical_time_series_methods=Statistical_Time_Series_Methods(glucose_data=glucose_df.toPandas())
entropy=statistical_time_series_methods.entropy_calculation()

# Display the pandas DataFrame holding the 'Entropy' value
entropy

                                                                                

Unnamed: 0,Entropy
0,0.140083


In [8]:
# Data_Transformations class: run the sklearn preprocessing pipeline on the PySpark DataFrame
data_transformations=Data_Transformations(glucose_data=glucose_df)
dt_1=data_transformations.sklearn_pipeline()
# Preview the first 3 transformed rows
dt_1.show(3)

23/01/24 21:10:26 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 4:>                                                          (0 + 1) / 1]

+-----------+----------+--------------------+---------------------+--------------------+
|      Value| TrendRate|           PatientID|GlucoseDisplayTimeRaw|          TrendArrow|
+-----------+----------+--------------------+---------------------+--------------------+
| -0.7103891|0.28279045|tHu8WPnIffml5CL+A...|  2022-09-13 23:15:45|[0, 1, 0, 0, 0, 0...|
|-0.73639596|0.15356635|tHu8WPnIffml5CL+A...|  2022-09-13 23:20:45|[0, 1, 0, 0, 0, 0...|
| -0.7103891|0.15356635|tHu8WPnIffml5CL+A...|  2022-09-13 23:25:45|[0, 1, 0, 0, 0, 0...|
+-----------+----------+--------------------+---------------------+--------------------+
only showing top 3 rows



                                                                                