### One Hot Encoding

In [1]:
# !sudo apt update
# !sudo apt install openjdk-17-jre-headless -y
import pyspark
from pyspark import pandas as ps
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, lag, when, isnull
from pyspark.sql.types import StructType, StructField, \
StringType, IntegerType, TimestampType, DateType, FloatType
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.window import Window



In [2]:
# Spark settings as (key, value) pairs: run locally on all cores, name the
# application, and fix the number of shuffle partitions.
spark_settings = [
    ('spark.master', 'local[*]'),
    ('spark.app.name', 'Glucose_Analysis_Spark'),
    ('spark.sql.shuffle.partitions', '1500'),
]
conf = pyspark.SparkConf().setAll(spark_settings)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/01 19:20:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load a single part-file of the step-0 daily glucose data.
parquet_path = '/cephfs/stepped_glucose_data/step0_load_daily/parquet_0/part-00000-1c9c2511-4c2e-40c2-b7d1-7827039567e8-c000.snappy.parquet'
df = spark.read.parquet(parquet_path)

                                                                                

In [4]:
# Display the (column name, type) pairs of the loaded frame.
df.dtypes

[('PatientId', 'string'),
 ('Value', 'float'),
 ('GlucoseDisplayTime', 'timestamp'),
 ('GlucoseDisplayTimeRaw', 'string'),
 ('GlucoseDisplayDate', 'date')]

In [5]:
# Collect the distinct patient identifiers to the driver (a list of Row
# objects with a single PatientId field) and report how many there are.
patientIds = df.select('PatientId').distinct().collect()
print(len(patientIds))



5906


                                                                                

In [18]:
# Per-patient window ordered by reading time, so lag() yields the previous
# row of the same patient.
my_window = Window.partitionBy('PatientId').orderBy("GlucoseDisplayTime")

# First difference: Value minus the previous Value in the window.
# Where the difference is null (e.g. a patient's first row, which has no
# predecessor) it is replaced with 0.
df = df.withColumn("prev_value", lag(col("Value")).over(my_window))
first_delta = col("Value") - col("prev_value")
df = df.withColumn("FirstDiff", when(isnull(first_delta), 0)
                          .otherwise(first_delta))

# Second difference: FirstDiff minus the previous FirstDiff in the window,
# with the same null -> 0 rule.
# Fix: the original referenced `df.first_diff`, a column that is never
# created (the column is named `FirstDiff`), which raises AttributeError
# on a fresh kernel.
df = df.withColumn("prev_val_sec", lag(col("FirstDiff")).over(my_window))
second_delta = col("FirstDiff") - col("prev_val_sec")
df = df.withColumn("SecDiff", when(isnull(second_delta), 0)
                          .otherwise(second_delta))

# The lagged helper columns are only needed to compute the differences.
df = df.drop('prev_value', 'prev_val_sec')

df.show(10)

[Stage 79:>                                                         (0 + 1) / 1]

+--------------------+-----+-------------------+---------------------+------------------+---------+-------+
|           PatientId|Value| GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|FirstDiff|SecDiff|
+--------------------+-----+-------------------+---------------------+------------------+---------+-------+
|++KoJktK3094kd1l4...|144.0|2022-01-31 17:44:40| 2022-01-31T17:44:...|        2022-01-31|      0.0|    0.0|
|++KoJktK3094kd1l4...|150.0|2022-01-31 17:49:39| 2022-01-31T17:49:...|        2022-01-31|      6.0|    6.0|
|++KoJktK3094kd1l4...|151.0|2022-01-31 17:54:40| 2022-01-31T17:54:...|        2022-01-31|      1.0|   -5.0|
|++KoJktK3094kd1l4...|152.0|2022-01-31 17:59:40| 2022-01-31T17:59:...|        2022-01-31|      1.0|    0.0|
+--------------------+-----+-------------------+---------------------+------------------+---------+-------+
only showing top 4 rows



                                                                                