### Save Raw Data to Patient Specific Storage

In [107]:
# load in imports
#!sudo apt-get update
#!sudo apt-get install openjdk-8-jdk
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, \
StringType, IntegerType, TimestampType, DateType, FloatType
import time
from pyspark.sql.functions import col, lag, when, isnull, lit
import pathlib
from pyspark.sql.functions import col, to_date
from pyspark.sql.window import Window

#from DifferenceFeature import add_difference_features

In [3]:
# Set the Spark application name, then get the existing session or create one.
conf = pyspark.SparkConf().setAll([('spark.app.name', 'ReduceData')])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/01 20:20:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/01 20:20:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Collect the Parquet part-file paths under `step0_load_daily`, one list per sub-directory.

In [29]:
# One inner list per child of step0_load_daily, holding the paths (as strings)
# of the entries in that child whose path ends with '.parquet'.
allPaths = [
    [str(part) for part in list(sub_dir.iterdir()) if str(part).endswith('.parquet')]
    for sub_dir in list(pathlib.Path('/cephfs/stepped_glucose_data/step0_load_daily').iterdir())
]

[['/cephfs/stepped_glucose_data/step0_load_daily/parquet_69/part-00000-3307b135-e04e-4652-9347-5b6570ce6af4-c000.snappy.parquet'], ['/cephfs/stepped_glucose_data/step0_load_daily/parquet_45/part-00000-1289f826-6f4a-4aef-9685-ebb35eefa1bc-c000.snappy.parquet'], ['/cephfs/stepped_glucose_data/step0_load_daily/parquet_118/part-00000-1c3d35f8-215c-4024-bebd-c9722ce2f656-c000.snappy.parquet'], ['/cephfs/stepped_glucose_data/step0_load_daily/parquet_60/part-00000-9f07ca32-193d-4c25-a0de-68639b872326-c000.snappy.parquet'], ['/cephfs/stepped_glucose_data/step0_load_daily/parquet_18/part-00000-e714659e-da18-43bc-814f-eb0b4da693b6-c000.snappy.parquet'], ['/cephfs/stepped_glucose_data/step0_load_daily/parquet_235/part-00000-8ef146cc-9f02-44dc-b4b2-700e970c070b-c000.snappy.parquet'], ['/cephfs/stepped_glucose_data/step0_load_daily/parquet_221/part-00000-55cf3904-ae54-4c46-9fba-fc882d0b3225-c000.snappy.parquet'], ['/cephfs/stepped_glucose_data/step0_load_daily/parquet_32/part-00000-2bd378d7-01cb-48

In [30]:
# number of entries in allPaths (one inner list per child of step0_load_daily
# when allPaths was built by the parquet-listing cell)
len(allPaths)

270

In [33]:
# Four-field schema, every field nullable. Not referenced by the other cells
# visible in this notebook.
glucose_data_schema = (
    StructType()
    .add('PatientId', StringType(), True)
    .add('GlucoseDisplayTime', TimestampType(), True)
    .add('GlucoseDisplayTimeRaw', StringType(), True)
    .add('Value', FloatType(), True)
)

Collect the raw glucose-record CSV paths from `/cephfs/data`.

In [114]:
# Paths (as strings) of the *.csv files directly under /cephfs/data whose path
# contains 'glucose_records', in ascending string order.
# Note: this rebinds allPaths, which the parquet-listing cell defines as a list
# of lists, to a flat list of strings.
allPaths = sorted(
    str(csv_path)
    for csv_path in pathlib.Path('/cephfs/data').glob('*.csv')
    if 'glucose_records' in str(csv_path)
)

In [115]:
# Explicit schema for the raw glucose-record CSVs: 25 nullable fields, in the
# order the reader maps them onto the file's columns.
# NOTE(review): RecordedDisplayTimeRaw is typed as a timestamp while
# GlucoseDisplayTimeRaw is a string — confirm the difference is intended.
raw_schema = (
    StructType()
    .add('_c0', IntegerType(), True)
    .add('PostDate', TimestampType(), True)
    .add('IngestionDate', TimestampType(), True)
    .add('PostId', StringType(), True)
    .add('PostTime', TimestampType(), True)
    .add('PatientId', StringType(), True)
    .add('Stream', StringType(), True)
    .add('SequenceNumber', StringType(), True)
    .add('TransmitterNumber', StringType(), True)
    .add('ReceiverNumber', StringType(), True)
    .add('RecordedSystemTime', TimestampType(), True)
    .add('RecordedDisplayTime', TimestampType(), True)
    .add('RecordedDisplayTimeRaw', TimestampType(), True)
    .add('TransmitterId', StringType(), True)
    .add('TransmitterTime', StringType(), True)
    .add('GlucoseSystemTime', TimestampType(), True)
    .add('GlucoseDisplayTime', TimestampType(), True)
    .add('GlucoseDisplayTimeRaw', StringType(), True)
    .add('Value', FloatType(), True)
    .add('Status', StringType(), True)
    .add('TrendArrow', StringType(), True)
    .add('TrendRate', FloatType(), True)
    .add('IsBackFilled', StringType(), True)
    .add('InternalStatus', StringType(), True)
    .add('SessionStartTime', StringType(), True)
)

In [44]:
def add_difference_features(df):
    """Add per-patient first and second differences of ``Value``.

    Rows are ordered by ``GlucoseDisplayTime`` within each ``PatientId``.
    ``FirstDiff`` is ``Value`` minus the previous row's ``Value``; ``SecDiff``
    is ``FirstDiff`` minus the previous row's ``FirstDiff``. Wherever a
    difference evaluates to null (for example, when there is no previous row),
    0 is stored instead.

    Args:
        df: DataFrame with ``PatientId``, ``GlucoseDisplayTime`` and ``Value``
            columns.

    Returns:
        The DataFrame with ``FirstDiff`` and ``SecDiff`` added. The helper
        columns ``prev_value`` and ``prev_val_sec`` are dropped before
        returning.
    """
    patient_timeline = Window.partitionBy('PatientId').orderBy("GlucoseDisplayTime")

    # First difference: current reading minus the previous reading.
    with_prev_value = df.withColumn(
        "prev_value", lag(df["Value"]).over(patient_timeline)
    )
    first_delta = with_prev_value["Value"] - with_prev_value["prev_value"]
    with_first_diff = with_prev_value.withColumn(
        "FirstDiff", when(isnull(first_delta), 0).otherwise(first_delta)
    )

    # Second difference: current FirstDiff minus the previous FirstDiff.
    with_prev_diff = with_first_diff.withColumn(
        "prev_val_sec", lag(with_first_diff["FirstDiff"]).over(patient_timeline)
    )
    second_delta = with_prev_diff["FirstDiff"] - with_prev_diff["prev_val_sec"]
    with_second_diff = with_prev_diff.withColumn(
        "SecDiff", when(isnull(second_delta), 0).otherwise(second_delta)
    )

    return with_second_diff.drop('prev_value', 'prev_val_sec')

In [99]:
# Explicit schema for cohort.csv; every field is nullable. The first field has
# an empty name.
cohortSchema = (
    StructType()
    .add('', IntegerType(), True)
    .add('UserId', StringType(), True)
    .add('Gender', StringType(), True)
    .add('DOB', TimestampType(), True)
    .add('Age', IntegerType(), True)
    .add('DiabetesType', StringType(), True)
    .add('Treatment', StringType(), True)
)

In [128]:
# Read the cohort table with the explicit schema, then give the schema's
# empty-named first column the name NumId.
cohortDf = (
    spark.read
    .option('delimiter', ',')
    .option('header', True)
    .schema(cohortSchema)
    .csv('/cephfs/data/cohort.csv')
    .withColumnRenamed('', 'NumId')
)

In [129]:
# list the (column name, type) pairs to check the schema applied to the cohort table
cohortDf.dtypes

[('NumId', 'int'),
 ('UserId', 'string'),
 ('Gender', 'string'),
 ('DOB', 'timestamp'),
 ('Age', 'int'),
 ('DiabetesType', 'string'),
 ('Treatment', 'string')]

In [103]:
# NOTE(review): the cell that reads cohort.csv already chains this same rename, and
# withColumnRenamed is documented as a no-op when the source column is absent, so this
# cell looks redundant — confirm and remove.
cohortDf = cohortDf.withColumnRenamed('', 'NumId')

In [104]:
# preview the first 10 cohort rows
cohortDf.show(10)

+-----+--------------------+------+-------------------+---+------------+---------+
|NumId|              UserId|Gender|                DOB|Age|DiabetesType|Treatment|
+-----+--------------------+------+-------------------+---+------------+---------+
|    0|5lZPrCk6qk8L6Jw+S...|Female|1931-01-01 00:00:00| 92|    type-two|       no|
|    1|9qY9mZ+GV5Kd/O/NB...|  Male|1937-01-01 00:00:00| 86|    type-two|       no|
|    2|uhsyLhr4Zl6NfGbNB...|Female|1938-01-01 00:00:00| 85|    type-two|       no|
|    3|9uAVHBOgoCJ9hfcrL...|  Male|1938-01-01 00:00:00| 85|    type-two|       no|
|    4|Fyb156jU1edGykL7N...|Female|1939-01-01 00:00:00| 84|    type-two|       no|
|    5|86XfZ0fNI0VWOzWrl...|Female|1939-01-01 00:00:00| 84|    type-two|       no|
|    6|JfJMH1qCpiYNuPOp/...|Female|1940-01-01 00:00:00| 83|    type-two|       no|
|    7|EkW0PD80req7mL/5S...|  Male|1940-01-01 00:00:00| 83|    type-two|       no|
|    8|OyqSKorAj1OPZaevj...|Female|1941-01-01 00:00:00| 82|    type-two|       no|
|   

In [125]:
# Load each raw CSV, keep the glucose columns, derive a date column, and append
# each patient's rows to that patient's own parquet directory.
#
# The write mode is 'append', so re-running this cell duplicates rows: delete
# the output directory before a full run.
#!rm -r /cephfs/patient_data/

# Run limits. 1 / 1 is the quick smoke test (first patient of the first file);
# set both to None to process every file and every patient.
MAX_FILES = 1
MAX_PATIENTS_PER_FILE = 1
# Rows displayed per patient before writing; set to 0 to skip the preview.
PREVIEW_ROWS = 4

startTime = time.time()

# Resolve UserId -> NumId once instead of running one Spark lookup per patient.
# setdefault keeps the first collected row if a UserId appears more than once.
numIdByUserId = {}
for cohortRow in cohortDf.select('UserId', 'NumId').collect():
    numIdByUserId.setdefault(cohortRow.UserId, cohortRow.NumId)

for path in allPaths[:MAX_FILES]:
    df = (
        spark.read
        .format('csv')
        .option('delimiter', ',')
        .option("mode", "DROPMALFORMED")
        .option("header", True)
        .schema(raw_schema)
        .load(path)
        .select(col("PatientId"), col("Value"),
                col("GlucoseDisplayTime"), col("GlucoseDisplayTimeRaw"))
        .withColumn('GlucoseDisplayDate', to_date(col('GlucoseDisplayTime')))
    )

    patientIds = df.select('PatientId').distinct().collect()

    # NOTE: df is not cached, so each per-patient filter below re-reads the CSV.
    for patIds in patientIds[:MAX_PATIENTS_PER_FILE]:
        numId = numIdByUserId.get(patIds.PatientId)
        if numId is None:
            # No usable cohort entry: skip rather than fail the whole run.
            print('skipping PatientId %r: no NumId found in cohort' % (patIds.PatientId,))
            continue

        patData = df.filter(df.PatientId == patIds.PatientId)
        # Optional feature step (currently disabled):
        # patData = add_difference_features(patData)
        patData = patData.withColumn('Num_Id', lit(numId))

        if PREVIEW_ROWS:
            patData.show(PREVIEW_ROWS)

        # One output file per write, appended under the patient's directory.
        patData.repartition(1).write\
                    .mode('append')\
                    .parquet('/cephfs/patient_data/patient_' + str(numId))

endTime = time.time()
print(endTime-startTime)

# timing note from an earlier run: one parquet file == 167 sec... without saving

                                                                                

+--------------------+-----+-------------------+---------------------+------------------+---------+-------+------+
|           PatientId|Value| GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|FirstDiff|SecDiff|Num_Id|
+--------------------+-----+-------------------+---------------------+------------------+---------+-------+------+
|++4U2++WbRmSmSRv+...|142.0|2022-01-31 17:42:33| 2022-01-31T17:42:...|        2022-01-31|      0.0|    0.0|  6452|
|++4U2++WbRmSmSRv+...|138.0|2022-01-31 17:47:33| 2022-01-31T17:47:...|        2022-01-31|     -4.0|   -4.0|  6452|
|++4U2++WbRmSmSRv+...|150.0|2022-01-31 17:52:32| 2022-01-31T17:52:...|        2022-01-31|     12.0|   16.0|  6452|
|++4U2++WbRmSmSRv+...|148.0|2022-01-31 17:57:32| 2022-01-31T17:57:...|        2022-01-31|     -2.0|  -14.0|  6452|
+--------------------+-----+-------------------+---------------------+------------------+---------+-------+------+
only showing top 4 rows



[Stage 4858:>                                                       (0 + 1) / 1]

20.970186710357666


                                                                                