### Save Raw Data to Patient Specific Storage

In [1]:
# load in imports
#!sudo apt-get update
#!sudo apt-get install openjdk-8-jdk
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, \
StringType, IntegerType, TimestampType, DateType, FloatType
import time
from pyspark.sql.functions import col, lag, when, isnull, lit
import pathlib
from pyspark.sql.functions import col, to_date
from pyspark.sql.window import Window

#from DifferenceFeature import add_difference_features

In [2]:
# Build the Spark configuration on its own (the only explicit setting is the
# application name), then obtain a session via getOrCreate() using it.
conf = pyspark.SparkConf().set('spark.app.name', 'ReduceData')
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/03 21:51:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/03 21:51:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Four-column glucose schema: patient id, display time (parsed and raw string),
# and the reading value. Every field is nullable.
# Not referenced by the other cells visible in this notebook section.
glucose_data_schema = (
    StructType()
    .add('PatientId', StringType(), True)
    .add('GlucoseDisplayTime', TimestampType(), True)
    .add('GlucoseDisplayTimeRaw', StringType(), True)
    .add('Value', FloatType(), True)
)

Grab the glucose records from the raw CSV files.

In [4]:
# Collect every CSV directly under /cephfs/data whose path mentions
# 'glucose_records', as plain strings in sorted order.
allPaths = sorted(
    str(csvPath)
    for csvPath in pathlib.Path('/cephfs/data').glob('*.csv')
    if 'glucose_records' in str(csvPath)
)

In [5]:
# Full column list used to parse the raw glucose-record CSVs. Each entry is
# (column name, Spark type class); every field is created as nullable.
# NOTE(review): RecordedDisplayTimeRaw is typed as a timestamp while
# GlucoseDisplayTimeRaw is a string — confirm this difference is intended.
raw_schema = StructType([
    StructField(fieldName, fieldType(), True)
    for fieldName, fieldType in (
        ('_c0', IntegerType),
        ('PostDate', TimestampType),
        ('IngestionDate', TimestampType),
        ('PostId', StringType),
        ('PostTime', TimestampType),
        ('PatientId', StringType),
        ('Stream', StringType),
        ('SequenceNumber', StringType),
        ('TransmitterNumber', StringType),
        ('ReceiverNumber', StringType),
        ('RecordedSystemTime', TimestampType),
        ('RecordedDisplayTime', TimestampType),
        ('RecordedDisplayTimeRaw', TimestampType),
        ('TransmitterId', StringType),
        ('TransmitterTime', StringType),
        ('GlucoseSystemTime', TimestampType),
        ('GlucoseDisplayTime', TimestampType),
        ('GlucoseDisplayTimeRaw', StringType),
        ('Value', FloatType),
        ('Status', StringType),
        ('TrendArrow', StringType),
        ('TrendRate', FloatType),
        ('IsBackFilled', StringType),
        ('InternalStatus', StringType),
        ('SessionStartTime', StringType),
    )
])

In [6]:
# Schema for the cohort CSV. The first column is declared with an empty name
# here; it is renamed to 'NumId' right after the file is loaded.
# Every field is nullable.
cohortSchema = (
    StructType()
    .add('', IntegerType(), True)
    .add('UserId', StringType(), True)
    .add('Gender', StringType(), True)
    .add('DOB', TimestampType(), True)
    .add('Age', IntegerType(), True)
    .add('DiabetesType', StringType(), True)
    .add('Treatment', StringType(), True)
)

In [7]:
# Load the cohort table with the explicit schema (the file has a header row),
# then give the empty-named first column the name 'NumId'.
cohortDf = (
    spark.read
    .option('delimiter', ',')
    .option('header', True)
    .schema(cohortSchema)
    .csv('/cephfs/data/cohort.csv')
    .withColumnRenamed('', 'NumId')
)

In [8]:
# Inspect the column names and Spark types of the loaded cohort table.
cohortDf.dtypes

[('NumId', 'int'),
 ('UserId', 'string'),
 ('Gender', 'string'),
 ('DOB', 'timestamp'),
 ('Age', 'int'),
 ('DiabetesType', 'string'),
 ('Treatment', 'string')]

In [9]:
# Same rename that is already chained onto the cohort load. withColumnRenamed
# does nothing when the source column ('') is absent, so this line is harmless
# once the load cell has run.
cohortDf = cohortDf.withColumnRenamed('', 'NumId')

In [16]:
# Distinct (UserId, NumId) pairs from the cohort; serves as the lookup table
# when the numeric id is joined onto the glucose readings.
patientIds = cohortDf.select('UserId', 'NumId').distinct()

In [11]:
# Preview the first 10 cohort rows.
cohortDf.show(10)

+-----+--------------------+------+-------------------+---+------------+---------+
|NumId|              UserId|Gender|                DOB|Age|DiabetesType|Treatment|
+-----+--------------------+------+-------------------+---+------------+---------+
|    0|5lZPrCk6qk8L6Jw+S...|Female|1931-01-01 00:00:00| 92|    type-two|       no|
|    1|9qY9mZ+GV5Kd/O/NB...|  Male|1937-01-01 00:00:00| 86|    type-two|       no|
|    2|uhsyLhr4Zl6NfGbNB...|Female|1938-01-01 00:00:00| 85|    type-two|       no|
|    3|9uAVHBOgoCJ9hfcrL...|  Male|1938-01-01 00:00:00| 85|    type-two|       no|
|    4|Fyb156jU1edGykL7N...|Female|1939-01-01 00:00:00| 84|    type-two|       no|
|    5|86XfZ0fNI0VWOzWrl...|Female|1939-01-01 00:00:00| 84|    type-two|       no|
|    6|JfJMH1qCpiYNuPOp/...|Female|1940-01-01 00:00:00| 83|    type-two|       no|
|    7|EkW0PD80req7mL/5S...|  Male|1940-01-01 00:00:00| 83|    type-two|       no|
|    8|OyqSKorAj1OPZaevj...|Female|1941-01-01 00:00:00| 82|    type-two|       no|
|   

In [13]:
# Read every glucose CSV in allPaths using raw_schema and keep only the four
# columns the later cells use. Mode DROPMALFORMED discards rows that do not
# parse under the schema.
# NOTE(review): no action is triggered in this cell, so the elapsed time
# printed below presumably covers only setting up the read, not scanning the
# files — confirm before quoting it as a load time.
startTime = time.time()
df = (
    spark.read
    .format('csv')
    .options(delimiter=',', mode='DROPMALFORMED', header=True)
    .schema(raw_schema)
    .load(allPaths)
    .select("PatientId", "Value", "GlucoseDisplayTime", "GlucoseDisplayTimeRaw")
)
print(time.time() - startTime)

                                                                                

1.3498187065124512


In [14]:
# Derive a date-only column from each reading's display timestamp.
df = df.withColumn(
    'GlucoseDisplayDate',
    to_date('GlucoseDisplayTime'),
)


In [21]:
# Attach each patient's numeric id: inner-join the readings to the cohort
# lookup on PatientId == UserId, keeping the reading columns plus NumId
# (UserId itself is not carried over). Then preview four rows.
joined = (
    df.join(patientIds, on=(df['PatientId'] == patientIds['UserId']), how='inner')
    .select(
        df['PatientId'],
        df['Value'],
        df['GlucoseDisplayTime'],
        df['GlucoseDisplayTimeRaw'],
        df['GlucoseDisplayDate'],
        patientIds['NumId'],
    )
)
joined.show(4)

+--------------------+-----+--------------------+---------------------+------------------+-----+
|           PatientId|Value|  GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|NumId|
+--------------------+-----+--------------------+---------------------+------------------+-----+
|1Jxgxke6R3Uh2c9aR...|  0.0|2022-02-01 14:45:...| 2022-02-01T14:45:...|        2022-02-01| 1560|
|toBStbTTYI2GU28Yd...|  0.0|2022-02-01 17:46:...| 2022-02-01T17:46:...|        2022-02-01| 6802|
|+XAhHhm+BkhqusxsZ...|  0.0|2022-02-01 14:58:...| 2022-02-01T14:58:...|        2022-02-01|  988|
|+XAhHhm+BkhqusxsZ...|  0.0|2022-01-31 22:53:...| 2022-01-31T22:53:...|        2022-01-31|  988|
+--------------------+-----+--------------------+---------------------+------------------+-----+
only showing top 4 rows



In [22]:
# Sanity check: confirm the join result is a Spark DataFrame.
type(joined)

pyspark.sql.dataframe.DataFrame

In [25]:
# Repartition the joined readings by PatientId, write them out as Parquet, and
# print the wall-clock seconds the write took.
# NOTE(review): no save mode is set, so this presumably fails if the target
# directory already exists — confirm before re-running the notebook.
# NOTE(review): repartition() groups rows by PatientId within the output files
# but does not create one directory per patient; if a per-patient layout is
# the goal, write.partitionBy(...) is the usual mechanism — confirm intent.
startTime = time.time()

(
    joined
    .repartition('PatientId')
    .write
    .parquet('/cephfs/data_by_patient/')
)

print(time.time() - startTime)

                                                                                

4661.077612400055
