In [1]:
from datetime import datetime, timedelta
import time
import pathlib
import pyspark
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import when, col, to_date, date_trunc, rank, monotonically_increasing_id, date_trunc, min, max
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DateType, FloatType, BooleanType

# spark.stop()

In [2]:
conf = pyspark.SparkConf().setAll([\
    ('spark.app.name', 'ReduceData')])
spark = SparkSession.builder.config(conf=conf)\
    .getOrCreate()

spark.version

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/17 01:38:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


'3.3.1'

In [3]:
"""the raw data's column descriptions"""
test = spark.read.option("multiline","true").json('/cephfs/data/glucose_records.json')
# test = test.filter(col('title')=="GlucoseSystemTime")
test.show(24,truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+-------------------------+
|description                                                                

### Structs we might need

In [5]:
glucose_data_schema=StructType([StructField('NumId', IntegerType(), True),
                                StructField('PatientId', StringType(), True),
                                StructField('Value', FloatType(), True),
                                StructField('GlucoseDisplayTime', TimestampType(), True),
                                StructField('GlucoseDisplayTimeRaw', StringType(), True),
                                StructField('GlucoseDisplayDate', DateType(), True)])

temp_schema = StructType([StructField('PatientId', StringType(), True),
                          StructField('count', IntegerType(), True)])


# glucose_data_schema=StructType([StructField('PatientId', StringType(), True),
#                                 StructField('Value', FloatType(), True),
#                                 StructField('GlucoseDisplayTime', TimestampType(), True),
#                                 StructField('GlucoseDisplayTimeRaw', StringType(), True),
#                                 StructField('GlucoseDisplayDate', DateType(), True),
#                                 StructField('NumId', IntegerType(), True)
#                                 ])

raw_schema=StructType([StructField('_c0', IntegerType(),True),
                                StructField('PostDate', TimestampType(),True),
                                StructField('IngestionDate', TimestampType(),True),
                                StructField('PostId', StringType(),True),
                                StructField('PostTime', TimestampType(), True),
                                StructField('PatientId', StringType(), True),
                                StructField('Stream', StringType(), True),
                                StructField('SequenceNumber', StringType(), True),
                                StructField('TransmitterNumber', StringType(), True),
                                StructField('ReceiverNumber', StringType(), True),
                                StructField('RecordedSystemTime', TimestampType(), True),
                                StructField('RecordedDisplayTime', TimestampType(), True),
                                StructField('RecordedDisplayTimeRaw', TimestampType(), True),
                                StructField('TransmitterId', StringType(), True),
                                StructField('TransmitterTime', StringType(), True),
                                StructField('GlucoseSystemTime', TimestampType(), True),
                                StructField('GlucoseDisplayTime', TimestampType(), True),
                                StructField('GlucoseDisplayTimeRaw', StringType(), True),
                                StructField('Value', FloatType(), True),
                                StructField('Status', StringType(), True),
                                StructField('TrendArrow', StringType(), True),
                                StructField('TrendRate', FloatType(), True),
                                StructField('IsBackFilled', BooleanType(), True),
                                StructField('InternalStatus', StringType(), True),
                                StructField('SessionStartTime', StringType(), True)])

cohortSchema = StructType([StructField('', IntegerType(), True),
                        StructField('UserId', StringType(), True),
                        StructField('Gender', StringType(), True),
                        StructField('DOB', TimestampType(), True),
                        StructField('Age', IntegerType(), True),
                        StructField('DiabetesType', StringType(), True),
                        StructField('Treatment', StringType(), True)
                        ])

### Get all paths in to read from

In [6]:
'''all CSVs of the raw data'''
allCSVPaths = [str(x) for x in list(pathlib.Path('/cephfs/data').glob('*.csv')) if 'glucose_records' in str(x)]

allCSVPaths.sort()

len(allCSVPaths)

365

# Load in

## Read in the Cohort dataframe

In [10]:
# read in cohort dataframe, with Number ID properly labeled
startTime = time.time()

cohortDf = spark.read.options(delimiter=',')\
        .csv('/cephfs/data/cohort.csv', header=True, schema=cohortSchema)\
        .withColumnRenamed('', 'NumId')

print(cohortDf.dtypes)
cohortDf.show(10)

# make mini dataframe of the string IDs and number IDs
patientIds = cohortDf.select(col('UserId'), col('NumId')).distinct()

print(time.time() - startTime)

[('NumId', 'int'), ('UserId', 'string'), ('Gender', 'string'), ('DOB', 'timestamp'), ('Age', 'int'), ('DiabetesType', 'string'), ('Treatment', 'string')]


[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------------------+------+-------------------+---+------------+---------+
|NumId|              UserId|Gender|                DOB|Age|DiabetesType|Treatment|
+-----+--------------------+------+-------------------+---+------------+---------+
|    0|5lZPrCk6qk8L6Jw+S...|Female|1931-01-01 00:00:00| 92|    type-two|       no|
|    1|9qY9mZ+GV5Kd/O/NB...|  Male|1937-01-01 00:00:00| 86|    type-two|       no|
|    2|uhsyLhr4Zl6NfGbNB...|Female|1938-01-01 00:00:00| 85|    type-two|       no|
|    3|9uAVHBOgoCJ9hfcrL...|  Male|1938-01-01 00:00:00| 85|    type-two|       no|
|    4|Fyb156jU1edGykL7N...|Female|1939-01-01 00:00:00| 84|    type-two|       no|
|    5|86XfZ0fNI0VWOzWrl...|Female|1939-01-01 00:00:00| 84|    type-two|       no|
|    6|JfJMH1qCpiYNuPOp/...|Female|1940-01-01 00:00:00| 83|    type-two|       no|
|    7|EkW0PD80req7mL/5S...|  Male|1940-01-01 00:00:00| 83|    type-two|       no|
|    8|OyqSKorAj1OPZaevj...|Female|1941-01-01 00:00:00| 82|    type-two|       no|
|   

                                                                                

## Read in the CSVs

## Read in the counter parquet (temp_data)

In [None]:
startTime = time.time()

df = spark.read\
    .parquet('temp_data/part-00000-bcc637a6-2bd8-4c20-bc2c-42de80cad98b-c000.snappy.parquet')

df = df.withColumnRenamed('PatientId','UserId')
df = df.withColumn('count',col('count').cast(IntegerType()))

print("row count: ", df.count())
df.printSchema()
df.show()
patIds = [i.UserId for i in df.select('UserId').distinct().collect()]
len(patIds)

print(time.time() - startTime)

row count:  7957
root
 |-- UserId: string (nullable = true)
 |-- count: integer (nullable = true)

+--------------------+------+
|              UserId| count|
+--------------------+------+
|+Gr/1qOf9OWMa4LOL...| 94433|
|+TH+y0M8bQb2EpRVJ...| 84416|
|+UJpU9Owd5VBvkNtJ...| 99362|
|+PvW5D8b0wZANElHi...|101336|
|+9Rtj/cdJMLF2raCJ...| 89139|
|+JcspO9meT7uENVtz...| 97617|
|+BPY2YsPzI4b+DwiN...| 82083|
|+Ld/+ikzUehNOZ5f+...|100467|
|+VCEKhqF9R2oI0VGv...| 91595|
|+UgMn9AIL2d78WOS4...| 94862|
|+M7reUgRS+P3suzYg...| 86859|
|+O6WjkJ9Gb5esPEgG...|101657|
|+9ipwJiIRcZTJrck+...|103118|
|+F5NQV8YRae91R0Og...|100282|
|+XcQw7Zbj9WPeWsDq...| 99547|
|+DkuZFXuoapNjKE9g...| 85436|
|+HzKQWGJels56+L6u...| 95393|
|+8cP93h83g3DYhiXN...| 83085|
|+7nbCvVCmsMWaCjcH...| 92185|
|+XHDtL8igY38vN48Q...|102874|
+--------------------+------+
only showing top 20 rows



7957

# EDA

## see % of the data to be interpolated

can go through all the training data (in `/cephfs/train_test_val/train`) and check what % of the data is getting interpolated

In [6]:
"""read in the training parquets"""
data_location = "/cephfs/train_test_val/train"
allPaths = [str(x) for x in list(pathlib.Path(data_location).glob('*.parquet')) if 'part-00' in str(x)]
allPaths.sort()
len(allPaths)

199

In [None]:
"""rerun this cell and change j (=0 to =199) and run the next cell too"""
startTime = time.time()

# for j in range(len(allPaths)):
j = 1

pyspark_glucose_data = spark.read \
                       .schema(glucose_data_schema) \
                       .format('parquet') \
                       .load(allPaths[j])
patIds = [i.NumId for i in pyspark_glucose_data.select('NumId').distinct().collect()]
print(j,": ",len(patIds))

print(time.time()-startTime)

1 :  39
0.1841890811920166


In [68]:
"""use the min and max GlucoseDisplayTime values and see how much is gonna get interpolated"""
startTime = time.time()

min_max = pyspark_glucose_data.groupby('PatientId')\
            .agg(min("GlucoseDisplayTime").alias("Min"),\
                 max("GlucoseDisplayTime").alias("Max"))
print("starting row count: ", min_max.count())

min_max = min_max.withColumn('ExpectedRecords',(col("Max").cast("long") - col('Min').cast("long"))/(60*5))

temp = min_max.join(df, min_max.PatientId == df.UserId)#.select(min_max.PatientId, min_max.ExpectedRecords, df.count)
temp = temp.drop("Min","Max","UserId")

temp = temp.withColumn('discrepancy%',100*(col("ExpectedRecords").cast("integer") / col('count'))-100)
temp = temp.orderBy('discrepancy%', ascending=False)

length = temp.count()
print("ending row count: ", length)
temp.show(length, truncate=False)

print(time.time()-startTime)

starting row count:  39
ending row count:  39
+--------------------------------------------+---------------+------+------------------+
|PatientId                                   |ExpectedRecords|count |discrepancy%      |
+--------------------------------------------+---------------+------+------------------+
|aRcdptX0j0XO5JiZjvcRhh/2hhLNDeIFccfFLPKXYG4=|104988.4       |71212 |47.430208391844076|
|Z9bKeYamn2rfzIvygFMtUMAnmvNWlLv5i0L6F491CDE=|94616.0        |71093 |33.08764575977946 |
|R8bQL1K4vkc1t1A1LppgK7zgjOQ3aApOV635AZaXzo8=|105041.8       |80634 |30.268869211498867|
|rWBn04Ywl5IV9Dnzg2UrZr2hv0znnTM9TZAcYnxrnKQ=|101467.0       |80818 |25.550001237348113|
|O3bOa6QuOQxus7y6YAwm/GYJMKbHIkQQ8mvhTRapn3Y=|105009.2       |83691 |25.472273004265688|
|XYvnckGwCuSRi9d72KltoyJAQoKj0ae1HC5WgqfgEjQ=|104311.4       |87478 |19.24255241317816 |
|uU3BWosfaHwu6vcHPMpXI1A8WrJIFJrOSrG+wo4Q6xk=|100855.8       |84924 |18.75912580660355 |
|kcnYD5XlN/B3VCJLdEgkjayJ8bRiCinjDwxJe2vmPs0=|104379.6       |88

## is it a good idea to truncate CSVs by their date?

In [7]:
startTime = time.time()

path_i = allCSVPaths[0]

df = spark.read\
    .format('csv')\
    .option('delimiter', ',')\
    .option("mode", "DROPMALFORMED")\
    .option("header", True)\
    .schema(raw_schema)\
    .load(path_i)\
    .select(col("PatientId"), col("Value"), \
            col("GlucoseDisplayTime"), col("GlucoseSystemTime"))

df = df.where("Value > 0")
df = df.na.drop(subset=['PatientId','Value','GlucoseDisplayTime'])

patIds = [i.PatientId for i in df.select('PatientId').distinct().collect()]
print(len(patIds), "total patients")
print("row count: ", df.count())
df.show(truncate=False)

print("time: ", time.time()-startTime)

                                                                                

5708 total patients


                                                                                

row count:  1578582
+--------------------------------------------+-----+-------------------+-------------------+
|PatientId                                   |Value|GlucoseDisplayTime |GlucoseSystemTime  |
+--------------------------------------------+-----+-------------------+-------------------+
|d0dv9A8no1AasCnoLNVfbrtVIQ5FjWrqiLvf0NOZLFQ=|134.0|2022-02-01 00:39:42|2022-02-01 05:39:42|
|d0dv9A8no1AasCnoLNVfbrtVIQ5FjWrqiLvf0NOZLFQ=|103.0|2022-02-01 04:19:42|2022-02-01 09:19:42|
|d0dv9A8no1AasCnoLNVfbrtVIQ5FjWrqiLvf0NOZLFQ=|126.0|2022-02-01 02:29:41|2022-02-01 07:29:41|
|d0dv9A8no1AasCnoLNVfbrtVIQ5FjWrqiLvf0NOZLFQ=|93.0 |2022-02-01 08:24:43|2022-02-01 13:24:43|
|d0dv9A8no1AasCnoLNVfbrtVIQ5FjWrqiLvf0NOZLFQ=|94.0 |2022-02-01 05:54:42|2022-02-01 10:54:42|
|d0dv9A8no1AasCnoLNVfbrtVIQ5FjWrqiLvf0NOZLFQ=|103.0|2022-02-01 12:24:45|2022-02-01 17:24:45|
|d0dv9A8no1AasCnoLNVfbrtVIQ5FjWrqiLvf0NOZLFQ=|90.0 |2022-02-01 11:59:45|2022-02-01 16:59:45|
|d0dv9A8no1AasCnoLNVfbrtVIQ5FjWrqiLvf0NOZLFQ=|102.

In [8]:
dt_start= datetime.strptime(path_i[29:39] + " 00:00:00", '%Y-%m-%d %H:%M:%S') - timedelta(hours=1)
dt_end  = datetime.strptime(path_i[29:39] + " 23:59:59", '%Y-%m-%d %H:%M:%S')
str(dt_start)

'2022-01-31 23:00:00'

In [9]:
startTime = time.time()

# save the "overflow"
before = df.where("GlucoseSystemTime <= '" + str(dt_start) + "'")
after = df.where("GlucoseSystemTime >= '" + path_i[29:39] + " 23:59:59'")

# actually filter the df
df0 = df.where("GlucoseSystemTime >= '" + path_i[29:39] + " 00:00:00'")
df0 = df.where("GlucoseSystemTime <= '" + path_i[29:39] + " 23:59:59'")

patIds = [i.PatientId for i in df0.select('PatientId').distinct().collect()]
print(len(patIds), "total patients")
print("row count: ", df0.count())
df0.show()

print("time: ", time.time()-startTime)

                                                                                

5708 total patients


                                                                                

row count:  1578582
+--------------------+-----+-------------------+-------------------+
|           PatientId|Value| GlucoseDisplayTime|  GlucoseSystemTime|
+--------------------+-----+-------------------+-------------------+
|d0dv9A8no1AasCnoL...|134.0|2022-02-01 00:39:42|2022-02-01 05:39:42|
|d0dv9A8no1AasCnoL...|103.0|2022-02-01 04:19:42|2022-02-01 09:19:42|
|d0dv9A8no1AasCnoL...|126.0|2022-02-01 02:29:41|2022-02-01 07:29:41|
|d0dv9A8no1AasCnoL...| 93.0|2022-02-01 08:24:43|2022-02-01 13:24:43|
|d0dv9A8no1AasCnoL...| 94.0|2022-02-01 05:54:42|2022-02-01 10:54:42|
|d0dv9A8no1AasCnoL...|103.0|2022-02-01 12:24:45|2022-02-01 17:24:45|
|d0dv9A8no1AasCnoL...| 90.0|2022-02-01 11:59:45|2022-02-01 16:59:45|
|d0dv9A8no1AasCnoL...|102.0|2022-02-01 08:54:43|2022-02-01 13:54:43|
|d0dv9A8no1AasCnoL...| 95.0|2022-02-01 05:49:42|2022-02-01 10:49:42|
|d0dv9A8no1AasCnoL...|201.0|2022-01-31 20:19:41|2022-02-01 01:19:41|
|d0dv9A8no1AasCnoL...|115.0|2022-02-01 03:54:42|2022-02-01 08:54:42|
|d0dv9A8no1Aas

In [16]:
after = df.where("GlucoseSystemTime >= '" + path_i[29:39] + " 23:00:00'")
# after = after.orderBy("PatientId", "GlucoseDisplayTime", ascending=True)
print("row count: ", after.count())
after.show(truncate=False)

row count:  26143
+--------------------------------------------+-----+-------------------+-------------------+
|PatientId                                   |Value|GlucoseDisplayTime |GlucoseSystemTime  |
+--------------------------------------------+-----+-------------------+-------------------+
|f/odpYRZXAvqV2R8ACkoBHIuhYQZeBf3HWPwTldya4Q=|138.0|2022-02-01 16:19:16|2022-02-01 23:19:16|
|f/odpYRZXAvqV2R8ACkoBHIuhYQZeBf3HWPwTldya4Q=|144.0|2022-02-01 16:09:16|2022-02-01 23:09:16|
|3rbpIFNw48RfNHesImaGrV7qlOKuvu82Qtruj8arChs=|150.0|2022-02-01 17:00:24|2022-02-01 23:00:24|
|b7vxHPEbZUCsAduByMXg3jaCG29POUCICc3jdyW6HAw=|214.0|2022-02-01 15:02:33|2022-02-01 23:02:33|
|zAJ4HtjLqb8RGL12tYWk8f2KxgnZa5lC0B4yyb0ggDk=|92.0 |2022-02-01 18:17:48|2022-02-01 23:17:48|
|nST9cqhaXsuhZbe/9SAjRdQLLtNZ1BU4qS6aSsdShhA=|130.0|2022-02-01 17:20:54|2022-02-01 23:20:54|
|pmxSamcIl3qpJF5zN0s+H8Fuqg6T3ElqPBkDVW7EIdc=|123.0|2022-02-01 17:09:43|2022-02-01 23:09:43|
|Rjp5oXD4KpRyaN/DMiA4PcmBSDY+3HG7Af2hT3MCdfA=|142.0|

In [29]:
startTime = time.time()

path_i = allCSVPaths[1]

df1 = spark.read\
    .format('csv')\
    .option('delimiter', ',')\
    .option("mode", "DROPMALFORMED")\
    .option("header", True)\
    .schema(raw_schema)\
    .load(path_i)\
    .select(col("PatientId"), col("Value"), \
            col("GlucoseDisplayTime"), col("GlucoseSystemTime"))

df1 = df1.where("Value > 0")
df1 = df1.na.drop(subset=['PatientId','Value','GlucoseDisplayTime'])
before = df1.where("GlucoseSystemTime <= '" + path_i[29:39] + " 00:00:00'")

print("time: ", time.time()-startTime)

time:  0.042630910873413086


In [30]:
# dt_start = datetime.strptime(path_i[29:39] + " 00:00:00", '%Y-%m-%d %H:%M:%S') - timedelta(hours=2)
# print(dt_start)

before = df1.where("GlucoseSystemTime <= '" + str(dt_start) + "'")
beforePatIds = [i.PatientId for i in before.select('PatientId').distinct().collect()]

before = before.orderBy("PatientId", "GlucoseDisplayTime", ascending=True)
print("row count: ", before.count())
before.show(truncate=False)

                                                                                

row count:  3646
+--------------------------------------------+-----+-------------------+-------------------+
|PatientId                                   |Value|GlucoseDisplayTime |GlucoseSystemTime  |
+--------------------------------------------+-----+-------------------+-------------------+
|//JnO/6idyeOwAonWuEoNJ3JoK06f6km20B7FxqZ38w=|222.0|2022-02-01 11:18:11|2022-02-01 16:18:11|
|//JnO/6idyeOwAonWuEoNJ3JoK06f6km20B7FxqZ38w=|223.0|2022-02-01 11:23:11|2022-02-01 16:23:11|
|//JnO/6idyeOwAonWuEoNJ3JoK06f6km20B7FxqZ38w=|226.0|2022-02-01 11:28:11|2022-02-01 16:28:11|
|//JnO/6idyeOwAonWuEoNJ3JoK06f6km20B7FxqZ38w=|231.0|2022-02-01 11:33:11|2022-02-01 16:33:11|
|//JnO/6idyeOwAonWuEoNJ3JoK06f6km20B7FxqZ38w=|227.0|2022-02-01 11:38:11|2022-02-01 16:38:11|
|//JnO/6idyeOwAonWuEoNJ3JoK06f6km20B7FxqZ38w=|223.0|2022-02-01 11:43:11|2022-02-01 16:43:11|
|//JnO/6idyeOwAonWuEoNJ3JoK06f6km20B7FxqZ38w=|221.0|2022-02-01 11:48:11|2022-02-01 16:48:11|
|//JnO/6idyeOwAonWuEoNJ3JoK06f6km20B7FxqZ38w=|218.0|2

In [28]:
temp = df.filter(col('PatientId').isin(beforePatIds))
temp = temp.groupby('PatientId')\
            .agg(max("GlucoseSystemTime").alias("Max"),
                min("GlucoseSystemTime").alias("Min"))
print("row count: ", temp.count())
temp.orderBy("Max", ascending=False).show(truncate=False)
# temp.show(truncate=False)

                                                                                

row count:  158




+--------------------------------------------+-------------------+-------------------+
|PatientId                                   |Max                |Min                |
+--------------------------------------------+-------------------+-------------------+
|iVzLmZ0qiAgQKiLQFCa6k/+JdcLocroJyd72B22g8JM=|2022-02-01 23:45:44|2022-01-31 12:05:36|
|VKK+9Qh5AlToREMUNotiFavFRULltG9rQIZWbxupHZA=|2022-02-01 23:28:42|2022-01-31 23:18:40|
|bXGY4Py7dkiNVyDNdaTuKM5OhY9Et38RslaHKUuR6wk=|2022-02-01 23:06:09|2022-01-31 23:36:09|
|bs2bLcu34yK/GLeKhiuSznWumaaz1xcIz1wdrQoOVkw=|2022-02-01 22:57:00|2022-02-01 00:16:56|
|T7Hsczc1+Sl9OVIJYqPp2/pfOwnKB3/cjR1ozu/KBsc=|2022-02-01 22:48:03|2022-01-31 19:58:00|
|kpy290WrhyMebRMPHIphFpuSMGF+DNJyd/SgjyEcyAY=|2022-02-01 22:41:04|2022-01-31 23:06:01|
|PxgR1NdvahfzxzNspwMY2hoeYRukylhCQFlLvfICGOY=|2022-02-01 21:54:56|2022-01-31 23:04:58|
|q87E2PL8eeQdaxxkFR9zuDylYwxeUly+I/3sSObEm4I=|2022-02-01 21:53:31|2022-01-31 21:33:29|
|gC37IFDH+MySu7RVLp5tX+oz5E83842n2UxX3qB+J8

                                                                                

In [46]:
# temp = df.filter(col('PatientId').isin(patIds))
temp = before.groupby('PatientId').count()
temp = temp.orderBy("count", ascending=False)
temp.show(truncate=False)

[Stage 233:>                                                      (0 + 16) / 16]

+--------------------------------------------+-----+
|PatientId                                   |count|
+--------------------------------------------+-----+
|pVBBAjbTS/cd0ljowyrdwDAc3fRs/XeR/CfKq0l9T68=|6421 |
|hG3tNW2bodpcbOAnwKtv18XZt51yglPzifxOeFkNrVU=|5945 |
|MnfwjA6zL88a59KEDg1InBqNcBdIyhrCGry4K3Fq5rU=|1509 |
|lZ7UDPuO7YAmDrNzKK0Rt+s9AdskMlZR5KcLVecSgC0=|1404 |
|7hkFsZsZAZ7saYIlL1npE9ZDNs5rEkFMrWYZDMZOygQ=|975  |
|NYUV/N1W962H7viwGPL3wNrelpf6jQeUEwYrkY9zJyg=|631  |
|4s5/P5SKdaWqlBunmNE5Q8xSsa766t33qJyJRB86934=|489  |
|8u4jp/2u+jXZkJMirv2FPMSCNqxshT3NPT89A2YGgQU=|449  |
|TXIctmUxSWkHCjPnK4XovmJx64Ko3ohEs/hbpPTjxEA=|406  |
|wU8jt4ZqpMq5G9ED7x7SjYgYMJEazqSh5o0ZCS+jgKM=|391  |
|WiUIAEyMCn4ZeStqSbexT5fyDXOVXwqvVpgJ3NLObGY=|385  |
|+4EbSpcZWBDKPKDwTpg6vuvLgiaezxG+E4854g1svkY=|371  |
|iUWj8ymH9HoRVbzikJ/eS1EXpLR8VFxVTF9hnuxt5X0=|358  |
|u7PZ+G3SOg5m014I8H++7/4KnwIkDxl//zle+RaGGx0=|344  |
|eWVHxpju3wcHOOIXeEFDMOQUtXVzWFMOV/Jchrgp0VY=|342  |
|rCfrwIEkQH4dPxbSCoWpubOFAC3iKOfrnkZJn9u2KLY=|

                                                                                

In [47]:
orderedPatIds = [i.PatientId for i in temp.select('PatientId').distinct().collect()]
orderedPatIds[:6]

                                                                                

['pVBBAjbTS/cd0ljowyrdwDAc3fRs/XeR/CfKq0l9T68=',
 'hG3tNW2bodpcbOAnwKtv18XZt51yglPzifxOeFkNrVU=',
 'MnfwjA6zL88a59KEDg1InBqNcBdIyhrCGry4K3Fq5rU=',
 'lZ7UDPuO7YAmDrNzKK0Rt+s9AdskMlZR5KcLVecSgC0=',
 '7hkFsZsZAZ7saYIlL1npE9ZDNs5rEkFMrWYZDMZOygQ=',
 'NYUV/N1W962H7viwGPL3wNrelpf6jQeUEwYrkY9zJyg=']

In [48]:
df0.filter(col('PatientId')==orderedPatIds[0]).show()

+--------------------+-----+-------------------+-------------------+
|           PatientId|Value| GlucoseDisplayTime|  GlucoseSystemTime|
+--------------------+-----+-------------------+-------------------+
|pVBBAjbTS/cd0ljow...|180.0|2022-01-26 10:53:29|2022-01-26 18:53:29|
|pVBBAjbTS/cd0ljow...|176.0|2022-01-14 07:42:32|2022-01-14 15:42:32|
|pVBBAjbTS/cd0ljow...|152.0|2022-01-21 11:43:06|2022-01-21 19:43:06|
|pVBBAjbTS/cd0ljow...|202.0|2022-01-28 16:13:40|2022-01-29 00:13:40|
|pVBBAjbTS/cd0ljow...|139.0|2022-01-26 15:58:29|2022-01-26 23:58:29|
|pVBBAjbTS/cd0ljow...|144.0|2022-01-31 05:43:52|2022-01-31 13:43:52|
|pVBBAjbTS/cd0ljow...|155.0|2022-01-11 11:52:17|2022-01-11 19:52:17|
|pVBBAjbTS/cd0ljow...|168.0|2022-01-12 22:32:25|2022-01-13 06:32:25|
|pVBBAjbTS/cd0ljow...|214.0|2022-01-23 18:43:15|2022-01-24 02:43:15|
|pVBBAjbTS/cd0ljow...|201.0|2022-01-31 21:33:54|2022-02-01 05:33:54|
|pVBBAjbTS/cd0ljow...|205.0|2022-01-27 13:33:34|2022-01-27 21:33:34|
|pVBBAjbTS/cd0ljow...| 96.0|2022-0

In [44]:
before = df.where("GlucoseSystemTime <= '" + path_i[29:39] + " 00:00:00'")
beforePatIds = [i.PatientId for i in before.select('PatientId').distinct().collect()]

                                                                                

In [50]:
n = len(orderedPatIds)
for i in range(n-10,n):
    before.filter(col('PatientId')==orderedPatIds[i]).orderBy("GlucoseSystemTime", ascending=True).show()
    
    # if i==10:
    #     break

+--------------------+-----+-------------------+-------------------+
|           PatientId|Value| GlucoseDisplayTime|  GlucoseSystemTime|
+--------------------+-----+-------------------+-------------------+
|Iogq4UiEDRMJIIq4b...|202.0|2022-01-29 19:05:17|2022-01-30 03:05:17|
|Iogq4UiEDRMJIIq4b...|204.0|2022-01-29 19:10:17|2022-01-30 03:10:17|
|Iogq4UiEDRMJIIq4b...|206.0|2022-01-29 19:15:17|2022-01-30 03:15:17|
|Iogq4UiEDRMJIIq4b...|208.0|2022-01-29 19:20:18|2022-01-30 03:20:18|
|Iogq4UiEDRMJIIq4b...|202.0|2022-01-29 19:25:17|2022-01-30 03:25:17|
|Iogq4UiEDRMJIIq4b...|201.0|2022-01-29 19:30:17|2022-01-30 03:30:17|
|Iogq4UiEDRMJIIq4b...|203.0|2022-01-29 19:35:17|2022-01-30 03:35:17|
+--------------------+-----+-------------------+-------------------+

+--------------------+-----+-------------------+-------------------+
|           PatientId|Value| GlucoseDisplayTime|  GlucoseSystemTime|
+--------------------+-----+-------------------+-------------------+
|ZG81vJ8iDhbaI1Goz...| 76.0|2022-