### Start/loading in

In [1]:
# import sys
import time
import numpy as np
import pathlib
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, IntegerType, FloatType, DateType
from pyspark.sql.functions import pandas_udf, PandasUDFType, date_trunc, col
import pyspark.pandas as ps

spark.stop()



In [2]:
conf = pyspark.SparkConf().setAll([\
    ('spark.app.name', 'ReduceData')])
spark = SparkSession.builder.config(conf=conf)\
    .getOrCreate()

spark.version

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/15 00:33:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


'3.3.1'

### Schemas

In [3]:
glucose_data_schema=StructType([StructField('NumId', IntegerType(), True),
                                        StructField('PatientId', StringType(), True),
                                        StructField('Value', FloatType(), True),
                                        StructField('GlucoseDisplayTime', TimestampType(), True),
                                        StructField('GlucoseDisplayTimeRaw', StringType(), True),
                                        StructField('GlucoseDisplayDate', DateType(), True)])

# testing the parquets

### train vs interpolatd train

In [6]:
df = spark.read\
    .parquet('../../../../cephfs/interpolation/parquet_0_25.parquet')
print("row count: ", df.count())
df.printSchema()
df.show()
patIds = [i.NumId for i in df.select('NumId').distinct().collect()]
len(patIds)

row count:  3665908
root
 |-- GlucoseDisplayTime: timestamp (nullable = true)
 |-- NumId: double (nullable = true)
 |-- Value: float (nullable = true)
 |-- GlucoseDisplayDate: date (nullable = true)
 |-- IsFilledIn: double (nullable = true)
 |-- __index_level_0__: long (nullable = true)

+-------------------+------+---------+------------------+----------+-----------------+
| GlucoseDisplayTime| NumId|    Value|GlucoseDisplayDate|IsFilledIn|__index_level_0__|
+-------------------+------+---------+------------------+----------+-----------------+
|2022-02-01 00:15:00|1139.0|    123.0|        2022-02-01|       1.0|                0|
|2022-02-01 00:20:00|  null|121.42326|              null|       1.0|                1|
|2022-02-01 00:25:00|  null|121.42326|              null|       1.0|                2|
|2022-02-01 00:30:00|1139.0|    129.0|        2022-02-01|       1.0|                3|
|2022-02-01 00:35:00|1139.0|    124.0|        2022-02-01|       1.0|                4|
|2022-02-01 00:

26

In [7]:
df = spark.read\
    .parquet('../../../../cephfs/train_test_val/train/part-00000-0a623f34-b94e-4ee4-be3e-dace09ae6570-c000.snappy.parquet')
print("row count: ", df.count())
df.printSchema()
df.show()
patIds = [i.NumId for i in df.select('NumId').distinct().collect()]
len(patIds)

row count:  2556926
root
 |-- NumId: integer (nullable = true)
 |-- PatientId: string (nullable = true)
 |-- Value: float (nullable = true)
 |-- GlucoseDisplayTime: timestamp (nullable = true)
 |-- GlucoseDisplayTimeRaw: string (nullable = true)
 |-- GlucoseDisplayDate: date (nullable = true)

+-----+--------------------+-----+-------------------+---------------------+------------------+
|NumId|           PatientId|Value| GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|
+-----+--------------------+-----+-------------------+---------------------+------------------+
| 4660|+Gr/1qOf9OWMa4LOL...|145.0|2022-02-01 05:12:00| 2022-02-01T05:12:...|        2022-02-01|
| 4660|+Gr/1qOf9OWMa4LOL...|137.0|2022-02-07 10:22:00| 2022-02-07T10:22:...|        2022-02-07|
| 4660|+Gr/1qOf9OWMa4LOL...|122.0|2022-02-01 16:22:00| 2022-02-01T16:22:...|        2022-02-01|
| 4660|+Gr/1qOf9OWMa4LOL...| 56.0|2022-02-07 18:27:00| 2022-02-07T18:27:...|        2022-02-07|
| 4660|+Gr/1qOf9OWMa4LOL...|277.0

46

### all the train/test/val data

In [11]:
"""replicate step1: read in the data"""
data_location = "/cephfs/train_test_val/train"
allPaths = [str(x) for x in list(pathlib.Path(data_location).glob('*.parquet')) if 'part-00' in str(x)]

In [12]:
startTime = time.time()
for j in range(len(allPaths)):
    pyspark_glucose_data = spark.read \
                           .schema(glucose_data_schema) \
                           .format('parquet') \
                           .load(allPaths[j])
    # pyspark_glucose_data = pyspark_glucose_data.withColumn("GlucoseDisplayTime",
    #                                                        date_trunc("minute",
    #                                                        col("GlucoseDisplayTime")))

    # pyspark_glucose_data=pyspark_glucose_data.orderBy("PatientId",
    #                                                   "GlucoseDisplayTime",
    #                                                   ascending=True)

    patIds = [i.NumId for i in pyspark_glucose_data.select('NumId').distinct().collect()]
    print(j,": ",len(patIds))

print(time.time()-startTime)

0 :  38
1 :  47
2 :  34
3 :  29
4 :  47
5 :  37
6 :  32
7 :  54
8 :  30
9 :  39
10 :  34
11 :  33
12 :  41
13 :  34
14 :  38
15 :  29
16 :  33
17 :  33
18 :  39
19 :  49
20 :  36
21 :  37
22 :  40
23 :  27
24 :  45
25 :  43
26 :  43
27 :  44
28 :  52
29 :  23
30 :  41
31 :  44
32 :  27
33 :  46
34 :  42
35 :  37
36 :  40
37 :  42
38 :  32
39 :  51
40 :  40
41 :  45
42 :  27
43 :  39
44 :  42
45 :  42
46 :  36
47 :  44
48 :  36
49 :  45
50 :  41
51 :  42
52 :  55
53 :  46
54 :  42
55 :  48
56 :  23
57 :  43
58 :  44
59 :  41
60 :  35
61 :  42
62 :  31
63 :  24
64 :  36
65 :  41
66 :  29
67 :  35
68 :  28
69 :  38
70 :  31
71 :  41
72 :  39
73 :  35
74 :  43
75 :  36
76 :  42
77 :  39
78 :  39
79 :  39
80 :  41
81 :  49
82 :  40
83 :  34
84 :  44
85 :  39
86 :  42
87 :  46
88 :  47
89 :  40
90 :  55
91 :  34
92 :  44
93 :  39
94 :  49
95 :  37
96 :  37
97 :  38
98 :  35
99 :  38
100 :  34
101 :  40
102 :  41
103 :  53
104 :  44
105 :  41
106 :  43
107 :  25
108 :  44
109 :  51
110 :  47


In [6]:
pyspark_glucose_data.show()



+-----+--------------------+-----+-------------------+---------------------+------------------+
|NumId|           PatientId|Value| GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|
+-----+--------------------+-----+-------------------+---------------------+------------------+
| 1542|/GnILIfqzPqN3kco3...|125.0|2022-02-28 07:50:00| 2022-02-28T07:50:...|        2022-02-28|
| 1542|/GnILIfqzPqN3kco3...|125.0|2022-02-28 08:05:00| 2022-02-28T08:05:...|        2022-02-28|
| 1542|/GnILIfqzPqN3kco3...|130.0|2022-02-28 08:10:00| 2022-02-28T08:10:...|        2022-02-28|
| 1542|/GnILIfqzPqN3kco3...|133.0|2022-02-28 08:15:00| 2022-02-28T08:15:...|        2022-02-28|
| 1542|/GnILIfqzPqN3kco3...|136.0|2022-02-28 08:20:00| 2022-02-28T08:20:...|        2022-02-28|
| 1542|/GnILIfqzPqN3kco3...|137.0|2022-02-28 08:25:00| 2022-02-28T08:25:...|        2022-02-28|
| 1542|/GnILIfqzPqN3kco3...|136.0|2022-02-28 08:30:00| 2022-02-28T08:30:...|        2022-02-28|
| 1542|/GnILIfqzPqN3kco3...|136.0|2022-0

                                                                                

In [10]:
patIds = [i.NumId for i in pyspark_glucose_data.select('NumId').distinct().collect()]

38

# interpolation testing

In [5]:
test_df = pyspark_glucose_data.toPandas()
test_df = test_df.drop(columns=['PatientId','GlucoseDisplayTimeRaw','GlucoseDisplayDate'])

  series = series.astype(t, copy=False)


In [58]:
min_max = test_df.groupby('NumId')\
            .agg({'GlucoseDisplayTime' : ['min', 'max']})

merge_df = pd.DataFrame(columns=['GlucoseDisplayTime', 'NumId'])
for idx, row in min_max.iterrows():
    # print(test_df[test_df['NumId'] == idx].info(),"\n")
    #grab all potential dates in range

    date_df = pd.DataFrame(pd.date_range(row[0], row[1], freq='5min'), columns=['GlucoseDisplayTime'])                              
    date_df['NumId'] = idx

    # merge dates with big pypsark df
    merged = test_df[test_df['NumId'] == idx]\
            .merge(date_df, how='outer', on=['GlucoseDisplayTime', 'NumId'])\
            .sort_values(by=['GlucoseDisplayTime', 'Value'], na_position='last')

    merged['TimeLag'] = np.concatenate((merged['GlucoseDisplayTime'].iloc[0],\
                                        np.array(merged['GlucoseDisplayTime'].iloc[:-1].values)), axis=None)\
                        .astype('datetime64[ns]')
    print(merged.info(),"\n")

    merged['Diff'] = (merged['TimeLag'] - merged['GlucoseDisplayTime']).dt.seconds

    len_merged = len(merged)

    # get all index of rows with diff less than 5 mins, add 1 to remove next row, 
    # dont include last row to delete
    indexes_to_remove = [x for x in merged[merged['Diff'] < 300].index + 1 if x < len_merged]

    if len(indexes_to_remove) > 0:
        merged = merged.drop(indexes_to_remove)

    merged = merged.drop(columns=['TimeLag','Diff'])

    # its ready freddy for some interpoletty
    merge_df = pd.concat([merge_df,merged])

    break
    
# merge_df
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150016 entries, 0 to 58814
Data columns (total 4 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   NumId               150016 non-null  int64         
 1   Value               58815 non-null   float32       
 2   GlucoseDisplayTime  150016 non-null  datetime64[ns]
 3   TimeLag             150016 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float32(1), int64(1)
memory usage: 5.2 MB
None 

27
<class 'pandas.core.frame.DataFrame'>
Int64Index: 149989 entries, 1 to 58814
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   NumId               149989 non-null  int64         
 1   Value               58788 non-null   float32       
 2   GlucoseDisplayTime  149989 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float32(1), int64(1)
memory usage: 4.0 MB


In [51]:
temp = [i - 1 for i in indexes_to_remove] + indexes_to_remove
temp.sort()
merged.loc[temp]

Unnamed: 0,NumId,Value,GlucoseDisplayTime
0,149,112.0,2022-02-01 00:06:00
1,149,110.0,2022-02-01 00:11:00
46156,149,143.0,2022-11-17 04:42:00
46157,149,112.0,2022-11-17 04:42:00
46165,149,215.0,2022-11-17 05:27:00
46166,149,150.0,2022-11-17 05:27:00
46169,149,212.0,2022-11-17 05:42:00
46170,149,149.0,2022-11-17 05:42:00
46177,149,178.0,2022-11-17 06:42:00
46178,149,180.0,2022-11-17 06:47:00


[0,
 1,
 46156,
 46157,
 46165,
 46166,
 46169,
 46170,
 46177,
 46178,
 46178,
 46179,
 46189,
 46190,
 46191,
 46192,
 46193,
 46194,
 46196,
 46197,
 46197,
 46198,
 46200,
 46201,
 46205,
 46206,
 46206,
 46207,
 46211,
 46212,
 46212,
 46213,
 46218,
 46219,
 46221,
 46222,
 46228,
 46229,
 46230,
 46231,
 46234,
 46235,
 46237,
 46238,
 46239,
 46240,
 46242,
 46243,
 46244,
 46245,
 46252,
 46253,
 46254,
 46255]