### Save Raw Data to Filtered Parquet Files

In [6]:
# load in imports
#!sudo apt-get update
#!sudo apt-get install openjdk-8-jdk
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, \
StringType, IntegerType, TimestampType, DateType, FloatType
import time
import pathlib
from pyspark.sql.functions import col, to_date

In [10]:
# Spark configuration for this notebook; the only setting is the app name.
conf = pyspark.SparkConf()
conf.setAll([('spark.app.name', 'ReduceData')])

# Obtain a session built from that configuration (getOrCreate returns an
# already-running session when one exists).
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [21]:
# Collect the CSV files directly under /cephfs/data whose path contains
# 'glucose_records', as plain strings in ascending (lexicographic) order.
allPaths = sorted(
    csv_file
    for csv_file in map(str, pathlib.Path('/cephfs/data').glob('*.csv'))
    if 'glucose_records' in csv_file
)

In [23]:
# Explicit schema applied when reading the glucose_records CSV files.
# Fields are listed in column order and every field is nullable.
# NOTE(review): RecordedDisplayTimeRaw is typed as a timestamp while
# GlucoseDisplayTimeRaw is typed as a string -- presumably intentional,
# confirm against the raw data.
glucose_data_schema = (
    StructType()
    .add('_c0', IntegerType(), True)
    .add('PostDate', TimestampType(), True)
    .add('IngestionDate', TimestampType(), True)
    .add('PostId', StringType(), True)
    .add('PostTime', TimestampType(), True)
    .add('PatientId', StringType(), True)
    .add('Stream', StringType(), True)
    .add('SequenceNumber', StringType(), True)
    .add('TransmitterNumber', StringType(), True)
    .add('ReceiverNumber', StringType(), True)
    .add('RecordedSystemTime', TimestampType(), True)
    .add('RecordedDisplayTime', TimestampType(), True)
    .add('RecordedDisplayTimeRaw', TimestampType(), True)
    .add('TransmitterId', StringType(), True)
    .add('TransmitterTime', StringType(), True)
    .add('GlucoseSystemTime', TimestampType(), True)
    .add('GlucoseDisplayTime', TimestampType(), True)
    .add('GlucoseDisplayTimeRaw', StringType(), True)
    .add('Value', FloatType(), True)
    .add('Status', StringType(), True)
    .add('TrendArrow', StringType(), True)
    .add('TrendRate', FloatType(), True)
    .add('IsBackFilled', StringType(), True)
    .add('InternalStatus', StringType(), True)
    .add('SessionStartTime', StringType(), True)
)



In [None]:
# For each CSV in allPaths: read it with the explicit schema, keep four
# columns, derive a date column from GlucoseDisplayTime, and write the result
# as parquet under step0_load_daily/parquet_<position in allPaths>.
# The total wall-clock time in seconds is printed at the end.
prevIdx = 0  # updated every iteration; not read anywhere in the visible cells

startTime = time.time()
for idx, path in enumerate(allPaths):
    # The first line of each file is treated as a header; rows that do not
    # parse against the schema are dropped (DROPMALFORMED) rather than failing.
    df = (
        spark.read
        .format('csv')
        .option('delimiter', ',')
        .option("mode", "DROPMALFORMED")
        .option("header", True)
        .schema(glucose_data_schema)
        .load(path)
        .select(
            col("PatientId"),
            col("Value"),
            col("GlucoseDisplayTime"),
            col("GlucoseDisplayTimeRaw"),
        )
    )

    # Add the calendar date of the display timestamp as its own column.
    df_toParq = df.withColumn(
        'GlucoseDisplayDate',
        to_date(col('GlucoseDisplayTime')),
    )

    # Collapse to a single partition before writing; any existing output at
    # the target path is replaced.
    (
        df_toParq
        .repartition(1)
        .write
        .mode('overwrite')
        .parquet('/cephfs/stepped_glucose_data/step0_load_daily/parquet_' + str(idx))
    )

    prevIdx = idx

endTime = time.time()
print(endTime - startTime)



In [27]:
# Display the current value of `path` -- presumably the last CSV handled by
# the conversion loop (depends on cell execution order; verify).
path

'/cephfs/data/glucose_records_2022-02-01.csv'

In [48]:
# NOTE(review): in the recorded run this wildcard import raised
# ModuleNotFoundError, so `save_to_parquet` was not importable from the
# notebook's environment at that time.
from save_to_parquet import *

ModuleNotFoundError: No module named 'save_to_parquet'