In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col

In [2]:
# Build the Spark configuration: local master with a single worker thread,
# and the application name shown in the Spark UI.
conf = (
    pyspark.SparkConf()
    .setMaster('local[1]')
    .setAppName('App Name')
)

# Reuse an already-running session if there is one; otherwise start a new one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Last expression of the cell: display the Spark version in use.
spark.version

JAVA_HOME is not set


RuntimeError: Java gateway process exited before sending its port number

# Load in

In [3]:
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
    TimestampType,
)

# Explicit schema for the CSV: one (column name, Spark type) pair per column,
# each declared nullable.
dfSchema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ('PatientId', IntegerType()),
        ('GlucoseDisplayTime', TimestampType()),
        ('GlucoseDisplayTimeRaw', StringType()),
        ('Value', DoubleType()),
        ('dayCSV', IntegerType()),
    )
])

# Read the comma-delimited CSV with the schema above. In DROPMALFORMED mode,
# rows that cannot be parsed against the schema are dropped instead of raising.
df = (
    spark.read
    .option("mode", "DROPMALFORMED")
    .option("delimiter", ",")
    .schema(dfSchema)
    .csv('../../../../cephfs/cleanedData/2022-02_sample.csv')
)

In [4]:
# Sanity-check the load: total row count, the applied schema, and a 5-row preview.
row_count = df.count()
print("row count: ", row_count)
df.printSchema()
df.show(5)

                                                                                

row count:  790417
root
 |-- PatientId: integer (nullable = true)
 |-- GlucoseDisplayTime: timestamp (nullable = true)
 |-- GlucoseDisplayTimeRaw: string (nullable = true)
 |-- Value: double (nullable = true)
 |-- dayCSV: integer (nullable = true)

+---------+--------------------+---------------------+-----+------+
|PatientId|  GlucoseDisplayTime|GlucoseDisplayTimeRaw|Value|dayCSV|
+---------+--------------------+---------------------+-----+------+
|     2189|2022-02-01 01:35:...| 2022-02-01T01:35:...|  0.0|     1|
|     2189|2022-01-31 18:10:...| 2022-01-31T18:10:...|  0.0|     1|
|     2189|2022-01-31 22:20:...| 2022-01-31T22:20:...|  0.0|     1|
|     2189|2022-02-01 08:00:...| 2022-02-01T08:00:...|  0.0|     1|
|     2189|2022-02-01 15:20:...| 2022-02-01T15:20:...|  0.0|     1|
+---------+--------------------+---------------------+-----+------+
only showing top 5 rows



                                                                                

# Prelim cleanup before filling

### Replace 0s with null to save two steps down the line

In [9]:
# Preview the frame with glucose readings of exactly 0 replaced by null.
# `Value` is a DoubleType column, so compare against a numeric literal rather
# than the string "0" (which only worked through implicit type coercion).
# NOTE: the result is only displayed here; `df` itself is left unchanged
# because the transformed frame is not assigned to a name.
(
    df
    .withColumn(
        "Value",
        when(col("Value") == 0.0, None).otherwise(col("Value")),
    )
    .show()
)

+---------+--------------------+---------------------+-----+------+
|PatientId|  GlucoseDisplayTime|GlucoseDisplayTimeRaw|Value|dayCSV|
+---------+--------------------+---------------------+-----+------+
|     2189|2022-02-01 01:35:...| 2022-02-01T01:35:...| null|     1|
|     2189|2022-01-31 18:10:...| 2022-01-31T18:10:...| null|     1|
|     2189|2022-01-31 22:20:...| 2022-01-31T22:20:...| null|     1|
|     2189|2022-02-01 08:00:...| 2022-02-01T08:00:...| null|     1|
|     2189|2022-02-01 15:20:...| 2022-02-01T15:20:...| null|     1|
|     2189|2022-02-01 03:00:...| 2022-02-01T03:00:...| null|     1|
|     2189|2022-02-01 14:40:...| 2022-02-01T14:40:...| null|     1|
|     2189|2022-01-31 17:45:...| 2022-01-31T17:45:...| null|     1|
|     2189|2022-02-01 07:15:...| 2022-02-01T07:15:...| null|     1|
|     2189|2022-02-01 13:45:...| 2022-02-01T13:45:...| null|     1|
|     2189|2022-02-01 10:30:...| 2022-02-01T10:30:...| null|     1|
|     2189|2022-02-01 10:40:...| 2022-02-01T10:4