In [1]:
import time
import numpy as np
import pathlib
import pandas as pd
import pyspark
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import when, col, rank, monotonically_increasing_id, date_trunc, min, max
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, IntegerType, FloatType, DateType
from datetime import date, datetime, timedelta

# spark.stop()

In [2]:
conf = pyspark.SparkConf().setAll([\
    ('spark.app.name', 'ReduceData')])
spark = SparkSession.builder.config(conf=conf)\
    .getOrCreate()

spark.version

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/16 11:35:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


'3.3.1'

In [27]:
glucose_data_schema=StructType([StructField('NumId', IntegerType(), True),
                                StructField('PatientId', StringType(), True),
                                StructField('Value', FloatType(), True),
                                StructField('GlucoseDisplayTime', TimestampType(), True),
                                StructField('GlucoseDisplayTimeRaw', StringType(), True),
                                StructField('GlucoseDisplayDate', DateType(), True)])

temp_schema = StructType([StructField('PatientId', StringType(), True),
                          StructField('count', IntegerType(), True)])

# read things in

In [35]:
df = spark.read\
    .parquet('temp_data/part-00000-bcc637a6-2bd8-4c20-bc2c-42de80cad98b-c000.snappy.parquet')

df = df.withColumnRenamed('PatientId','UserId')
df = df.withColumn('count',col('count').cast(IntegerType()))

print("row count: ", df.count())
df.printSchema()
df.show()
patIds = [i.UserId for i in df.select('UserId').distinct().collect()]
len(patIds)

row count:  7957
root
 |-- UserId: string (nullable = true)
 |-- count: integer (nullable = true)

+--------------------+------+
|              UserId| count|
+--------------------+------+
|+Gr/1qOf9OWMa4LOL...| 94433|
|+TH+y0M8bQb2EpRVJ...| 84416|
|+UJpU9Owd5VBvkNtJ...| 99362|
|+PvW5D8b0wZANElHi...|101336|
|+9Rtj/cdJMLF2raCJ...| 89139|
|+JcspO9meT7uENVtz...| 97617|
|+BPY2YsPzI4b+DwiN...| 82083|
|+Ld/+ikzUehNOZ5f+...|100467|
|+VCEKhqF9R2oI0VGv...| 91595|
|+UgMn9AIL2d78WOS4...| 94862|
|+M7reUgRS+P3suzYg...| 86859|
|+O6WjkJ9Gb5esPEgG...|101657|
|+9ipwJiIRcZTJrck+...|103118|
|+F5NQV8YRae91R0Og...|100282|
|+XcQw7Zbj9WPeWsDq...| 99547|
|+DkuZFXuoapNjKE9g...| 85436|
|+HzKQWGJels56+L6u...| 95393|
|+8cP93h83g3DYhiXN...| 83085|
|+7nbCvVCmsMWaCjcH...| 92185|
|+XHDtL8igY38vN48Q...|102874|
+--------------------+------+
only showing top 20 rows



7957

### all the training data

In [6]:
"""replicate step1: read in the data"""
data_location = "/cephfs/train_test_val/train"
allPaths = [str(x) for x in list(pathlib.Path(data_location).glob('*.parquet')) if 'part-00' in str(x)]
allPaths.sort()

In [67]:
startTime = time.time()

# for j in range(len(allPaths)):
j = 1

pyspark_glucose_data = spark.read \
                       .schema(glucose_data_schema) \
                       .format('parquet') \
                       .load(allPaths[j])
patIds = [i.NumId for i in pyspark_glucose_data.select('NumId').distinct().collect()]
print(j,": ",len(patIds))

print(time.time()-startTime)

1 :  39
0.1841890811920166


In [68]:
min_max = pyspark_glucose_data.groupby('PatientId')\
            .agg(min("GlucoseDisplayTime").alias("Min"),\
                 max("GlucoseDisplayTime").alias("Max"))
print("starting row count: ", min_max.count())

min_max = min_max.withColumn('ExpectedRecords',(col("Max").cast("long") - col('Min').cast("long"))/(60*5))

temp = min_max.join(df, min_max.PatientId == df.UserId)#.select(min_max.PatientId, min_max.ExpectedRecords, df.count)
temp = temp.drop("Min","Max","UserId")

temp = temp.withColumn('discrepancy%',100*(col("ExpectedRecords").cast("integer") / col('count'))-100)
temp = temp.orderBy('discrepancy%', ascending=False)

length = temp.count()
print("ending row count: ", length)
temp.show(length, truncate=False)

starting row count:  39
ending row count:  39
+--------------------------------------------+---------------+------+------------------+
|PatientId                                   |ExpectedRecords|count |discrepancy%      |
+--------------------------------------------+---------------+------+------------------+
|aRcdptX0j0XO5JiZjvcRhh/2hhLNDeIFccfFLPKXYG4=|104988.4       |71212 |47.430208391844076|
|Z9bKeYamn2rfzIvygFMtUMAnmvNWlLv5i0L6F491CDE=|94616.0        |71093 |33.08764575977946 |
|R8bQL1K4vkc1t1A1LppgK7zgjOQ3aApOV635AZaXzo8=|105041.8       |80634 |30.268869211498867|
|rWBn04Ywl5IV9Dnzg2UrZr2hv0znnTM9TZAcYnxrnKQ=|101467.0       |80818 |25.550001237348113|
|O3bOa6QuOQxus7y6YAwm/GYJMKbHIkQQ8mvhTRapn3Y=|105009.2       |83691 |25.472273004265688|
|XYvnckGwCuSRi9d72KltoyJAQoKj0ae1HC5WgqfgEjQ=|104311.4       |87478 |19.24255241317816 |
|uU3BWosfaHwu6vcHPMpXI1A8WrJIFJrOSrG+wo4Q6xk=|100855.8       |84924 |18.75912580660355 |
|kcnYD5XlN/B3VCJLdEgkjayJ8bRiCinjDwxJe2vmPs0=|104379.6       |88