#### TEST

In [1]:
import pyspark
from pyspark import pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import StructType, StructField, \
StringType, IntegerType, TimestampType, DateType, FloatType
import time
import pathlib
from pyspark.sql.functions import col, to_date, sum, avg, max, min, \
stddev, percentile_approx,\
pandas_udf, PandasUDFType, lit, udf, collect_list, sqrt, monotonically_increasing_id, map_from_entries,\
rank, dense_rank, count, when
from pyspark.sql.window import Window



In [4]:
conf = pyspark.SparkConf().setAll([\
            ('spark.app.name', 'Glucose_Analysis_Spark')])
spark = SparkSession.builder.config(conf=conf)\
    .getOrCreate()        

In [3]:
schema = StructType([
  StructField('PatientId', StringType(), True),
  StructField('Value', FloatType(), True),
  StructField('GlucoseDisplayDate', DateType(), True)
  ])

emptyRDD = spark.sparkContext.emptyRDD()
df = spark.createDataFrame(emptyRDD,schema)

df.printSchema()

root
 |-- PatientId: string (nullable = true)
 |-- Value: float (nullable = true)
 |-- GlucoseDisplayDate: date (nullable = true)



In [4]:
df = spark.read.parquet('/cephfs/stepped_glucose_data/step0_load/parquet_0_to_10/part-00000-532ee45d-8e0d-44c4-8f3b-884b22175e0f-c000.snappy.parquet')

In [10]:
df = df.withColumn('y_binary', lit(1))

In [11]:
#def pyspark_summary_statistics(self, df, spark):
# @pandas_udf(StructType([StructField('Entropy', FloatType())]), PandasUDFType.GROUPED_MAP)
# def entropy_grouped(df):
#     return feat_create_obj.entropy_extraction(df.Value)

def entropy_udf(vals):
    feat_create_obj = feat_create()
    return udf(feat_create_obj.entropy_extraction(vals), FloatType())

def poincare_udf(vals):
    feat_create_obj = feat_create()
    return udf(calculate_poincare(vals),\
               StructType([\
                   StructField('First', FloatType()),\
                   StructField('Second', FloatType()),\
                   StructField('Third', FloatType())\
               ]))

# def chunk_by_index():
#     return udf(collect_list

def create_partition_date(df, chunk_val):
    window = Window.partitionBy(df['PatientId']).orderBy(df['GlucoseDisplayTime'])
    df = df.select('*', rank().over(window).alias('index'))
    df = df.withColumn("Chunk", (df.index/chunk_val).cast(IntegerType()))

    return df


def pyspark_summary_statistics(df, \
                               daily_stats_features_lower,\
                               daily_stats_features_upper, \
                               chunk_val = 12):  

    df_added = create_partition_date(df, chunk_val)
    
    group_cols = ["PatientId", "Chunk"]

    summary_df = df_added.groupby(group_cols)\
        .agg(max('y_binary').alias('y_summary_binary'),\
             avg("Value").alias("Mean"),\
             stddev("Value").alias("Std Dev"),\
             percentile_approx("Value", .5).alias("Median"), \
             min("Value").alias("Min"),\
             max("Value").alias("Max"),\
             count(when(col("Value") < daily_stats_features_lower, 1)).alias("CountBelow"),\
             count(when(col("Value") > daily_stats_features_upper, 1)).alias("CountAbove"),\
             (count(when(col("Value") < daily_stats_features_lower, 1))/chunk_val).alias("PercentageBelow"),\
             (count(when(col("Value") > daily_stats_features_upper, 1))/chunk_val).alias("PercentageAbove")
            )

    df_added = df_added.join(summary_df, ['PatientId', 'Chunk'])
    
    return df_added

In [12]:
pyspark_summary_statistics(df, 70, 180, 12)

[Stage 12:====>             (1 + 3) / 4][Stage 13:>                 (0 + 1) / 4]

23/04/19 21:27:36 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
23/04/19 21:27:37 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.
23/04/19 21:27:37 WARN TaskMemoryManager: Failed to allocate a page (33554432 bytes), try again.


[Stage 22:>                                                         (0 + 1) / 1]

+--------------------+-----+-----+-------------------+---------------------+------------------+--------+-----+----------------+------------------+------------------+------+-----+-----+----------+----------+---------------+---------------+
|           PatientId|Chunk|Value| GlucoseDisplayTime|GlucoseDisplayTimeRaw|GlucoseDisplayDate|y_binary|index|y_summary_binary|              Mean|           Std Dev|Median|  Min|  Max|CountBelow|CountAbove|PercentageBelow|PercentageAbove|
+--------------------+-----+-----+-------------------+---------------------+------------------+--------+-----+----------------+------------------+------------------+------+-----+-----+----------+----------+---------------+---------------+
|++3L3PAkDvSTkWnWe...|    8|169.0|2022-02-19 01:56:07| 2022-02-19T01:56:...|        2022-02-19|       1|   96|               1|168.83333333333334| 2.823065172768233| 169.0|162.0|172.0|         0|         0|            0.0|            0.0|
|++3L3PAkDvSTkWnWe...|    8|170.0|2022-02-19

                                                                                

DataFrame[PatientId: string, Chunk: int, Value: float, GlucoseDisplayTime: timestamp, GlucoseDisplayTimeRaw: string, GlucoseDisplayDate: date, y_binary: int, index: int, y_summary_binary: int, Mean: double, Std Dev: double, Median: float, Min: float, Max: float, CountBelow: bigint, CountAbove: bigint, PercentageBelow: double, PercentageAbove: double]

In [None]:
added_daily_features=df.groupby(analysis_group).apply(transform_features)

    return added_daily_features

In [None]:
spark.stop()

In [51]:
df = spark.read.parquet('/cephfs/summary_stats/all_val_bool/')

In [52]:
df.show(5)

+-----+-----+-----------------+----------------+-------------+-------------+------------------+------------------+------------------+------+-----+-----+--------------------+--------------------+------------------+------------------+----------+----------+---------------+------------+------+----------+--------+------------------+-------------------------+------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
|NumId|Chunk|ShortTermVariance|LongTermVariance|VarianceRatio|SampleEntropy|PermutationEntropy|              Mean|            StdDev|Median|  Min|  Max|        AvgFirstDiff|          AvgSecDiff|      StdFirstDiff|        StdSecDiff|CountAbove|CountBelow|TotalOutOfRange|DiffPrevious|target|Sex_Female|Sex_Male|Treatment_yes_both|Treatment_yes_long_acting|Treatment_no|Treatment_yes_fast_acting|AgeGroup_50|AgeGroup_60|AgeGroup_70|AgeGroup_40|AgeGroup_30|AgeGroup_80|AgeGroup_90|AgeGroup_10|
+-----+-----+-----

In [53]:
new_df = df.withColumn('target', when(df.DiffPrevious > 9, 1)
                         .when(df.DiffPrevious < -9,-1)
                         .otherwise(0))

In [54]:
new_df.select(col('DiffPrevious'), col('target')).show(5)

+------------+------+
|DiffPrevious|target|
+------------+------+
|           7|     0|
|          18|     1|
|          -1|     0|
|         -24|    -1|
|          11|     1|
+------------+------+
only showing top 5 rows



In [55]:
new_df.repartition('NumId').write.parquet('/cephfs/summary_stats/all_val_bool_updated')

                                                                                

In [34]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Glucose") \
        .getOrCreate()

training_df_poincare = spark.read.parquet('/cephfs/featuresData/poincare/train')
training_df_poincare.show(5)
training_df_entropy = spark.read.parquet('/cephfs/featuresData/entropy/train')
training_df_entropy.show(5)

training_df_complex_features = training_df_poincare.join(training_df_entropy,['NumId', 'Chunk'])
training_df_complex_features.show()

training_features_summary_stats= spark.read.parquet('/cephfs/summary_stats/encoded/one_hot_train/summary_stats_cohort_bool_encoded.parquet')
training_features_summary_stats.show(3)

training_df_final = training_df_complex_features.join(training_features_summary_stats,['NumId', 'Chunk'])
training_df_final.show(5)

training_df_final.repartition('NumId').write.parquet('/cephfs/summary_stats/all_train_bool')

+-----+-----+-----------------+----------------+-------------+
|NumId|Chunk|ShortTermVariance|LongTermVariance|VarianceRatio|
+-----+-----+-----------------+----------------+-------------+
|  752|  277|         75.15563|        5.317868|    14.132662|
|  752|  292|         98.29378|       4.4664574|    22.007101|
|  752|  327|         82.88033|       2.6379993|    31.417875|
|  767|  246|        46.237247|       2.4311175|    19.018927|
|  767|  331|         6.747326|       11.497889|   0.58683175|
+-----+-----+-----------------+----------------+-------------+
only showing top 5 rows

+-----+-----+-------------+------------------+
|NumId|Chunk|SampleEntropy|PermutationEntropy|
+-----+-----+-------------+------------------+
|  752|  277|    0.1978033|          0.864667|
|  752|  292|   0.06829524|         0.8204138|
|  752|  327|   0.14589928|          0.666634|
|  767|  246|   0.29596713|         0.9315732|
|  767|  331|   0.37795234|         1.1740693|
+-----+-----+-------------+-----

                                                                                

+-----+-----+-----------------+----------------+-------------+-------------+------------------+
|NumId|Chunk|ShortTermVariance|LongTermVariance|VarianceRatio|SampleEntropy|PermutationEntropy|
+-----+-----+-----------------+----------------+-------------+-------------+------------------+
|    0|    0|        2.8500366|       3.8021948|    0.7495767|    0.2984626|         1.2680585|
|    0|    2|        1.9557492|       4.1611867|   0.46999794|    0.2726442|         1.1109602|
|    0|    8|        31.130909|       2.6183333|    11.889589|    0.2606011|         1.0298378|
|    0|   10|        29.197123|       2.1754076|    13.421449|   0.25588748|         0.8495794|
|    0|   12|         51.51686|       3.2331884|   15.9337635|   0.16961278|         0.8927734|
|    0|   13|         48.71614|       2.5468535|    19.127972|   0.14173158|        0.78283364|
|    0|   16|         34.90279|       1.8008504|     19.38128|   0.26931518|         0.8654307|
|    0|   19|        32.694202|      0.8

                                                                                

+-----+-----+-----------------+----------------+-------------+-------------+------------------+------------------+------------------+------+-----+-----+-------------------+--------------------+------------------+------------------+----------+----------+---------------+------------+------+----------+--------+------------------+-------------------------+------------+-------------------------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+
|NumId|Chunk|ShortTermVariance|LongTermVariance|VarianceRatio|SampleEntropy|PermutationEntropy|              Mean|            StdDev|Median|  Min|  Max|       AvgFirstDiff|          AvgSecDiff|      StdFirstDiff|        StdSecDiff|CountAbove|CountBelow|TotalOutOfRange|DiffPrevious|target|Sex_Female|Sex_Male|Treatment_yes_both|Treatment_yes_long_acting|Treatment_no|Treatment_yes_fast_acting|AgeGroup_50|AgeGroup_60|AgeGroup_70|AgeGroup_40|AgeGroup_30|AgeGroup_80|AgeGroup_90|AgeGroup_10|
+-----+-----+-------

                                                                                