In [180]:
from datetime import datetime,timedelta
from pyspark.sql import Row
# Sample policy events, each described as
# (minutes after `dt`, policy id, active flag, reason).
# The last three events deliberately share the same timestamp.
dt = datetime.now()
sample_events = [
    (0, 'CR_1', 'N', 'removed'),
    (5, 'CR_2', 'Y', 'applied'),
    (6, 'CR_1', 'Y', 'applied'),
    (6, 'CR_1', 'N', 'removed'),
    (6, 'CR_1', 'N', 'removed'),
]
rdd = sc.parallelize([
    Row(last_modified_date=dt + timedelta(minutes=offset),
        policy_id=policy, active=flag, reason=why)
    for offset, policy, flag, why in sample_events
])
df = rdd.toDF()
# Kept as the last expression so the DataFrame repr is shown as the cell output.
df.persist()

DataFrame[active: string, last_modified_date: timestamp, policy_id: string, reason: string]

In [181]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag ,when,array,lit,udf
from pyspark.sql import Column
from pyspark.sql.types import ArrayType,StringType,NullType

@udf(returnType=ArrayType(StringType()))
def addValue(arr, elem):
    """Return ``arr`` with ``elem`` appended unless it is already present.

    Args:
        arr: List of values, or None. None is treated as an empty list,
            in line with how mergeArrays handles a missing array.
        elem: Value to add.

    Returns:
        A new list; the list passed in is never modified.
    """
    # Copy first: this keeps the caller's list untouched and lets a null
    # array fall through to an empty list instead of raising TypeError on
    # the membership test below.
    result = list(arr) if arr is not None else []
    if elem not in result:
        result.append(elem)
    return result

@udf(returnType=ArrayType(StringType()))
def mergeArrays(arr1, arr2):
    """Concatenate two arrays, treating None as an empty array.

    Args:
        arr1: First list, or None.
        arr2: Second list, or None.

    Returns:
        A new list holding the elements of ``arr1`` followed by those of
        ``arr2``. Duplicates are kept; neither input is modified.
    """
    merged = []
    for part in (arr1, arr2):
        # Identity check: None is a singleton, and unlike `!=` the `is not`
        # test cannot be overridden by the operand's __ne__.
        if part is not None:
            merged.extend(part)
    return merged

##sqlContext.udf.register(f=addValue,name='addValue',returnType=ArrayType(StringType()))

# Single global window (no partition columns) ordered by modification time.
# NOTE(review): several sample rows share the same last_modified_date and no
# tie-breaker column is given, so the relative order of those rows under
# lag() is presumably not guaranteed -- confirm before relying on it.
windowSpec = Window.partitionBy().orderBy('last_modified_date')

# Give every row empty 'added' and 'removed' arrays to start from.
df = df.withColumn('added', array())
df = df.withColumn('removed', array())

# Put the row's policy_id into 'added' when active == 'Y' and into 'removed'
# when active == 'N'; otherwise the array is passed through unchanged.
df = df.select(
    when(df['active'] == 'Y', addValue(df['added'], df['policy_id']))
        .otherwise(df['added'])
        .alias('added'),
    when(df['active'] == 'N', addValue(df['removed'], df['policy_id']))
        .otherwise(df['removed'])
        .alias('removed'),
    df['last_modified_date'],
    df['policy_id'],
    df['active'],
    df['reason'],
)

# 'currentEnts' = the 'added' array of the preceding row in the window merged
# with this row's 'added' array (only one row back, not a running total).
df = df.withColumn(
    'currentEnts',
    mergeArrays(lag('added').over(windowSpec), df['added']),
)
df.show()

+------+-------+--------------------+---------+------+-------+------------+
| added|removed|  last_modified_date|policy_id|active| reason| currentEnts|
+------+-------+--------------------+---------+------+-------+------------+
|    []| [CR_1]|2018-07-08 22:39:...|     CR_1|     N|removed|          []|
|[CR_2]|     []|2018-07-08 22:44:...|     CR_2|     Y|applied|      [CR_2]|
|[CR_1]|     []|2018-07-08 22:45:...|     CR_1|     Y|applied|[CR_2, CR_1]|
|    []| [CR_1]|2018-07-08 22:45:...|     CR_1|     N|removed|      [CR_1]|
|    []| [CR_1]|2018-07-08 22:45:...|     CR_1|     N|removed|          []|
+------+-------+--------------------+---------+------+-------+------------+



In [182]:
"""
df = spark.createDataFrame([("Alive", 4)], ["Name", "Number"])


def example(n):
    return Row('Out1', 'Out2')(n + 2, n - 2)


schema = StructType([
    StructField("Out1", IntegerType(), False),
    StructField("Out2", IntegerType(), False)])

example_udf = f.UserDefinedFunction(example, schema)

newDF = df.withColumn("Output", example_udf(df["Number"]))
newDF = newDF.select("Name", "Number", "Output.*")

newDF.show(truncate=False)

"""

'\ndf = spark.createDataFrame([("Alive", 4)], ["Name", "Number"])\n\n\ndef example(n):\n    return Row(\'Out1\', \'Out2\')(n + 2, n - 2)\n\n\nschema = StructType([\n    StructField("Out1", IntegerType(), False),\n    StructField("Out2", IntegerType(), False)])\n\nexample_udf = f.UserDefinedFunction(example, schema)\n\nnewDF = df.withColumn("Output", example_udf(df["Number"]))\nnewDF = newDF.select("Name", "Number", "Output.*")\n\nnewDF.show(truncate=False)\n\n'