In [47]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, unix_timestamp, when
from pyspark.sql.functions import sum, min, max, count

In [4]:
# Start (or reuse) a Spark session with a small fixed resource footprint:
# two executors with 2 cores / 2g each, and a 2g driver.
spark = (
    SparkSession.builder
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

21/08/14 00:19:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [13]:
# Every column of the event log is read as a plain string.
# NOTE(review): "occured_at" is a misspelling of "occurred_at"; it is kept
# because the later cells reference the column by exactly this name.
schema = StructType([
    StructField(column_name, StringType())
    for column_name in ("user_id", "occured_at", "event_type")
])

In [14]:
# Hand-written sample events, grouped per user for readability and then
# flattened into [user_id, occured_at, event_type] rows. The timestamps are
# deliberately not in chronological order.
test_list = [
    [user_id, occured_at, event_type]
    for user_id, user_events in (
        ('16', (
            ('2014-06-04 09:33:02', 'engagement'),
            ('2014-08-18 09:32:27', 'engagement'),
            ('2014-05-27 09:27:01', 'engagement'),
            ('2014-05-13 19:58:46', 'engagement'),
            ('2014-07-31 15:19:02', 'engagement'),
            ('2014-06-28 15:03:59', 'signup_flow'),
        )),
        ('1547', (
            ('2014-06-16 17:25:51', 'engagement'),
            ('2014-07-24 02:58:10', 'engagement'),
            ('2014-07-07 09:31:51', 'engagement'),
            ('2014-07-09 01:42:40', 'engagement'),
        )),
    )
    for occured_at, event_type in user_events
]

In [15]:
# Materialise the sample rows as a DataFrame using the all-string schema,
# then print it to eyeball the raw input.
df = spark.createDataFrame(test_list, schema=schema)
df.show()

+-------+-------------------+-----------+
|user_id|         occured_at| event_type|
+-------+-------------------+-----------+
|     16|2014-06-04 09:33:02| engagement|
|     16|2014-08-18 09:32:27| engagement|
|     16|2014-05-27 09:27:01| engagement|
|     16|2014-05-13 19:58:46| engagement|
|     16|2014-07-31 15:19:02| engagement|
|     16|2014-06-28 15:03:59|signup_flow|
|   1547|2014-06-16 17:25:51| engagement|
|   1547|2014-07-24 02:58:10| engagement|
|   1547|2014-07-07 09:31:51| engagement|
|   1547|2014-07-09 01:42:40| engagement|
+-------+-------------------+-----------+



In [16]:
# For each event, attach the timestamp of the same user's previous event
# (null for the user's first event). occured_at is a string column, so the
# window orders rows by string comparison.
last_event = df.withColumn(
    "last_event",
    lag(col('occured_at')).over(
        Window.partitionBy('user_id').orderBy('occured_at')
    ),
)
last_event.show()

                                                                                

+-------+-------------------+-----------+-------------------+
|user_id|         occured_at| event_type|         last_event|
+-------+-------------------+-----------+-------------------+
|     16|2014-05-13 19:58:46| engagement|               null|
|     16|2014-05-27 09:27:01| engagement|2014-05-13 19:58:46|
|     16|2014-06-04 09:33:02| engagement|2014-05-27 09:27:01|
|     16|2014-06-28 15:03:59|signup_flow|2014-06-04 09:33:02|
|     16|2014-07-31 15:19:02| engagement|2014-06-28 15:03:59|
|     16|2014-08-18 09:32:27| engagement|2014-07-31 15:19:02|
|   1547|2014-06-16 17:25:51| engagement|               null|
|   1547|2014-07-07 09:31:51| engagement|2014-06-16 17:25:51|
|   1547|2014-07-09 01:42:40| engagement|2014-07-07 09:31:51|
|   1547|2014-07-24 02:58:10| engagement|2014-07-09 01:42:40|
+-------+-------------------+-----------+-------------------+



In [19]:
# Gap between each event and the user's previous one.
# NOTE(review): despite the name, 'lag_in_day' holds HOURS -- the difference
# in epoch seconds is divided by 3600, not 86400. The name is kept because
# the following cell filters on this column.
lag_in_day = last_event.withColumn(
    'lag_in_day',
    (unix_timestamp(col('occured_at')) - unix_timestamp(col('last_event'))) / 3600,
)
lag_in_day.show()

+-------+-------------------+-----------+-------------------+------------------+
|user_id|         occured_at| event_type|         last_event|        lag_in_day|
+-------+-------------------+-----------+-------------------+------------------+
|     16|2014-05-13 19:58:46| engagement|               null|              null|
|     16|2014-05-27 09:27:01| engagement|2014-05-13 19:58:46|325.47083333333336|
|     16|2014-06-04 09:33:02| engagement|2014-05-27 09:27:01| 192.1002777777778|
|     16|2014-06-28 15:03:59|signup_flow|2014-06-04 09:33:02| 581.5158333333334|
|     16|2014-07-31 15:19:02| engagement|2014-06-28 15:03:59| 792.2508333333334|
|     16|2014-08-18 09:32:27| engagement|2014-07-31 15:19:02| 426.2236111111111|
|   1547|2014-06-16 17:25:51| engagement|               null|              null|
|   1547|2014-07-07 09:31:51| engagement|2014-06-16 17:25:51|             496.1|
|   1547|2014-07-09 01:42:40| engagement|2014-07-07 09:31:51|40.180277777777775|
|   1547|2014-07-24 02:58:10

In [26]:
# Flag an event as the start of a new session when the gap since the user's
# previous event exceeds 370. 'lag_in_day' is computed as seconds / 3600, so
# the threshold is in hours. A null gap (first event of a user) does not
# satisfy the condition and therefore falls through to 0.
new_session = lag_in_day.withColumn(
    'is_new_session',
    when(col('lag_in_day') > 370, 1).otherwise(0),
)
new_session.show()

+-------+-------------------+-----------+-------------------+------------------+--------------+
|user_id|         occured_at| event_type|         last_event|        lag_in_day|is_new_session|
+-------+-------------------+-----------+-------------------+------------------+--------------+
|     16|2014-05-13 19:58:46| engagement|               null|              null|             0|
|     16|2014-05-27 09:27:01| engagement|2014-05-13 19:58:46|325.47083333333336|             0|
|     16|2014-06-04 09:33:02| engagement|2014-05-27 09:27:01| 192.1002777777778|             0|
|     16|2014-06-28 15:03:59|signup_flow|2014-06-04 09:33:02| 581.5158333333334|             1|
|     16|2014-07-31 15:19:02| engagement|2014-06-28 15:03:59| 792.2508333333334|             1|
|     16|2014-08-18 09:32:27| engagement|2014-07-31 15:19:02| 426.2236111111111|             1|
|   1547|2014-06-16 17:25:51| engagement|               null|              null|             0|
|   1547|2014-07-07 09:31:51| engagement

In [27]:
# Running total of the new-session flags per user, in event order: every
# flagged event bumps the counter, so events between two flags share an id.
# `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
user_session_id = new_session.withColumn(
    "user_session_id",
    sum(col('is_new_session')).over(
        Window.partitionBy('user_id').orderBy('occured_at')
    ),
)
user_session_id.show()

+-------+-------------------+-----------+-------------------+------------------+--------------+---------------+
|user_id|         occured_at| event_type|         last_event|        lag_in_day|is_new_session|user_session_id|
+-------+-------------------+-----------+-------------------+------------------+--------------+---------------+
|     16|2014-05-13 19:58:46| engagement|               null|              null|             0|              0|
|     16|2014-05-27 09:27:01| engagement|2014-05-13 19:58:46|325.47083333333336|             0|              0|
|     16|2014-06-04 09:33:02| engagement|2014-05-27 09:27:01| 192.1002777777778|             0|              0|
|     16|2014-06-28 15:03:59|signup_flow|2014-06-04 09:33:02| 581.5158333333334|             1|              1|
|     16|2014-07-31 15:19:02| engagement|2014-06-28 15:03:59| 792.2508333333334|             1|              2|
|     16|2014-08-18 09:32:27| engagement|2014-07-31 15:19:02| 426.2236111111111|             1|         

In [53]:
# Collapse events into one row per (user, session): first and last event
# timestamp plus the number of events in the session.
# `min`, `max` and `count` are the pyspark.sql.functions aggregates.
dfsum = (
    user_session_id
    .groupBy('user_id', 'user_session_id')
    .agg(
        min('occured_at').alias('session_start'),
        max('occured_at').alias('session_end'),
        count('*').alias('count'),
    )
)
dfsum.show()

+-------+---------------+-------------------+-------------------+-----+
|user_id|user_session_id|      session_start|        session_end|count|
+-------+---------------+-------------------+-------------------+-----+
|     16|              0|2014-05-13 19:58:46|2014-06-04 09:33:02|    3|
|     16|              1|2014-06-28 15:03:59|2014-06-28 15:03:59|    1|
|     16|              2|2014-07-31 15:19:02|2014-07-31 15:19:02|    1|
|     16|              3|2014-08-18 09:32:27|2014-08-18 09:32:27|    1|
|   1547|              0|2014-06-16 17:25:51|2014-06-16 17:25:51|    1|
|   1547|              1|2014-07-07 09:31:51|2014-07-24 02:58:10|    3|
+-------+---------------+-------------------+-------------------+-----+



+-------+---------------+-----+
|user_id|user_session_id|count|
+-------+---------------+-----+
|     16|              0|    3|
|     16|              1|    1|
|     16|              2|    1|
|     16|              3|    1|
|   1547|              0|    1|
|   1547|              1|    3|
+-------+---------------+-----+

