In [2]:
import os

from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession running in local mode for the sessionize test.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Sessionize test")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)
sc = spark.sparkContext

# Location of the web log. Defaults to the original development path, but can
# be overridden through the WEBLOG_PATH environment variable so the notebook
# runs on other machines without editing this cell.
LOG_PATH = os.environ.get(
    'WEBLOG_PATH',
    '/home/hassan/Side_Projects/WeblogChallenge/test.log',
)

# Load the log as an RDD with one element per line of text.
raw_lines = sc.textFile(LOG_PATH)

# Skip blank lines: splitting one yields a single-element list, and indexing
# its second field downstream would raise IndexError and fail the whole job.
non_blank_lines = raw_lines.filter(lambda line: line.strip() != "")

# Split each remaining line into its space-separated columns.
rdd = non_blank_lines.map(lambda line: line.split(" "))
# ====================================
# Manipulating data
# ====================================
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Turn the RDD of split log lines into a DataFrame (the original author's
# stated motivation: better performance than working on the raw RDD).
def _to_row(fields):
    """Build a Row from one split log line.

    fields[0] is kept verbatim as the 'timestamp' string. fields[1] is cut at
    ':' and only the part before the first ':' is kept as 'ipaddress'
    (presumably a host:port pair -- confirm against the log format).
    """
    address = fields[1].split(':')[0]
    return Row(timestamp=fields[0], ipaddress=address)


mainDF = rdd.map(_to_row).toDF()
# Print up to 20 rows without truncating long cell values.
mainDF.show(n=20, truncate=False)

+---------+---------------------------+
|ipaddress|timestamp                  |
+---------+---------------------------+
|0.0.0.130|2015-07-22T09:00:28.019143Z|
|0.0.0.130|2015-07-22T09:00:29.019143Z|
|0.0.0.130|2015-07-22T09:05:28.019143Z|
|0.0.0.130|2015-07-22T09:10:28.019143Z|
|1.0.0.130|2015-07-22T09:00:28.019143Z|
|1.0.0.130|2015-07-22T09:00:29.019143Z|
|1.0.0.130|2015-07-22T09:10:28.019143Z|
+---------+---------------------------+



In [4]:
# Re-type the 'timestamp' column from string to Spark's timestamp type so it
# can be used for time-window bucketing below.
mainDF = mainDF.withColumn('timestamp', col('timestamp').cast(TimestampType()))

# Sessionize on fixed (tumbling) 5-minute windows: every hit is placed in the
# window containing its timestamp, then hits are counted per (window, ip).
# Note this is bucketing by wall-clock window, not by inactivity gap.
hits_per_window = (
    mainDF
    .select(
        window("timestamp", "5 minutes").alias('FixedTimeWindow'),
        'timestamp',
        "ipaddress",
    )
    .groupBy('FixedTimeWindow', 'ipaddress')
    .count()
)

# Give the count a descriptive name and attach an id to each session row.
# monotonically_increasing_id() yields unique, increasing 64-bit values that
# are not consecutive (see the gaps between ids in the printed output).
SessionDF = (
    hits_per_window
    .withColumnRenamed('count', 'NumberHitsInSessionForIp')
    .withColumn("SessionId", monotonically_increasing_id())
)
# Print up to 20 rows without truncating long cell values.
SessionDF.show(n=20, truncate=False)


+---------------------------------------------+---------+------------------------+-------------+
|FixedTimeWindow                              |ipaddress|NumberHitsInSessionForIp|SessionId    |
+---------------------------------------------+---------+------------------------+-------------+
|[2015-07-22 05:05:00.0,2015-07-22 05:10:00.0]|0.0.0.130|1                       |128849018880 |
|[2015-07-22 05:00:00.0,2015-07-22 05:05:00.0]|1.0.0.130|2                       |326417514496 |
|[2015-07-22 05:10:00.0,2015-07-22 05:15:00.0]|0.0.0.130|1                       |360777252864 |
|[2015-07-22 05:00:00.0,2015-07-22 05:05:00.0]|0.0.0.130|2                       |575525617664 |
|[2015-07-22 05:10:00.0,2015-07-22 05:15:00.0]|1.0.0.130|1                       |1159641169920|
+---------------------------------------------+---------+------------------------+-------------+

