In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import util
from config import *
import logging

In [7]:
# Show INFO-level log records (the util helpers report their progress at INFO).
logging.basicConfig(level=logging.INFO)

# Relative locations of the source folders and of the raw / ez data-lake folders.
paths: Paths = Paths(
    dataLake="../DataLake/",
    srcSearches="../searches/",
    srcVisitors="../visitors/",
    rawSearches="../DataLake/raw/searches/",
    rawVisitors="../DataLake/raw/visitors/",
    ezSearches="../DataLake/ez/searches/",
    ezVisitors="../DataLake/ez/visitors/",
)

# Pass the path configuration to the project helper.
# NOTE(review): presumably this creates the raw-zone folders - confirm in util.
util.rawZoneSetup(paths)

TASK 1

In [8]:
# Ask the project helper for the files to ingest for each feed.
# NOTE(review): presumably it returns the source files not yet present in the
# raw zone - confirm in util.getNewFiles.
newSearchesFiles = util.getNewFiles(paths.srcSearches, paths.rawSearches)
newVisitorFiles = util.getNewFiles(paths.srcVisitors, paths.rawVisitors)

# Log a notice only when neither feed has anything new.
if len(newSearchesFiles) + len(newVisitorFiles) == 0:
    logging.info("No New Files found from source")

INFO:root:getNewFiles(): path ../searches/, new Files Count 2
INFO:root:getNewFiles(): path ../visitors/, new Files Count 2


In [None]:
# Reuse the active Spark session if there is one, otherwise start one named "test".
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

In [9]:
# Convert each new JSON source file into snappy-compressed Parquet written from a
# single partition, stored under <raw folder>/<ts>, where ts is the value that
# util.getTsFromFileName returns for the file name.
for srcDir, rawDir, newFiles in (
    (paths.srcSearches, paths.rawSearches, newSearchesFiles),
    (paths.srcVisitors, paths.rawVisitors, newVisitorFiles),
):
    for fileName in newFiles:
        ts = util.getTsFromFileName(fileName)
        jsonDF = spark.read.json(f"{srcDir}/{fileName}")
        rawWriter = jsonDF.coalesce(1).write.options(header="True", compression="snappy")
        rawWriter.parquet(f"{rawDir}/{ts}")

                                                                                

TASK 2

In [10]:
# Schema for the visitor feed: every field is declared as a string.
visitorStruct = StructType([
    StructField(fieldName, StringType())
    for fieldName in (
        "countPerday",
        "country",
        "first_hit_pagename",
        "hits_avg",
        "logged_in",
        "region",
        "registered",
        "visit_start",
        "visitor_id",
        "visits",
    )
])

def dateHandler(dateStr:str)->str:
    """Expand a two-digit leading year to four digits by prefixing "20".

    The text before the first "-" is treated as the year. When it is exactly
    two characters long, "20" is prepended to the whole string; otherwise the
    string is returned unchanged.

    Args:
        dateStr: Date or timestamp text such as "21-03-19 03:46:54", or None.

    Returns:
        The input with a four-digit year prefix where one was needed, or None
        when the input is None.
    """
    # Spark hands SQL NULLs to Python UDFs as None; pass them through instead
    # of failing the whole job on `.split`.
    if dateStr is None:
        return None

    year = dateStr.split("-", 1)[0]
    if len(year) == 2:
        return "20" + dateStr
    return dateStr

# Spark UDF wrapper around dateHandler; the result column is a string
# (StringType is also udf's default return type).
udateHandler = udf(dateHandler, returnType=StringType())

In [86]:
def cleanVisitor(df:DataFrame)->DataFrame:
    """Type the visitor columns and derive a ``date`` column.

    Steps, in order:
      * ``hits_avg`` and ``visits`` are cast to integer, ``logged_in`` to boolean.
      * ``visit_start`` is passed through ``udateHandler`` and then parsed as a
        timestamp with the pattern ``yyyy-MM-dd HH:mm:ss``.
      * ``date`` is added as the calendar date of ``visit_start``.
      * ``visitor_id`` is cast to long and its nulls are replaced with 0.

    Args:
        df: Visitor DataFrame containing the columns named above.

    Returns:
        A new DataFrame with the typed columns and the extra ``date`` column.
    """
    typedDF = (
        df
        .withColumn("hits_avg", col("hits_avg").cast(IntegerType()))
        .withColumn("logged_in", col("logged_in").cast(BooleanType()))
        .withColumn("visit_start", udateHandler(col("visit_start")))
        .withColumn("visits", col("visits").cast(IntegerType()))
        # Parse only after the year prefix has been normalised by the UDF.
        .withColumn("visit_start", to_timestamp(col("visit_start"), "yyyy-MM-dd HH:mm:ss"))
        .withColumn("date", date_format(col("visit_start"), "yyyy-MM-dd").cast(DateType()))
        .withColumn("visitor_id", col("visitor_id").cast(LongType()))
    )

    # The fill runs after the cast, so the numeric 0 matches the column type.
    return typedDF.na.fill(0, ["visitor_id"])

def cleanSearches(df:DataFrame)->DataFrame:
    """Type the search columns and derive a ``date`` column.

    Steps:
      * ``date_time`` is parsed as a timestamp with the pattern
        ``yyyy-MM-dd'T'HH:mm:ss.SSS'Z'``.
      * ``visitor_id`` is cast to long; the literal text "null" and any value
        that is null after the cast become 0, the same convention
        ``cleanVisitor`` uses.
      * ``date`` is added as the calendar date of ``date_time``.

    Args:
        df: Searches DataFrame with at least ``date_time`` and ``visitor_id``.

    Returns:
        A new DataFrame with the typed columns and the extra ``date`` column.
    """
    # NOTE(review): 'Z' is matched as a literal character by this pattern, so
    # the value is not interpreted as a UTC offset - confirm that is intended.
    timestampPattern = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"

    # Map the text "null" to 0 before casting.
    visitorIdOrZero = (
        when(trim(col("visitor_id")) == "null", lit(0))
        .otherwise(col("visitor_id"))
    )

    cleanedDF = (
        df
        .withColumn("date_time", to_timestamp(col("date_time"), timestampPattern))
        .withColumn("visitor_id", visitorIdOrZero.cast(LongType()))
        .withColumn("date", date_format(col("date_time"), "yyyy-MM-dd").cast(DateType()))
    )

    # Fill AFTER the cast and with a numeric 0. A string fill value ("0") is
    # ignored for non-string columns, so when visitor_id is read as a number
    # the previous string fill left its nulls in place.
    return cleanedDF.na.fill(0, ["visitor_id"])

In [101]:
# Load every raw visitor folder, log its partition and row counts, and build
# the cleaned frame. Parquet files carry their own schema, so the CSV-only
# `header` / `inferSchema` reader options are not needed here.
# NOTE(review): the cache is materialised by count() and released right after,
# so the lazily evaluated cleaned frame does not read from it - confirm the
# cache/unpersist pair is still wanted.
rawVisitorDF = spark.read.parquet(f"{paths.rawVisitors}/*").cache()
logging.info(f" rawVisitorPartitions = {rawVisitorDF.rdd.getNumPartitions()} , rawVisitorDF.count={rawVisitorDF.count()} ")
cleanVisitorDF = rawVisitorDF.transform(cleanVisitor)
rawVisitorDF.unpersist()

# Same steps for the searches feed. The second label now names the row count
# (it previously repeated "rawSearchesPartitions").
rawSearchesDF = spark.read.parquet(f"{paths.rawSearches}/*").cache()
logging.info(f" rawSearchesPartitions={rawSearchesDF.rdd.getNumPartitions()} , rawSearchesDF.count={rawSearchesDF.count()}")
cleanSearchesDF = rawSearchesDF.transform(cleanSearches)
rawSearchesDF.unpersist()


INFO:root: rawVisitorPartitions = 2 , rawVisitorDF.count=29997 
INFO:root: rawSearchesPartitions=2 , rawSearchesPartitions =26395


DataFrame[date_time: string, destination_out: string, destination_ret: string, flight_date_inbound: string, flight_date_outbound: string, origin_out: string, origin_ret: string, segments: bigint, visitor_id: double]

In [132]:
# Latest visit_start per (visitor_id, date); cached for reuse.
latestVisitorDF = (
    cleanVisitorDF
    .groupBy("visitor_id", "date")
    .agg(max("visit_start").alias("visit_start"))
    .cache()
)

# Join back to the cleaned visitors on (visitor_id, visit_start) to pick up the
# country and region recorded for that latest visit. `date` is taken from
# latestVisitorDF explicitly because both sides of the join carry a `date` column.
latestVisitorDFExtended = (
    latestVisitorDF
    .join(cleanVisitorDF, ["visitor_id", "visit_start"])
    .select("visitor_id", "visit_start", latestVisitorDF.date, "country", "region")
)

23/09/07 18:08:53 WARN CacheManager: Asked to cache already cached data.
+-----------+-------------------+----------+-------+------+
| visitor_id|        visit_start|      date|country|region|
+-----------+-------------------+----------+-------+------+
| 1754585810|2021-03-19 03:46:54|2021-03-19|    deu|    nw|
|22732060101|2021-03-19 06:38:19|2021-03-19|    fra|   idf|
|15523596535|2021-03-19 07:02:48|2021-03-19|    deu|    sn|
| 2538218538|2021-03-19 07:18:47|2021-03-19|    deu|    nw|
|30266526872|2021-03-19 07:58:28|2021-03-19|    deu|    sl|
| 4378517186|2021-03-19 08:00:39|2021-03-19|    deu|    he|
| 1262411058|2021-03-19 08:02:09|2021-03-19|    deu|    nw|
|14657567712|2021-03-19 08:02:11|2021-03-19|    deu|    by|
|18738898137|2021-03-19 08:11:07|2021-03-19|    deu|    ni|
| 5022402273|2021-03-19 08:14:44|2021-03-19|    deu|    by|
| 5532489524|2021-03-19 08:31:07|2021-03-19|    esp|    pm|
|10432905542|2021-03-19 08:48:08|2021-03-19|    deu|    nw|
| 4304304726|2021-03-19 09:

In [127]:
# Attach each visitor's latest daily visit (country / region) to the searches
# made by the same visitor on the same date.
# NOTE(review): the join key list used to be [""], which names no column.
# ("visitor_id", "date") are the keys both frames share and the keys
# latestVisitorDF is grouped by - confirm this is the intended join.
latestVisitorDFExtended.join(cleanSearchesDF,["visitor_id","date"])

21847

In [133]:
# Spot check: print up to 100 cleaned rows for one visitor id, newest visit first.
(
    cleanVisitorDF
    .filter("visitor_id=25117075546")
    .orderBy(cleanVisitorDF.visit_start.desc())
    .show(100)
)

+-----------+-------+------------------+--------+---------+------+----------+-------------------+-----------+------+----------+
|countPerday|country|first_hit_pagename|hits_avg|logged_in|region|registered|        visit_start| visitor_id|visits|      date|
+-----------+-------+------------------+--------+---------+------+----------+-------------------+-----------+------+----------+
|       9216|    deu|              null|      34|     true|    bw|     false|2021-05-01 11:08:52|25117075546|     1|2021-05-01|
|       8935|    deu|              null|      16|     true|    nw|     false|2021-05-01 11:02:10|25117075546|     1|2021-05-01|
|       8817|    deu|              null|       7|    false|    nw|     false|2021-05-01 10:59:09|25117075546|     1|2021-05-01|
|       8753|    deu|              null|       3|    false|    ni|     false|2021-05-01 10:57:45|25117075546|     1|2021-05-01|
|       8224|    deu|            Select|       9|    false|    bw|     false|2021-05-01 10:44:41|2511707