In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import util
from config import *
import logging
import os

In [25]:
logging.basicConfig(level=logging.INFO)
paths:Paths=Paths(
    dataLake="../DataLake/"
    ,srcSearches="../searches/"
    ,srcVisitors="../visitors/"
    ,rawSearches="../DataLake/raw/searches"
    ,rawVisitors="../DataLake/raw/visitors"
    ,ezSearches="../DataLake/ez/searches"
    ,ezVisitors="../DataLake/ez/visitors"
    ,archive="../archive/"
    ,archiveSearches="../archive/searches"
    ,archiveVisitors="../archive/visitors"
)
util.rawZoneSetup(paths)

Task 1: Data Ingestion

In [5]:
spark=SparkSession\
.builder\
.appName("test")\
.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/09/08 07:17:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [27]:
def dataIngestion(srcFolder,targetFolder,archiveFolder):
    for f in os.listdir(srcFolder):
        srcFile=f"{srcFolder}/{f}"
        if (".json" not in f):os.remove(srcFile)
        else:
            ts=util.getTsFromFileName(f)
            targetPath=f"{targetFolder}/{ts}/"
            spark.read.json(f"{srcFolder}/{f}").coalesce(1).write.mode("append").options(header="True",compression="snappy").parquet(targetPath)
            os.rename(srcFile,f"{archiveFolder}/{f}")
            logging.info(f"Completed dataIngestion {srcFile}")


dataIngestion(paths.srcSearches,paths.rawSearches,paths.archiveSearches)
dataIngestion(paths.srcVisitors,paths.rawVisitors,paths.archiveVisitors)

Task 2: Preprocessing

In [66]:
def cleanVisitor(df:DataFrame)->DataFrame:
    df=df\
    .withColumn("hits_avg",df["hits_avg"].cast(IntegerType()))\
    .withColumn("logged_in",df["logged_in"].cast(BooleanType()))\
    .withColumn("visit_start", udateHandler(df.visit_start) )\
    .withColumn("visits",df.visits.cast(IntegerType()))\
    .withColumn("visitor_id",trim(df.visitor_id.cast(StringType())))

    df=df.withColumn("visit_start", to_timestamp(df.visit_start, "yyyy-MM-dd HH:mm:ss"))
    df=df.withColumn("date", date_format(df.visit_start,"yyyy-MM-dd").cast(DateType()))\
    .na.fill("na",["visitor_id"])
    return df

def cleanSearches(df:DataFrame)->DataFrame:
    df= df\
    .withColumn("date_time",to_timestamp(df.date_time, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))\
    .withColumn("visitor_id",trim(df.visitor_id.cast(StringType())))
    
    df=df\
    .withColumn("date", date_format(df.date_time,"yyyy-MM-dd").cast(DateType()))\
    .na.fill("na",["visitor_id"])
    
    return df

def dateHandler(dateStr:str)->str:
    year=dateStr.split("-")[0]
    if(len(year)==2):
        dateStr="20"+dateStr
    return dateStr

udateHandler=udf(dateHandler)

In [67]:
rawVisitorDF= spark.read.options(header="True").options(inferSchema="True").parquet(f"{paths.rawVisitors}/*").cache()
logging.info(f" rawVisitorPartitions = {rawVisitorDF.rdd.getNumPartitions()} , rawVisitorDF.count={rawVisitorDF.count()} ")
cleanVisitorDF=rawVisitorDF.transform(cleanVisitor)
rawVisitorDF.unpersist()

rawSearchesDF= spark.read.options(header="True").options(inferSchema="True").parquet(f"{paths.rawSearches}/*").cache()
logging.info(f" rawSearchesPartitions={rawSearchesDF.rdd.getNumPartitions()} , rawSearchesPartitions ={rawSearchesDF.count()}")
cleanSearchesDF=rawSearchesDF.transform(cleanSearches)
rawSearchesDF.unpersist()

INFO:root: rawVisitorPartitions = 1 , rawVisitorDF.count=9999 
INFO:root: rawSearchesPartitions=1 , rawSearchesPartitions =13192


DataFrame[date_time: string, destination_out: string, destination_ret: string, flight_date_inbound: string, flight_date_outbound: string, origin_out: string, origin_ret: string, segments: bigint, visitor_id: double]

In [68]:
# cleanVisitorDF.select("visitor_id").filter("visitor_id is null").distinct().show()
visitorDim=cleanVisitorDF.select("visitor_id").distinct().withColumn("visitorkey",monotonically_increasing_id()).cache()
# visitorDim.filter("visitor_id is null").show()
# cleanVisitorDF.join(visitorDim,["visitor_id"],"left_outer").orderBy(asc("visitorkey")).show()

In [69]:
visitorDim.filter("visitorkey = 'null'").show()

+----------+----------+
|visitor_id|visitorkey|
+----------+----------+
+----------+----------+



In [70]:
cleanVisitorDF.join(visitorDim,["visitor_id"],"left_outer").filter("visitorkey is null").show()
# visitorDim.filter("visitorkey is null").show()

+----------+-----------+-------+------------------+--------+---------+------+----------+-----------+------+----+----------+
|visitor_id|countPerday|country|first_hit_pagename|hits_avg|logged_in|region|registered|visit_start|visits|date|visitorkey|
+----------+-----------+-------+------------------+--------+---------+------+----------+-----------+------+----+----------+
+----------+-----------+-------+------------------+--------+---------+------+----------+-----------+------+----+----------+



Task3: Reports

In [35]:
latestVisitorDF=cleanVisitorDF.groupBy("visitor_id","date").agg(max("visit_start").alias("visit_start")).cache()

latestVisitorDFExtended=latestVisitorDF\
                        .join(cleanVisitorDF,["visitor_id","visit_start"])\
                        .select("visitor_id","visit_start",latestVisitorDF.date,"country","region")\
                        .withColumnRenamed("visit_start","date_time")

In [36]:
cleanSearchesDF\
.join(latestVisitorDFExtended,["visitor_id","date"],"left_outer")\
.select("country","region","visitor_id","date")\
.show()

+-------+------+--------------------+----------+
|country|region|          visitor_id|      date|
+-------+------+--------------------+----------+
|   null|  null|1.594288968457053...|2021-03-11|
|   null|  null|1.756565223890326...|2021-01-05|
|   null|  null|1.988182525421821E10|2021-04-17|
|   null|  null|2.036067340216234...|2021-04-02|
|   null|  null|2.198028629380746...|2021-04-03|
|   null|  null|2.463305758825798E10|2021-01-30|
|   null|  null|2.710483719626454...|2021-03-01|
|   null|  null|2.964895272253205E10|2021-03-12|
|   null|  null|3.341345276463134E10|2021-01-01|
|   null|  null|3.591424903713482E10|2021-03-16|
|   null|  null|3.749918802270866E10|2021-05-01|
|   null|  null|3.767071691283598E10|2021-01-25|
|   null|  null| 3.76756188429285E10|2021-02-07|
|   null|  null| 3.76756188429285E10|2021-05-12|
|   null|  null|4.348108097504457E10|2021-02-22|
|   null|  null|4.580088141862797E10|2021-03-29|
|   null|  null|4.940985909838370...|2021-05-11|
|   null|  null| 6.6

In [None]:
# cleanSearchesDF\
# .join(latestVisitorDFExtended,["visitor_id","date"],"left_outer")\
# .filter("country is not null")\
# .groupBy(cleanSearchesDF.date,"country","region")\
# .agg(count("*"))\
# .orderBy(desc(cleanSearchesDF.date))\
# .show()