In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import util
from config import *
import logging

In [7]:
logging.basicConfig(level=logging.INFO)
paths:Paths=Paths(
    dataLake="../DataLake/"
    ,srcSearches="../searches/"
    ,srcVisitors="../visitors/"
    ,rawSearches="../DataLake/raw/searches/"
    ,rawVisitors="../DataLake/raw/visitors/"
    ,ezSearches="../DataLake/ez/searches/"
    ,ezVisitors="../DataLake/ez/visitors/"
)
util.rawZoneSetup(paths)

TASK 1

In [8]:
newSearchesFiles=util.getNewFiles(paths.srcSearches,paths.rawSearches)
newVisitorFiles=util.getNewFiles(paths.srcVisitors,paths.rawVisitors)
if(len(newSearchesFiles)==0 and len(newVisitorFiles)==0):
    logging.info("No New Files found from source")

INFO:root:getNewFiles(): path ../searches/, new Files Count 2
INFO:root:getNewFiles(): path ../visitors/, new Files Count 2


In [None]:
spark=SparkSession.builder.appName("test").getOrCreate()

In [9]:
for files in newSearchesFiles:
    ts=util.getTsFromFileName(files)
    spark.read.json(f"{paths.srcSearches}/{files}").coalesce(1).write.options(header="True",compression="snappy").parquet(f"{paths.rawSearches}/{ts}")

for files in newVisitorFiles:
    ts=util.getTsFromFileName(files)
    spark.read.json(f"{paths.srcVisitors}/{files}").coalesce(1).write.options(header="True",compression="snappy").parquet(f"{paths.rawVisitors}/{ts}")

                                                                                

TASK 2

In [10]:
visitorStruct = StructType([
    StructField("countPerday", StringType()),
    StructField("country", StringType()),
    StructField("first_hit_pagename", StringType()),
    StructField("hits_avg", StringType()),
    StructField("logged_in", StringType()),
    StructField("region", StringType()),
    StructField("registered", StringType()),
    StructField("visit_start", StringType()),
    StructField("visitor_id", StringType()),
    StructField("visits", StringType())
])

def dateHandler(dateStr:str)->str:
    year=dateStr.split("-")[0]
    if(len(year)==2):
        dateStr="20"+dateStr
    return dateStr

udateHandler=udf(dateHandler)

In [86]:
def cleanVisitor(df:DataFrame)->DataFrame:
    df=df\
    .withColumn("hits_avg",df["hits_avg"].cast(IntegerType()))\
    .withColumn("logged_in",df["logged_in"].cast(BooleanType()))\
    .withColumn("visit_start", udateHandler(df.visit_start) )\
    .withColumn("visits",df.visits.cast(IntegerType()))
    
    df=df\
    .withColumn("visit_start", to_timestamp(df.visit_start, "yyyy-MM-dd HH:mm:ss"))

    df=df\
    .withColumn("date", date_format(df.visit_start,"yyyy-MM-dd").cast(DateType()))\
    .withColumn("visitor_id",df.visitor_id.cast(LongType()))

    df=df.na.fill(0,["visitor_id"])
    return df

def cleanSearches(df:DataFrame)->DataFrame:
    df= df\
    .withColumn("date_time",to_timestamp(df.date_time, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))\
    .withColumn("visitor_id", when(trim(df.visitor_id) == "null",lit(0) ).otherwise(df.visitor_id))

    df=df.na.fill("0",["visitor_id"])\
    .withColumn("date", date_format(df.date_time,"yyyy-MM-dd").cast(DateType()))

    df=df.withColumn("visitor_id",df.visitor_id.cast( LongType() ))
    return df

In [101]:
rawVisitorDF= spark.read.options(header="True").options(inferSchema="True").parquet(f"{paths.rawVisitors}/*").cache()
logging.info(f" rawVisitorPartitions = {rawVisitorDF.rdd.getNumPartitions()} , rawVisitorDF.count={rawVisitorDF.count()} ")
cleanVisitorDF=rawVisitorDF.transform(cleanVisitor)
rawVisitorDF.unpersist()

rawSearchesDF= spark.read.options(header="True").options(inferSchema="True").parquet(f"{paths.rawSearches}/*").cache()
logging.info(f" rawSearchesPartitions={rawSearchesDF.rdd.getNumPartitions()} , rawSearchesPartitions ={rawSearchesDF.count()}")
cleanSearchesDF=rawSearchesDF.transform(cleanSearches)
rawSearchesDF.unpersist()

INFO:root: rawVisitorPartitions = 2 , rawVisitorDF.count=29997 
INFO:root: rawSearchesPartitions=2 , rawSearchesPartitions =26395


DataFrame[date_time: string, destination_out: string, destination_ret: string, flight_date_inbound: string, flight_date_outbound: string, origin_out: string, origin_ret: string, segments: bigint, visitor_id: double]

In [None]:
latestVisitorDF=cleanVisitorDF.groupBy("visitor_id","date").agg(max("visit_start").alias("visit_start")).cache()

latestVisitorDFExtended=latestVisitorDF\
                        .join(cleanVisitorDF,["visitor_id","visit_start"])\
                        .select("visitor_id","visit_start",latestVisitorDF.date,"country","region")\
                        .withColumnRenamed("visit_start","date_time")


latestVisitorDFExtended.filter("visitor_id=25117075546").show(10)
cleanSearchesDF.printSchema()

cleanVisitorDF.filter("visitor_id=25117075546").orderBy(desc(cleanVisitorDF.visit_start)).show(100)

In [164]:
cleanSearchesDF\
.join(latestVisitorDFExtended,["visitor_id","date"],"left_outer")\
.groupBy(cleanSearchesDF.date,"country","region")\
.agg(count("*"))\
.show()



+----------+-------+------+--------+
|      date|country|region|count(1)|
+----------+-------+------+--------+
|2021-01-27|   null|  null|      56|
|2021-05-12|   null|  null|     179|
|2021-04-24|   null|  null|     210|
|2021-02-15|   null|  null|     185|
|2021-05-03|   null|  null|     209|
|2021-03-22|   null|  null|     201|
|2021-03-19|    fra|   idf|       2|
|2021-01-25|   null|  null|     203|
|2021-03-07|   null|  null|     193|
|2021-05-07|   null|  null|     206|
|2021-04-07|   null|  null|     211|
|2021-04-15|   null|  null|     207|
|2021-01-05|   null|  null|     215|
|2021-04-23|   null|  null|     178|
|2021-02-21|   null|  null|     211|
|2021-02-05|   null|  null|     200|
|2021-03-21|   null|  null|     168|
|2021-02-10|   null|  null|     187|
|2021-01-19|   null|  null|     229|
|2021-02-06|   null|  null|     218|
+----------+-------+------+--------+
only showing top 20 rows



                                                                                

In [147]:
cleanSearchesDF.count()

26395

In [153]:
cleanSearchesDF.groupBy("date").agg(count("*")).show()

+----------+--------+
|      date|count(1)|
+----------+--------+
|2021-01-27|     191|
|2021-05-12|     179|
|2021-04-29|     218|
|2021-04-24|     210|
|2021-02-15|     185|
|2021-03-22|     201|
|2021-05-03|     209|
|2021-01-18|     197|
|2021-01-25|     203|
|2021-02-02|     202|
|2021-04-25|     213|
|2021-03-07|     193|
|2021-05-07|     206|
|2021-04-21|     197|
|2021-02-26|     168|
|2021-04-07|     211|
|2021-04-15|     207|
|2021-05-09|     209|
|2021-01-05|     215|
|2021-04-23|     178|
+----------+--------+
only showing top 20 rows

