In [0]:
import datetime
from pyspark.sql.functions import col   
from pyspark.sql.functions import col, to_timestamp, date_format,concat, trunc,from_unixtime,lit

In [0]:
# Source layout: per the original author's note, some of the daily GA session
# tables carry an extra `clientId` attribute, so the tables are loaded and
# unioned in two splits. Later cells keep only the selected attributes from
# each split and union both into one DataFrame for the transformation task.

# SPLIT ONE - load every daily table in [start_date, end_date] and union them.
start_date = "20160801"
end_date = "20160803"  # short range, used while testing
# end_date = "20170630"  # presumably the full range for this split; the original
#                        # note said "20170631", which is not a calendar date - TODO confirm

# Walk the range one calendar day at a time. The dates are zero-padded
# YYYYMMDD strings, so plain string comparison orders them chronologically.
table_names = []
current_date = start_date
while current_date <= end_date:
    table_names.append(
        "bigquery-public-data.google_analytics_sample.ga_sessions_" + current_date
    )
    next_day = datetime.datetime.strptime(current_date, "%Y%m%d") + datetime.timedelta(days=1)
    current_date = next_day.strftime("%Y%m%d")

# One DataFrame per daily table, in date order.
dfs1 = [
    spark.read.format("bigquery").option("table", name).load()
    for name in table_names
]

# Stack the daily frames into `df`.
# NOTE(review): DataFrame.union matches columns by position, so this relies on
# every table in the split having the same schema - confirm.
df = dfs1[0]
for daily_df in dfs1[1:]:
    df = df.union(daily_df)

In [0]:
# SPLIT TWO - load the daily tables that contain `clientId`, union them, and
# drop `clientId` at the end.
start_date = "20170701"
end_date = "20170702"  # short range (two daily tables), used while testing
# end_date = "20170801"  # presumably the full range for this split - TODO confirm

# Walk the range one calendar day at a time. The dates are zero-padded
# YYYYMMDD strings, so plain string comparison orders them chronologically.
table_names = []
current_date = start_date
while current_date <= end_date:
    table_names.append(
        "bigquery-public-data.google_analytics_sample.ga_sessions_" + current_date
    )
    next_day = datetime.datetime.strptime(current_date, "%Y%m%d") + datetime.timedelta(days=1)
    current_date = next_day.strftime("%Y%m%d")

# One DataFrame per daily table, in date order.
dfs2 = [
    spark.read.format("bigquery").option("table", name).load()
    for name in table_names
]

# Stack the daily frames into `df2`.
# NOTE(review): DataFrame.union matches columns by position, so this relies on
# every table in the split having the same schema - confirm.
df2 = dfs2[0]
for daily_df in dfs2[1:]:
    df2 = df2.union(daily_df)

# Remove the attribute that only this split carries.
df2 = df2.drop("clientId")

In [0]:
# FEATURE SELECTION FOR AGGREGATION
# Source field paths to keep, in output column order. Each field is exposed
# under the last segment of its path unless listed in `alias_overrides`.
# NOTE(review): if `hits` is an array of structs, every `hits.*` selection
# yields an array column rather than a scalar - confirm against the schema.
source_fields = [
    # session identifiers
    "visitorId",
    "visitNumber",
    "visitId",
    "visitStartTime",
    "date",
    # totals
    "totals.visits",
    "totals.hits",
    "totals.pageviews",
    "totals.bounces",
    "totals.newVisits",
    "totals.transactionRevenue",
    "totals.transactions",
    "totals.timeOnSite",
    "totals.sessionQualityDim",
    # traffic source
    "trafficSource.referralPath",
    "trafficSource.campaign",
    "trafficSource.source",
    "trafficSource.medium",
    "trafficSource.adwordsClickInfo.criteriaParameters",
    # device
    "device.browser",
    "device.operatingSystem",
    "device.isMobile",
    "device.deviceCategory",
    # geography / network
    "geoNetwork.continent",
    "geoNetwork.subContinent",
    "geoNetwork.country",
    "geoNetwork.networkDomain",
    # hits
    "hits.hitNumber",
    "hits.time",
    "hits.hour",
    "hits.minute",
    "hits.isInteraction",
    "hits.isEntrance",
    "hits.isExit",
    "hits.referer",
    "hits.page.pagePath",
    "hits.page.hostname",
    "hits.page.pageTitle",
    "hits.page.pagePathLevel1",
    "hits.page.pagePathLevel2",
    "hits.page.pagePathLevel3",
    "hits.appInfo.screenName",
    "hits.appInfo.landingScreenName",
    "hits.appInfo.exitScreenName",
    "hits.appInfo.screenDepth",
    "hits.exceptionInfo.isFatal",
    "hits.eCommerceAction.action_type",
    "hits.eCommerceAction.step",
    "hits.social.socialNetwork",
    "hits.contentGroup.contentGroup1",
    "hits.contentGroup.contentGroup2",
    "hits.contentGroup.previousContentGroup1",
    "hits.contentGroup.previousContentGroup2",
    "hits.dataSource",
    # remaining top-level attributes
    "fullVisitorId",
    "channelGrouping",
    "socialEngagementType",
]

# The only output name that is not simply the last path segment.
alias_overrides = {"totals.transactions": "transaction"}

selected_col = [
    col(path).alias(alias_overrides.get(path, path.rsplit(".", 1)[-1]))
    for path in source_fields
]

In [0]:
# UNION INTO ONE
# Project both splits onto the same flat column list (`selected_col`), then
# append split two to split one. Both projections use the same list, so the
# positional union lines the columns up.
# NOTE(review): `df` is overwritten with the flattened frame, so this cell is
# not re-runnable on its own - paths such as "totals.visits" no longer resolve
# against the flattened columns. Re-run the loading cells first.
df2 = df2.select(*selected_col)
df = df.select(*selected_col).union(df2)

In [0]:
# GENERATING TIMESTAMPS FOR THE GIVEN JOB CONFIGURATION IN THE TRANSFORMATION PROCESS
# Every bucket column is a string rendered with this pattern.
iso_seconds = "yyyy-MM-dd'T'HH:mm:ss"

# Temporary column: the session's `date` string (parsed as yyyyMMdd) as a timestamp.
df = df.withColumn("timestamp_daily_formatted", to_timestamp(col("date"), "yyyyMMdd"))
session_day = col("timestamp_daily_formatted")

# Daily bucket: the parsed day itself.
df = df.withColumn("timestamp_daily", date_format(session_day, iso_seconds))

# Coarser buckets: the parsed day truncated to the start of each period.
period_buckets = (
    ("timestamp_weekly", "week"),
    ("timestamp_monthly", "month"),
    ("timestamp_quarterly", "quarter"),
    ("timestamp_yearly", "year"),
)
for bucket_column, unit in period_buckets:
    df = df.withColumn(bucket_column, date_format(trunc(session_day, unit), iso_seconds))

# Hourly bucket: render `visitStartTime` (passed to from_unixtime, i.e. treated
# as epoch seconds), keep the text up to the hour and pad minutes/seconds with
# zeros. Both temporary columns are dropped afterwards.
df = (
    df
    .withColumn("hourly_timestamp", from_unixtime(col("visitStartTime"), iso_seconds))
    .withColumn(
        "timestamp_hourly",
        concat(date_format("hourly_timestamp", "yyyy-MM-dd'T'HH"), lit(":00:00")),
    )
    .drop("timestamp_daily_formatted", "hourly_timestamp")
)

In [0]:
# Check whether a table named "extract_df" is currently registered in the
# catalog; the boolean result is shown as this cell's output.
spark.catalog.tableExists("extract_df")

Out[21]: False

In [0]:
# Delete whatever is stored at the path that the final write cell targets.
# The second argument (True) is presumably the `recurse` flag, i.e. remove the
# directory and its contents - confirm against the dbutils.fs.rm reference.
dbutils.fs.rm("dbfs:/user/hive/warehouse/extract_df", True)

Out[22]: True

In [0]:
# Remove the catalog entry for extract_df if one exists. The final write calls
# saveAsTable without an explicit write mode.
# NOTE(review): the default mode is expected to fail when the table already
# exists, which is why it is dropped first - confirm.
spark.sql("DROP TABLE IF EXISTS extract_df")

Out[19]: DataFrame[]

In [0]:
# Persist the transformed frame in Delta format at an explicit storage path
# and register it in the catalog as `extract_df`. No write mode is set, so the
# writer's default applies.
(
    df.write
    .format("delta")
    .option("path", "dbfs:/user/hive/warehouse/extract_df")
    .saveAsTable("extract_df")
)