# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 60
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Bootstrap the handles the rest of the notebook relies on.
# Reuse an existing SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# GlueContext wraps the SparkContext; it is used below to read the raw CSV
# files through create_dynamic_frame.from_options.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext.
spark = glueContext.spark_session
# Glue Job object bound to this context (not referenced again in the visible cells).
job = Job(glueContext)

In [6]:
from datetime import *

# The raw bucket is read from the folder named after today's date (YYYY-MM-DD).
current_date = datetime.today().strftime('%Y-%m-%d')
s3_path = f's3://video-raw-bucket/{current_date}/'

# Reader settings are spelled out separately so the call itself stays short.
raw_connection = {"paths": [s3_path]}
csv_format = {"withHeader": True}

dynamic_frame = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options=raw_connection,
    format="csv",
    format_options=csv_format,
)

# Quick sanity check of what was read, then switch to a Spark DataFrame,
# which every following cell works on.
dynamic_frame.printSchema()
dynamic_frame.count()
df = dynamic_frame.toDF()

root
|-- DateTime: string
|-- VideoTitle: string
|-- events: string
|-- id: string

995676


## Layout Check

In [7]:

# Imported here so the cell runs on a fresh kernel (Restart & Run All).
from pyspark.sql.functions import col, regexp_replace, size, split, trim

# Normalise the title first: strip surrounding whitespace, then remove any
# whitespace that directly follows a "|" separator.
df = df.withColumn("VideoTitle", trim(col("VideoTitle")))
df = df.withColumn("VideoTitle", regexp_replace(col("VideoTitle"), r'\|\s+', '|'))

# Layout rule: a title is kept only if it splits on "|" into
#   * exactly 2 parts whose first part is "news", or
#   * exactly 4 parts whose first part is anything other than "news".
# The split is used as a column expression, so no helper column has to be
# added to (and later dropped from) the frame.
title_parts = split(col("VideoTitle"), "\\|")
is_news_layout = (size(title_parts) == 2) & (title_parts[0] == "news")
is_video_layout = (size(title_parts) == 4) & (title_parts[0] != "news")

df = df.filter(is_news_layout | is_video_layout)




## Garbled or unreadable characters

In [None]:
from functools import reduce
from operator import or_

# `col` is imported here so the cell runs on a fresh kernel; `size` and `lit`
# are kept because the original cell exposed them to the notebook namespace.
from pyspark.sql.functions import col, lit, size

# Null values.
# Build one boolean expression that is true when ANY column of the row is
# null, then keep only the rows for which it is false.
null_checks = [col(column).isNull() for column in df.columns]

# Guard against a frame with no columns: there is nothing to test, and
# negating an empty condition would raise a TypeError.
if null_checks:
    condition = reduce(or_, null_checks)
    # anomalies_df = df.filter(condition) would give the rejected rows.
    df = df.filter(~condition)

df.count()

## Outliers

In [None]:
# `col` is imported here so the cell runs on a fresh kernel.
from pyspark.sql.functions import col, length, to_date
from pyspark.sql.types import StringType, ArrayType

date_cols = ["DateTime", "VideoTitle", "events"]
max_length = 150

# Row count before each rule is applied, so that the number of rows a rule
# rejects (the "fault" rows) can be reported rather than the survivors.
rows_before = df.count()

for col_name in date_cols:
    if col_name == "DateTime":
        # Rows whose DateTime does not parse with the expected pattern get a
        # null in the helper column and are dropped.
        df = df.withColumn("DateTime_test", to_date(col_name, "yyyy-MM-dd HH:mm:ss"))\
            .filter(col("DateTime_test").isNotNull())\
            .drop("DateTime_test")
    elif col_name == "VideoTitle":
        # Titles longer than max_length characters are rejected.
        df = df.filter(length(col(col_name)) <= max_length)
    elif col_name == "events":
        # Rows whose events value contains "206" are rejected.
        df = df.filter(~col(col_name).contains("206"))

    rows_after = df.count()
    print(f"the number of fault {col_name} is:", rows_before - rows_after)
    rows_before = rows_after


# Build Data Warehouse

## DimDate

In [None]:
from pyspark.sql.functions import col, hour, minute, dayofweek, month, quarter, year, to_timestamp
from pyspark.sql.functions import concat, lit, row_number
from pyspark.sql.window import Window

# Calendar attributes derived from the timestamp, in output order.
calendar_parts = [
    ("Hour", hour),
    ("Minute", minute),
    ("DayOfWeek", dayofweek),
    ("Month", month),
    ("Quarter", quarter),
    ("Year", year),
]

# Parse the string column into a real timestamp, then add one column per
# calendar attribute.
df_time = df.withColumn("DateTime", to_timestamp(col("DateTime"), "yyyy-MM-dd HH:mm:ss"))
for part_name, extract in calendar_parts:
    df_time = df_time.withColumn(part_name, extract("DateTime"))

# Several events can share the same timestamp; the dimension needs one row each.
df_time = df_time.dropDuplicates(["DateTime"])

# Surrogate key: timestamps numbered in ascending order.
windowSpec = Window.orderBy("DateTime")
df_time = df_time.withColumn("DateKey", row_number().over(windowSpec))

# Keep only the dimension's own columns.
DimDate = df_time.select("DateKey", "DateTime", *[name for name, _ in calendar_parts])

# Preview the result.
DimDate.show(2, truncate=False)
DimDate.count()


## Build DimPlatform, DimVideoType, DimVideoName

In [None]:
# `when` was never imported anywhere in the notebook; bring it in here.
from pyspark.sql.functions import split, col, when

# Split the title on "|" again as a column expression. The helper column
# "VideoTitle_split" built during the layout check is dropped there, so it
# cannot be referenced by name at this point.
title_parts = split(col("VideoTitle"), "\\|")
is_news = title_parts[0] == "news"

# Create the columns DimPlatform, DimVideoType, DimVideoName and DimvideoTopic
# from the split result.
#   news titles   : platform | topic               -> type and name are "Unknow"
#   other titles  : platform | type | name | topic
# NOTE(review): "Unknow" looks like a typo for "Unknown"; left unchanged
# because it is a value written to the warehouse.
df_video = df\
    .withColumn("DimPlatform", title_parts[0]) \
    .withColumn("DimVideoType", when(is_news, "Unknow").otherwise(title_parts[1])) \
    .withColumn("DimVideoName", when(is_news, "Unknow").otherwise(title_parts[2])) \
    .withColumn("DimvideoTopic", when(is_news, title_parts[1]).otherwise(title_parts[3]))

# Fold the plural spellings of the video type into the singular ones.
df_video = df_video.withColumn("DimVideoType",
                               when(col("DimVideoType") == "Clips", "Clip")
                               .when(col("DimVideoType") == "Episodes", "Episode")
                               .otherwise(col("DimVideoType")))
df_video.show(1)

In [None]:
# Surrogate key: distinct platform values numbered in ascending order.
windowSpec = Window.orderBy("DimPlatform")

df_platform = (
    df_video
    .select("DimPlatform")
    .distinct()
    .withColumn("platform_id", row_number().over(windowSpec))
)

# Final DimPlatform layout: key first, then the value under its warehouse name.
DimPlatform = (
    df_platform
    .select("platform_id", "DimPlatform")
    .withColumnRenamed("DimPlatform", "platform")
)

# Preview the DimPlatform table.
DimPlatform.show(truncate=False)

In [None]:
from pyspark.sql.functions import asc

# Surrogate key: distinct video types numbered in ascending order.
windowSpec = Window.orderBy(asc("DimVideoType"))

distinct_types = df_video.select("DimVideoType").distinct()
df_videotype = distinct_types.withColumn("videotype_id", row_number().over(windowSpec))

# Final DimVideoType layout: key first, then the value under its warehouse name.
DimVideoType = (
    df_videotype
    .select("videotype_id", "DimVideoType")
    .withColumnRenamed("DimVideoType", "videotype")
)

# Preview the DimVideoType table.
DimVideoType.show(truncate=False)

In [None]:
# Surrogate key: distinct video names numbered in ascending order.
windowSpec = Window.orderBy(asc("DimVideoName"))

distinct_names = df_video.select("DimVideoName").distinct()
df_videoname = distinct_names.withColumn("videoname_id", row_number().over(windowSpec))

# Final DimVideoName layout: key first, then the value under its warehouse name.
DimVideoName = (
    df_videoname
    .select("videoname_id", "DimVideoName")
    .withColumnRenamed("DimVideoName", "videoname")
)

# Preview the DimVideoName table and report its size.
DimVideoName.show(5, truncate=False)

DimVideoName.count()

In [None]:
# Source column -> name used in the topic table, in output order.
topic_renames = {
    "DimvideoTopic": "videoTopic",
    "DimVideoName": "videoName",
    "DimVideoType": "videoType",
    "DimPlatform": "videoPlatform",
}

# One row per topic, numbered in ascending topic order.
windowSpec = Window.orderBy(asc("DimvideoTopic"))
df_topic = (
    df_video
    .dropDuplicates(["DimvideoTopic"])
    .withColumn("topic_id", row_number().over(windowSpec))
    .select("topic_id", *topic_renames)
)

# Apply the renames one by one.
for source_name, target_name in topic_renames.items():
    df_topic = df_topic.withColumnRenamed(source_name, target_name)

df_topic.show(3, truncate=False)

In [None]:
# Attach the surrogate keys of the name, type and platform dimensions to each
# topic row.
# The result goes into its own variable instead of overwriting df_topic, so
# the cell can be re-run safely. Joining an already-joined df_topic would
# duplicate the key columns and make `df_topic.videoName` ambiguous with the
# `videoname` column from DimVideoName (Spark resolves names case-insensitively
# by default).
topic_with_keys = df_topic.join(DimVideoName, df_topic.videoName == DimVideoName.videoname, "left") \
                          .join(DimVideoType, df_topic.videoType == DimVideoType.videotype, "left") \
                          .join(DimPlatform, df_topic.videoPlatform == DimPlatform.platform, "left")

topic_with_keys.show(2, truncate=False)

# The topic dimension keeps only its own key, the topic text and the foreign keys.
DimVideoTopic = topic_with_keys.select("topic_id", "videoTopic", "videoname_id", "videotype_id", "platform_id")
DimVideoTopic.show(2)
DimVideoTopic.count()

## Fact Table

In [None]:
# Columns of the fact table, in output order.
fact_columns = ["record_id", "user_id", "DateTime", "videoTopic", "events"]

# Every event row is matched to its topic row; unmatched events are kept
# (left join).
events_with_topic = df_video.join(
    df_topic,
    df_video.DimvideoTopic == df_topic.videoTopic,
    "left",
)

# record_id numbers the events in ascending DateTime order.
windowSpec = Window.orderBy(asc("DateTime"))
numbered_events = events_with_topic.withColumn("record_id", row_number().over(windowSpec))

FactVideo = numbered_events.withColumnRenamed("id", "user_id").select(*fact_columns)

FactVideo.show(10)

In [None]:

# Explicit import so this cell does not depend on the star import of an
# earlier cell.
from datetime import datetime

# Every table is written below its own folder, in a sub-folder named after
# today's date.
date_today = datetime.now().strftime("%Y-%m-%d")
output_path1 = f's3://video-curated-bucket/data-warehouse/dimdate/{date_today}'
output_path2 = f's3://video-curated-bucket/data-warehouse/dimplatform/{date_today}'
output_path3 = f's3://video-curated-bucket/data-warehouse/dimvideotype/{date_today}'
output_path4 = f's3://video-curated-bucket/data-warehouse/dimvideoname/{date_today}'
output_path5 = f's3://video-curated-bucket/data-warehouse/dimvideotopic/{date_today}'
output_path6 = f's3://video-curated-bucket/data-warehouse/factvideo/{date_today}'


def _write_csv(frame, path, single_file=False):
    """Write `frame` to `path` as CSV with a header row, replacing existing output.

    Parameters
    ----------
    frame : DataFrame to write.
    path : destination folder.
    single_file : when True, the frame is coalesced to one partition before
        writing.
    """
    if single_file:
        frame = frame.coalesce(1)
    frame.write \
        .mode('overwrite') \
        .option("header", "true") \
        .csv(path)


# Same order and same partitioning as the hand-written writers this replaces.
# The DimVideoType writer was missing a line continuation after `.write`,
# which made the whole cell a syntax error.
_write_csv(FactVideo, output_path6)
_write_csv(DimPlatform, output_path2, single_file=True)
_write_csv(DimDate, output_path1)
_write_csv(DimVideoType, output_path3, single_file=True)
_write_csv(DimVideoName, output_path4, single_file=True)
_write_csv(DimVideoTopic, output_path5)