In [4]:
import re
import os
import logging
import datetime
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
from general_functions import *
import matplotlib.pyplot as plt
import matplotlib.pyplot as pyplt
from scipy.interpolate import make_interp_spline
from scipy.ndimage import gaussian_filter1d


# Root logging configuration: INFO and above, each record prefixed with its timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
)

# Named logger used by the code cells of this notebook.
logger = logging.getLogger('Data_Processing')
logger.info('main.py Script started')

2024-06-07 02:01:38,423 - main.py Script started


In [5]:

def apply_transformations(spark, destination_path, source_path="C:/Users/Admin/Downloads/stream_task/data/raw"):
    """
    Stream raw click JSON files, count clicks per publisher in 10-minute
    event-time windows and print the running result to the console.

    Parameters:
    spark: SparkSession used to open the file stream. The JSON source is read
        without an explicit schema, so the session is expected to have
        "spark.sql.streaming.schemaInference" enabled by the caller.
    destination_path: base directory. Processed source files are archived under
        "<destination_path>/Files_done/here/" and the streaming checkpoint is
        kept under "<destination_path>/checkpoint/".
    source_path: directory watched for incoming JSON files. Defaults to the
        location that used to be hard-coded in this function.

    Returns:
    The string "<destination_path>/output/" once the streaming query has
    terminated, or None if any step raised (the error is logged with its
    traceback and not re-raised).

    Note: the query writes to the console sink only; this function does not
    write anything to the returned output path.
    """
    try:
        logger.info('stream_processing_Script started')

        # One file per micro-batch; files that have been processed are moved
        # to the archive directory by the file source.
        df = (
            spark.readStream
            .option("cleanSource", "archive")
            .option("sourceArchiveDir", f"{destination_path}/Files_done/here/")
            .option("maxFilesPerTrigger", 1)
            .format("json")
            .load(source_path)
        )

        # Flatten the nested "ip_geo" and "query" structs into top-level columns.
        df = df.select('*', "ip_geo.*", "query.*").drop("query", "ip_geo")

        # Flattening can produce clashing column names; the project helpers
        # rename them to unique names and then drop the duplicates.
        df = df.toDF(*get_unique_column_names(df.columns))
        df = df.drop(*get_duplicate_column_names(df))

        # Strip the last "_<suffix>" from every column name,
        # e.g. column_name_1 --> column_name.
        # NOTE(review): presumably every column carries a numeric suffix added
        # by get_unique_column_names at this point - confirm against the helper.
        df = df.toDF(*[column.rsplit("_", 1)[0] for column in df.columns])

        df = df.withColumn("real_filepath", input_file_name())
        df = df.withColumn("click_time", col("click_time").cast("timestamp"))

        # Keep only the file name: the last "/"-separated piece of the path.
        df = df.withColumn("actual_file", split(col("real_filepath"), '/', limit=-1))
        df = df.withColumn("count_file", size(col("actual_file")))
        df = df.withColumn("actual_file", col("actual_file")[col("count_file") - 1]).drop("count_file")

        # The project UDF derives the file creation time from the file name.
        df = df.withColumn("file_creation_date", get_file_generation_date_udf(col("actual_file")))
        df = df.withColumn("file_creation_date", to_timestamp("file_creation_date", "yyyy-MM-dd HH-mm"))

        # Resolve the publisher column name before nulls are replaced, as the
        # original code did.
        publisher_id = get_publisher_id_column_name(df)
        # Replace nulls in string columns with the literal text "null".
        df = df.na.fill("null")
        df = df.withColumnRenamed(publisher_id, "publisher_id")
        df = df.select("publisher_id", "file_creation_date", "actual_file", "click_time")

        # Ids longer than 6 characters are reduced to their leading
        # "va-ddd" / "VA-ddd" prefix. Raw string: "\d" is not a valid escape
        # in a normal string literal.
        df = df.withColumn(
            "publisher_id",
            when(
                length(col("publisher_id")) > 6,
                regexp_extract(col("publisher_id"), r"^(va-\d{3})|^(VA-\d{3})", 0),
            ).otherwise(col("publisher_id")),
        )

        # Count clicks per 10-minute window, publisher and source file.
        # The watermark is what allows windows to be finalised in append mode.
        df = (
            df.withWatermark("click_time", "10 minutes")
            .groupBy(window("click_time", "10 minutes"), "publisher_id", "file_creation_date", "actual_file")
            .agg(count("publisher_id").alias("total_clicks"))
        )

        # Calendar day of the file creation time, stored as a midnight timestamp.
        df = df.withColumn("date", split(col("file_creation_date"), " ").getItem(0))
        df = df.withColumn("date", to_timestamp("date", "yyyy-MM-dd"))

        query = (
            df.writeStream
            .format("console")
            .option("checkpointLocation", f"{destination_path}/checkpoint/")
            .trigger(processingTime="30 seconds")
            .outputMode("append")
            .start()
        )
        # Blocks until the query is stopped or fails.
        query.awaitTermination()
        return f"{destination_path}/output/"
    except Exception as e:
        # Same best-effort contract as before (never raises), but the failure
        # is now logged at ERROR level together with its traceback.
        logger.exception(f"Error has been encountered at apply_transformations {e}")
        return None



In [8]:
# Driver cell: build the local SparkSession and run the streaming pipeline.
try:
    if __name__ == "__main__":

        # Forward slashes so the path can be embedded in the Spark option
        # strings built by apply_transformations.
        currect_working_directory = os.getcwd().replace("\\", "/")
        logger.info(f"current working directory: {currect_working_directory}")

        spark = (
            SparkSession.builder
            .master("local[*]")
            .appName("stream_procecssing_pipeline_from_s3")
            .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
            .config("spark.executor.memory", "4g")
            .config("spark.driver.memory", "4g")
            .config("spark.cores.max", "3")
            .getOrCreate()
        )
        logger.info("SparkSession Created Successfully")

        # The JSON file stream is opened without an explicit schema, so schema
        # inference has to be enabled for streaming sources.
        spark.conf.set("spark.sql.streaming.schemaInference", True)

        logger.info("apply_transformations function started successfully reading data from location : /data/raw/")
        # Blocks while the streaming query runs; yields None if the pipeline failed.
        destination_path = apply_transformations(spark, currect_working_directory)

        # TODO: the downstream steps (generating_publisher_id_with_maximum_queries,
        # the top-5 publishers CSV export and generatring_line_graph_for_top_5_publishers)
        # are currently disabled.

except Exception as e:
    # Log at ERROR level with the traceback instead of a bare INFO line.
    logger.exception(f"Error has been encountered at main {e}")

2024-06-07 02:03:29,065 - current working directory: c:/Users/Admin/Downloads/stream_task/Processing
2024-06-07 02:03:29,075 - SparkSession Created Successfully
2024-06-07 02:03:29,076 - apply_transformations function started successfully reading data from location : /data/raw/
2024-06-07 02:03:29,077 - stream_processing_Script started


In [7]:

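# NOTE: every line in this cell is commented out; it is a near-duplicate of the body of apply_transformations and nothing here executes.
# logger.info("current working directory-->>",os.getcwd())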

# spark = SparkSession.builder.master("local[*]").appName("Batch_procecssing_pipeline_from_s3").config("spark.sql.legacy.timeParserPolicy","LEGACY").getOrCreate()
# spark.conf.set("spark.sql.streaming.schemaInference", True)
# df = (spark.readStream.option("cleanSource","archive").option("sourceArchiveDir", "./archived/here/").option("maxFilesPerTrigger", 1).format("json").load(f"/data/"))
# df.printSchema()
# df = df.select('*', "ip_geo.*", "query.*").drop("query", "ip_geo")
# df = df.toDF(*get_unique_column_names(df.columns))

# df = df.drop(*get_duplicate_column_names(df))

# df = df.withColumn("real_filepath", input_file_name())
# df.printSchema()
# df = df.withColumn("click_time" ,col("click_time_1").cast("timestamp"))
# df.printSchema()
# # df.show()
# df = df.withColumn("actual_file" , split(df.real_filepath, '/',limit=-1))
# df = df.withColumn("count_file", size(df.actual_file))
# df = df.withColumn("actual_file" , df.actual_file[col("count_file")-1]).drop("count_file")
# df = df.withColumn("file_creation_date", get_file_generation_date_udf(col("actual_file")))
# df = df.withColumn("file_creation_date", to_timestamp("file_creation_date", "yyyy-MM-dd HH-mm"))
# publisher_id  = get_publisher_id_column_name(df)
# df = df.na.fill("null")
# # df.printSchema()
# # # print("this is the column structure", df.columns)
# df = df.withColumnRenamed(publisher_id, "publisher_id")
# df = df.select("publisher_id", "file_creation_date", "actual_file","click_time")
# df = df.withColumn("publisher_id", when(length(col("publisher_id")) > 6, regexp_extract(col("publisher_id"), "^(va-\d{3})|^(VA-\d{3})",0)).otherwise(col("publisher_id")))

# # df.printSchema()
# df = df.withWatermark("click_time", "10 minutes").groupBy(window("click_time", "10 minutes"),"publisher_id", "file_creation_date", "actual_file").agg(count("publisher_id").alias("total_clicks"))
# # print("this is the window function count", df.count())
# df = df.withColumn("date", split(col("file_creation_date"), " ").getItem(0))
# df = df.withColumn("date", to_timestamp("date", "yyyy-MM-dd"))
# # df = df.withColumn("path", lit(path))
# # df.printSchema()

# # df.write.partitionBy("date").mode("append").format("parquet").save(str(os.getcwd()).replace("\\", "/")+f'/clean1')
# df.writeStream.format("console").trigger(processingTime="30 seconds").outputMode("append").start().awaitTermination()
# logger.info(f"successfully saved data of {path} with partiton column date")