## PySpark Logging Tutorial

In [51]:
##-- Import required libraries
import os
import re
import pyspark as ps
from pyspark.sql import SparkSession

In [52]:
##-- Build the SparkSession for this tutorial, or reuse one that is already running,
##-- and register it under a descriptive application name
spark = (
    SparkSession.builder
    .appName("PySpark Logging Tutorial")
    .getOrCreate()
)
##-- Set Spark's log level to WARN
spark.sparkContext.setLogLevel("WARN")

In [53]:
##-- Read the directory path from the first line of path.txt.
##-- readline() keeps the trailing newline, so strip surrounding whitespace;
##-- otherwise the newline ends up in the middle of the joined path below.
with open('path.txt') as f:
    dir_path = f.readline().strip()
##-- Combine the directory path with the name of the log file to create a full path
logFile = os.path.join(dir_path,"README.md")

In [54]:
##-- Read the log file into a DataFrame (the text of each line is in the "value" column)
##-- and cache it, because every log_filter call scans the same data again
logData = spark.read.text(logFile).cache()

##-- Count the lines of the log data that match a given criteria.
##-- Written as a named function rather than a lambda bound to a name, so it
##-- carries a docstring and shows up by name in tracebacks.
def log_filter(criteria):
    """Return the number of lines in logData that contain `criteria`.

    Parameters
    ----------
    criteria : object
        Value to look for; it is formatted to text with "%s" before being
        passed to the column's contains() test.

    Returns
    -------
    int
        Count of rows in logData whose "value" column contains the text.
    """
    return logData.filter(logData.value.contains("%s" % criteria)).count()

In [55]:
##-- Define the two criteria once, so the strings that are searched for are
##-- exactly the strings shown in the report line below
c1 = "OutOfMemoryError"
c2 = "FetchFailedException"

##-- Count the number of lines that match each one
c1_count = log_filter(c1)
c2_count = log_filter(c2)

print("Lines with %s: %i, lines with %s: %i" % (c1,c1_count,c2,c2_count))

Lines with OutOfMermoryError: 0, lines with FetchFailedException: 0


In [63]:
def search_logfile(logFile,*argv):
    """Count, for each search criteria, how many lines of a log match it.

    Parameters
    ----------
    logFile : str, os.PathLike, or iterable of str
        Either the path of a text file to read, or an iterable that yields
        the log one line at a time (an open file handle, a list of lines).
        A path is opened and read line by line; iterating the path string
        itself would only walk over its characters.
    *argv : object
        Search criteria. Each one is formatted to text with "%s" and used as
        a regular expression that is searched for anywhere in the line.

    Returns
    -------
    dict
        Maps each criteria (as text) to the number of lines it matched.
        A criteria that is passed more than once is counted only once per
        matching line.

    Raises
    ------
    re.error
        If a criteria is not a valid regular expression.
    OSError
        If logFile is a path that cannot be opened.
    """
    # Compile every distinct pattern once, up front, instead of once per line.
    patterns = {}
    for criteria in argv:
        key = "%s" % criteria
        if key not in patterns:
            patterns[key] = re.compile(key)

    # Every criteria is reported, even when nothing matches it.
    log_results = dict.fromkeys(patterns, 0)

    def _count_matches(lines):
        # Add one to a criteria's total for each line its pattern is found in.
        for line in lines:
            for key, pattern in patterns.items():
                if pattern.search(line):
                    log_results[key] += 1

    if isinstance(logFile, (str, os.PathLike)):
        with open(logFile) as handle:
            _count_matches(handle)
    else:
        _count_matches(logFile)

    return log_results

##-- logFile is only the path; open it so search_logfile walks over the
##-- lines of the file rather than over the characters of the path string
with open(logFile) as log_lines:
    log_results = search_logfile(log_lines,c1,c2)

print(log_results)

{'OutOfMermoryError': 0, 'FetchFailedException': 0}


In [60]:
##-- Stop the SparkSession now that the tutorial is finished.
##-- Run this cell last: the cells above all use `spark`.
spark.stop()