In [1]:
# Parse the access log of the NASA Kennedy Space Center WWW server in Florida to extract meaningful insights from the data.
from pyspark.sql import Row
import re

# Regex for one access-log line. Groups read by parseLogLine:
#   4 - the bracketed timestamp, ending in a signed 4-digit offset
#   6 - the second token inside the quoted request (used as the URL)
#   8 - the 3-digit status code
# The remaining groups are captured but unused here (presumably host, identity,
# user, method, protocol and response size - confirm against the log format).
# Written as a raw string so regex escapes such as \S, \[ and \d are not
# interpreted as (invalid) Python string escapes, which triggers warnings on
# recent Python versions. The resulting pattern text is unchanged.
PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)(.*)" (\d{3}) (\S+)'

def parseLogLine(log):
    """Parse a single access-log line into zero or one ``Row``.

    Args:
        log: One line of text from the access log.

    Returns:
        A one-element list containing ``Row(timeStamp, url, httpCode)`` when
        the line matches ``PATTERN`` (``httpCode`` is converted to ``int``),
        or an empty list when it does not. Returning a list lets callers use
        ``flatMap`` so that unparseable lines are silently dropped.
    """
    m = re.match(PATTERN, log)
    if m is None:
        # Malformed line: contribute nothing to the flatMap output.
        return []
    return [Row(timeStamp=m.group(4), url=m.group(6), httpCode=int(m.group(8)))]

from pyspark.sql import SparkSession

# Obtain (or reuse) the Spark session and its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("PythonWordCount")
    .getOrCreate()
)
sc = spark.sparkContext

# Load the log file as lines of text.
logFile = sc.textFile("/data/spark/project/NASA_access_log_Aug95.gz")

# parseLogLine returns an empty list for lines it cannot parse, so flatMap
# keeps only the successfully parsed rows.
accessLog = logFile.flatMap(parseLogLine)
accessDf = spark.createDataFrame(accessLog)
accessDf.printSchema()

# Register the parsed rows as the "nasalog" view, then expose the full
# selection from it under the "nasa_log" view name as well.
accessDf.createOrReplaceTempView("nasalog")
output = spark.sql("select * from nasalog")
output.createOrReplaceTempView("nasa_log")

# Problem 1 Top 10 requested URLs:
# Only URLs containing "html" (case-insensitive) are counted.
spark.sql("select url,count(*) as req_cnt from nasa_log where upper(url) like '%HTML%' group by url order by req_cnt desc LIMIT 10").show()

# Problem 2 Top 5 time frames for high traffic:
# A time frame is the first 14 characters of the timestamp (presumably
# dd/Mon/yyyy:HH, i.e. one bucket per hour - confirm against the data).
# Busiest frames first, so the count must be sorted in DESCENDING order.
spark.sql("select substr(timeStamp,1,14) as timeFrame,count(*) as req_cnt from nasa_log group by substr(timeStamp,1,14) order by req_cnt desc LIMIT 5").show()

# Problem 3 Top 5 time frames for least traffic:
# Quietest frames first, so the count must be sorted in ASCENDING order.
spark.sql("select substr(timeStamp,1,14) as timeFrame,count(*) as req_cnt from nasa_log group by substr(timeStamp,1,14) order by req_cnt asc LIMIT 5").show()

# Problem 4 Find HTTP codes:
# Number of requests per HTTP status code.
spark.sql("select httpCode,count(*) as req_cnt from nasa_log group by httpCode ").show()


ModuleNotFoundError: No module named 'pyspark'