In [141]:
from apachelogs import LogParser
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf

In [142]:
sc.stop() # Uncomment to terminate previously running SparkContexts if there are any 
sc = SparkContext()
rdd = sc.textFile("logfiles.log")
df = (rdd.filter(lambda x: len(x) > 0))

In [143]:
# Partition data into 5 groups based on status 
informational_1 = df.filter(lambda x: x.split()[8].startswith("1")) # Entries with status 100–199
successful_2 = df.filter(lambda x: x.split()[8].startswith("2")) # Entries with status 200–299
redirection_3 = df.filter(lambda x: x.split()[8].startswith("3")) # Entries with status 300–399
clienterr_4 = df.filter(lambda x: x.split()[8].startswith("4")) # Entries with status 400–499
servererr_5 = df.filter(lambda x: x.split()[8].startswith("5")) # Entries with status 500–599

In [144]:
# Print number of responses in each category 
print("Informational: ", informational_1.count())
print("Successful: ", successful_2.count())
print("Redirection: ", redirection_3.count())
print("Client error: ", clienterr_4.count())
print("Server error: ", servererr_5.count())


                                                                                

Informational:  0
Successful:  142564
Redirection:  285598
Client error:  285553
Server error:  286285


In [145]:
# Calculate percentages
sum = informational_1.count() + successful_2.count() + redirection_3.count() + clienterr_4.count() + servererr_5.count()
percent_1 = informational_1.count() / sum * 100
percent_2 = successful_2.count() / sum * 100 
percent_3 = redirection_3.count() / sum * 100 
percent_4 = clienterr_4.count() / sum * 100 
percent_5 = servererr_5.count() / sum * 100 

                                                                                

In [146]:
# Print percentage by response category 
print("Informational: ", percent_1, "%")
print("Successful: ", percent_2, "%")
print("Redirection: ", percent_3, "%")
print("Client error: ", percent_4, "%")
print("Server error: ", percent_5, "%")

Informational:  0.0 %
Successful:  14.2564 %
Redirection:  28.559800000000003 %
Client error:  28.5553 %
Server error:  28.628500000000003 %


### Top 5 IP addresses generating client error

In [147]:
ID_client_err = clienterr_4.map(lambda x: x.split()[0]) # Get IP only 
ip_count = ID_client_err.map(lambda x: (x, 1)) # Mapper (add count)
ip_count = ip_count.reduceByKey(lambda x, y: x + y) # Reducer 
ip_most = ip_count.sortBy(lambda x: x[1], False) # Sort by descending count
ip_most.take(5) # Show top 5 

                                                                                

[('122.155.216.51', 2),
 ('175.115.37.123', 2),
 ('40.237.64.134', 2),
 ('32.188.156.161', 2),
 ('133.203.25.240', 2)]