## [NASA logs for July 1995](http://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html)

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [None]:
#!wget ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz
#!gunzip NASA_access_log_Jul95.gz

## Load logs into a RDD

In [3]:
sc = spark.sparkContext

nasa = sc.textFile("NASA_access_log_Jul95", 4)

### How many non-empty lines?

In [4]:
type(nasa)

pyspark.rdd.RDD

In [10]:
nasa.count()

1891715

In [40]:
nasa.take(5)

['199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245',
 'unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985',
 '199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085',
 'burger.letters.com - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/countdown/liftoff.html HTTP/1.0" 304 0',
 '199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0" 200 4179']

In [11]:
nasa.filter(lambda line: len(line) > 0).count()

1891715

### Create a function to parse Apache logs and, at the same time, counts malformed lines

In [37]:
import re
# host unused user time method resource status size referer agent 
pattern = '^(\S+) \S+ (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+)\s?(\S+)?\s?(\S+)?" (\d{3}|-) (\d+|-)\s?"?([^"]*)"?\s?"?([^"]*)?"?$'
weblog = re.compile(pattern=pattern)

malformedCount = sc.accumulator(0)

# TIP: you can use Accumulator to count lines
# Make sure you return an empty string ('') 
# whenever you find a malformed line

def process_logs(line):
    try:
        return weblog.findall(line)[0]
    except IndexError:
        malformedCount.add(1)
        return ''
    
nasa.foreach(process_logs)
print(malformedCount)

1902


In [25]:
# Taking one row
print(nasa.take(1)[0])

199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245


In [33]:
# Applies parsing function and 
parsed_nasa = nasa.map(process_logs)
# Check how many empty lines you got
parsed_nasa.filter(lambda line: line == '').count()

1902

In [None]:
# Check the accumulator value
# Both values should match

### Create a function to convert a tuple into a Row object

In [39]:
import time, datetime

def parse_time(str_time):
    tt = time.strptime(str_time[:-6], "%d/%b/%Y:%H:%M:%S")
    return datetime.datetime(*tt[:6])

In [43]:
from pyspark.sql import Row

# The size of the response in the logs may be '-' sometimes
# Create a function that converts the size to an integer value
# and returns -1 for invalid sizes, while also keeping track
# of how many times it happened in the NO_SIZE variable
# (hint: Accumulator)

no_size = sc.accumulator(0)

def to_int(value):
    try:
        value = int(value)
    except:
        value = -1
        no_size.add(1)
    return value

# Leveraging on the previous function, that converts a line
# from the logs into a tuple using RegEx, populate a Row object
# to make it ready to turn into a DataFrame

def log_to_row(line):
    fields = process_logs(line)
    if fields == '':
        fields = [''] * 10
        fields[2] = '01/Jan/1900:00:00:00 -0000'
        fields[6] = 0

    # INSERT CODE HERE
    # Create a row object with all the fields
    # host, user, time, method, resource, protocol, status, size, referer, agent
    row = Row(
        host=fields[0], 
              user=fields[1], 
              time=parse_time(fields[2]),
              method=fields[3], 
              resource=fields[4], 
              protocol=fields[5],
              status=int(fields[6]),
              size=to_int(fields[7]),
              referer=fields[8], 
              agent=fields[9]
    )
    
    return row

In [44]:
log_to_row(nasa.take(1)[0])

Row(agent='', host='199.72.81.55', method='GET', protocol='HTTP/1.0', referer='', resource='/history/apollo/', size=6245, status=200, time=datetime.datetime(1995, 7, 1, 0, 0, 1), user='-')

### Create a DataFrame

In [49]:
# Use the original RDD (nasa), apply your function to it
# and create a DataFrame
df = spark.createDataFrame(nasa.map(log_to_row))

In [51]:
df.limit(4).toPandas()

Unnamed: 0,agent,host,method,protocol,referer,resource,size,status,time,user
0,,199.72.81.55,GET,HTTP/1.0,,/history/apollo/,6245,200,1995-07-01 00:00:01,-
1,,unicomp6.unicomp.net,GET,HTTP/1.0,,/shuttle/countdown/,3985,200,1995-07-01 00:00:06,-
2,,199.120.110.21,GET,HTTP/1.0,,/shuttle/missions/sts-73/mission-sts-73.html,4085,200,1995-07-01 00:00:09,-
3,,burger.letters.com,GET,HTTP/1.0,,/shuttle/countdown/liftoff.html,0,304,1995-07-01 00:00:11,-


In [None]:
df.count()

In [52]:
# How many lines without size?
no_size.value

21557

In [53]:
# How many lines without size?
# Both values should match
df.filter('size == -1').count()

21557

### Which hosts were responsible for the largest amount of traffic?

In [56]:
from pyspark.sql import functions as F

df.groupBy('host').agg({"size":"SUM"}).orderBy("sum(size)", ascending=False).show()

+--------------------+---------+
|                host|sum(size)|
+--------------------+---------+
|piweba3y.prodigy.com|433605413|
|piweba1y.prodigy.com|261097448|
|piweba4y.prodigy.com|250619786|
|  alyssa.prodigy.com|209657052|
|         news.ti.com|172264955|
|piweba2y.prodigy.com|159858665|
|      163.206.137.21|138230468|
|   poppy.hensa.ac.uk|123475475|
|  e659229.boeing.com|123248242|
|www-d4.proxy.aol.com|117296962|
|www-b5.proxy.aol.com|111133037|
|www-a2.proxy.aol.com|109363240|
|www-b3.proxy.aol.com|106341593|
|www-b2.proxy.aol.com|105828128|
|    webgate1.mot.com|105503472|
|        163.206.89.4|104977983|
|       198.133.29.18|104445299|
|www-a1.proxy.aol.com|102951146|
|www-b4.proxy.aol.com|101836146|
|www-d3.proxy.aol.com|100341935|
+--------------------+---------+
only showing top 20 rows



### Which resources were the most requested?

In [63]:
df.groupBy('resource').agg({'*': 'count'}).orderBy('count(1)',ascending=False).show()

+--------------------+--------+
|            resource|count(1)|
+--------------------+--------+
|/images/NASA-logo...|  111330|
|/images/KSC-logos...|   89638|
|/images/MOSAIC-lo...|   60467|
|/images/USA-logos...|   60013|
|/images/WORLD-log...|   59488|
|/images/ksclogo-m...|   58801|
|/images/launch-lo...|   40871|
| /shuttle/countdown/|   40276|
|           /ksc.html|   40223|
|/images/ksclogosm...|   33585|
|                   /|   32838|
|/history/apollo/i...|   31072|
|/shuttle/missions...|   24864|
|  /htbin/cdt_main.pl|   22626|
|/shuttle/countdow...|   22216|
|/shuttle/countdow...|   21996|
|/shuttle/countdow...|   20956|
|/images/launchmed...|   20812|
|/shuttle/missions...|   19852|
|/shuttle/missions...|   18159|
+--------------------+--------+
only showing top 20 rows



In [67]:
df.rdd.take(5)

[Row(agent='', host='199.72.81.55', method='GET', protocol='HTTP/1.0', referer='', resource='/history/apollo/', size=6245, status=200, time=datetime.datetime(1995, 7, 1, 0, 0, 1), user='-'),
 Row(agent='', host='unicomp6.unicomp.net', method='GET', protocol='HTTP/1.0', referer='', resource='/shuttle/countdown/', size=3985, status=200, time=datetime.datetime(1995, 7, 1, 0, 0, 6), user='-'),
 Row(agent='', host='199.120.110.21', method='GET', protocol='HTTP/1.0', referer='', resource='/shuttle/missions/sts-73/mission-sts-73.html', size=4085, status=200, time=datetime.datetime(1995, 7, 1, 0, 0, 9), user='-'),
 Row(agent='', host='burger.letters.com', method='GET', protocol='HTTP/1.0', referer='', resource='/shuttle/countdown/liftoff.html', size=0, status=304, time=datetime.datetime(1995, 7, 1, 0, 0, 11), user='-'),
 Row(agent='', host='199.120.110.21', method='GET', protocol='HTTP/1.0', referer='', resource='/shuttle/missions/sts-73/sts-73-patch-small.gif', size=4179, status=200, time=dat

In [68]:
sc.stop()