## [NASA logs for July 1995](http://ita.ee.lbl.gov/html/contrib/NASA-HTTP.html)

In [1]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [2]:
# One-time setup: uncomment the two lines below to download and decompress the log file.
# !wget ftp://ita.ee.lbl.gov/traces/NASA_access_log_Jul95.gz
# !gunzip NASA_access_log_Jul95.gz

## Load logs into an RDD

In [3]:
# SparkContext gives access to the RDD API.
sc = spark.sparkContext
# Read the decompressed log as an RDD of text lines.
nasa = sc.textFile('NASA_access_log_Jul95')

### How many non-empty lines?

In [4]:
# Keep only lines that contain at least one character, then count them.
(
    nasa
    .filter(lambda text: len(text) != 0)
    .count()
)

1891715

### Create a function to parse Apache logs and, at the same time, count malformed lines

In [5]:
import re
# Capture groups, in order:
#   host, user, time, method, resource, protocol, status, size, referer, agent
# The second whitespace-delimited field of the line is matched but not captured.
# The pattern is a raw string because regex escapes such as \S, \[ and \d are not
# valid Python string escapes and trigger invalid-escape warnings in a normal literal.
pattern = r'^(\S+) \S+ (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+)\s?(\S+)?\s?(\S+)?" (\d{3}|-) (\d+|-)\s?"?([^"]*)"?\s?"?([^"]*)?"?$'
weblog = re.compile(pattern=pattern)

# Accumulator, starting at 0, that process_logs() increments for every line
# the `weblog` pattern does not match.
bad_line = spark.sparkContext.accumulator(0)

def process_logs(line):
    """Split one raw log line into its captured fields.

    Returns the tuple of groups captured by ``weblog`` for the first match in
    ``line``. When the pattern does not match, 1 is added to the ``bad_line``
    accumulator and the empty string is returned as a "malformed" marker.

    The counter is bumped on every call with a non-matching line, so running
    this function over the same data more than once keeps adding to it.
    """
    matches = weblog.findall(line)
    if not matches:
        bad_line.add(1)
        return ''
    return matches[0]

In [6]:
# Peek at the first raw line to see what the log format looks like.
print(nasa.take(1)[0])

199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245


In [7]:
# process_logs() returns '' for lines it cannot parse, so counting the ''
# results gives the number of malformed lines.
(
    nasa
    .map(process_logs)
    .filter(lambda parsed: parsed == '')
    .count()
)

1902

In [8]:
# Running total of lines rejected by process_logs().
bad_line.value

1902

### Create a function to convert a tuple into a Row object

In [9]:
import time, datetime

def parse_time(str_time):
    """Turn a log timestamp into a naive ``datetime.datetime``.

    The last six characters of ``str_time`` (the " -0400" part of
    "01/Jul/1995:00:00:01 -0400") are dropped before parsing, so the UTC
    offset is ignored and the result carries no timezone information.
    """
    without_offset = str_time[:-6]
    return datetime.datetime.strptime(without_offset, "%d/%b/%Y:%H:%M:%S")

In [10]:
from pyspark.sql import Row

# Accumulator, starting at 0, that to_int() increments each time it falls back to -1.
no_size = spark.sparkContext.accumulator(0)

def to_int(value):
    """Convert ``value`` to ``int``, falling back to -1 when it is not a number.

    Each fallback also adds 1 to the ``no_size`` accumulator, so the number of
    unparseable values can be read back from ``no_size.value``.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures count as "no size". A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit and hide unrelated bugs.
        no_size.add(1)
        return -1

def log_to_row(line):
    """Parse one raw log line into a ``Row``.

    The line is split with ``process_logs``. A line it rejects (returned as
    ``''``) still yields a Row: every text field is empty, the time is
    1900-01-01 00:00:00, the status is 0 and the size goes through ``to_int('')``,
    i.e. becomes -1 and is counted in ``no_size``.

    ``status`` is 0 as well when the log records it as ``-``; ``size`` is -1
    whenever it is not a number (see ``to_int``).
    """
    fields = process_logs(line)
    if fields == '':
        # Placeholder record for a malformed line.
        fields = [''] * 10
        fields[2] = '01/Jan/1900:00:00:00 -0000'
        fields[6] = 0

    (host, user, timestamp, method, resource,
     protocol, status, size, referer, agent) = fields

    return Row(host=host,
               user=user,
               time=parse_time(timestamp),
               method=method,
               resource=resource,
               protocol=protocol,
               # The log pattern also accepts '-' as status; int('-') would
               # raise ValueError and abort the whole job, so map it to the
               # same "no status" value used for malformed lines.
               status=0 if status == '-' else int(status),
               size=to_int(size),
               referer=referer,
               agent=agent)

In [11]:
# Try the converter on the first raw line before applying it to the whole RDD.
log_to_row(nasa.take(1)[0])

Row(agent='', host='199.72.81.55', method='GET', protocol='HTTP/1.0', referer='', resource='/history/apollo/', size=6245, status=200, time=datetime.datetime(1995, 7, 1, 0, 0, 1), user='-')

### Create a DataFrame

In [12]:
# One Row per log line; malformed lines become placeholder rows (see log_to_row).
df = spark.createDataFrame(nasa.map(log_to_row))

In [13]:
# Preview the first rows of the parsed log.
df.show()

+-----+--------------------+------+--------+-------+--------------------+-----+------+-------------------+----+
|agent|                host|method|protocol|referer|            resource| size|status|               time|user|
+-----+--------------------+------+--------+-------+--------------------+-----+------+-------------------+----+
|     |        199.72.81.55|   GET|HTTP/1.0|       |    /history/apollo/| 6245|   200|1995-07-01 00:00:01|   -|
|     |unicomp6.unicomp.net|   GET|HTTP/1.0|       | /shuttle/countdown/| 3985|   200|1995-07-01 00:00:06|   -|
|     |      199.120.110.21|   GET|HTTP/1.0|       |/shuttle/missions...| 4085|   200|1995-07-01 00:00:09|   -|
|     |  burger.letters.com|   GET|HTTP/1.0|       |/shuttle/countdow...|    0|   304|1995-07-01 00:00:11|   -|
|     |      199.120.110.21|   GET|HTTP/1.0|       |/shuttle/missions...| 4179|   200|1995-07-01 00:00:11|   -|
|     |  burger.letters.com|   GET|HTTP/1.0|       |/images/NASA-logo...|    0|   304|1995-07-01 00:00:1

In [14]:
# Total number of rows, including the placeholder rows created for malformed lines.
df.count()

1891715

In [15]:
# Running total of to_int() fallbacks, i.e. size values that were not numbers.
no_size.value

21557

In [16]:
# Rows whose size is the -1 fallback value produced by to_int().
df.where('size == -1').count()

21557

### Which hosts were responsible for the largest amount of traffic?

In [17]:
from pyspark.sql import functions as F

# Total bytes served per host, largest first.
# `size` is -1 when the byte count could not be parsed (see to_int). Summing
# the raw column would subtract 1 from a host's total for each such row, so
# those rows are counted as 0 bytes instead.
(
    df.groupby('host')
    .agg(F.sum(F.when(F.col('size') > 0, F.col('size')).otherwise(0)).alias('total_size'))
    .orderBy(F.desc('total_size'))
    .show()
)

+--------------------+----------+
|                host|total_size|
+--------------------+----------+
|piweba3y.prodigy.com| 433605413|
|piweba1y.prodigy.com| 261097448|
|piweba4y.prodigy.com| 250619786|
|  alyssa.prodigy.com| 209657052|
|         news.ti.com| 172264955|
|piweba2y.prodigy.com| 159858665|
|      163.206.137.21| 138230468|
|   poppy.hensa.ac.uk| 123475475|
|  e659229.boeing.com| 123248242|
|www-d4.proxy.aol.com| 117296962|
|www-b5.proxy.aol.com| 111133037|
|www-a2.proxy.aol.com| 109363240|
|www-b3.proxy.aol.com| 106341593|
|www-b2.proxy.aol.com| 105828128|
|    webgate1.mot.com| 105503472|
|        163.206.89.4| 104977983|
|       198.133.29.18| 104445299|
|www-a1.proxy.aol.com| 102951146|
|www-b4.proxy.aol.com| 101836146|
|www-d3.proxy.aol.com| 100341935|
+--------------------+----------+
only showing top 20 rows



### Which resources were the most requested?

In [18]:
# The 20 most requested resources, returned as a pandas DataFrame.
(
    df.groupby('resource')
    .count()
    .orderBy(F.desc('count'))
    .limit(20)
    .toPandas()
)

Unnamed: 0,resource,count
0,/images/NASA-logosmall.gif,111330
1,/images/KSC-logosmall.gif,89638
2,/images/MOSAIC-logosmall.gif,60467
3,/images/USA-logosmall.gif,60013
4,/images/WORLD-logosmall.gif,59488
5,/images/ksclogo-medium.gif,58801
6,/images/launch-logo.gif,40871
7,/shuttle/countdown/,40276
8,/ksc.html,40223
9,/images/ksclogosmall.gif,33585
