# desafioSemantix

### Desafio Técnico - Engenheiro de Dados 

### Leonardo Damasio

## Parte Prática

### 0. Setup

#### Importando bibliotecas

In [1]:
from pyspark.sql import SparkSession
import pandas as pd

#### Configurando SparkSession

In [2]:
spark = SparkSession.builder \
   .appName("desafioSemantix") \
   .config("spark.executor.memory", "4gb") \
   .getOrCreate()

#### Configurando SparkContext

In [3]:
sc = spark.sparkContext

#### Importando RDDs

In [4]:
jul95 = sc.textFile('data/access_log_Jul95')
aug95 = sc.textFile('data/access_log_Aug95')

#### Unificando em uma única RDD

In [5]:
rdd = jul95 + aug95
rdd.cache()

UnionRDD[4] at union at <unknown>:0

#### Amostra

In [6]:
rdd.take(10)

['199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245',
 'unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985',
 '199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085',
 'burger.letters.com - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/countdown/liftoff.html HTTP/1.0" 304 0',
 '199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0" 200 4179',
 'burger.letters.com - - [01/Jul/1995:00:00:12 -0400] "GET /images/NASA-logosmall.gif HTTP/1.0" 304 0',
 'burger.letters.com - - [01/Jul/1995:00:00:12 -0400] "GET /shuttle/countdown/video/livevideo.gif HTTP/1.0" 200 0',
 '205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] "GET /shuttle/countdown/countdown.html HTTP/1.0" 200 3985',
 'd104.aa.net - - [01/Jul/1995:00:00:13 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985',
 '129.94.144.152 - - [01/Jul/

### 1. Número de hosts únicos.

#### Realizando contagem

In [7]:
hosts = rdd.map(lambda line: line.split(" ")[0]) \
    .map(lambda value: (value, 1)) \
    .reduceByKey(lambda a,b: a+b) 

hosts.cache()

PythonRDD[10] at RDD at PythonRDD.scala:53

#### Amostra da contagem

In [8]:
hosts.take(10)

[('unicomp6.unicomp.net', 14),
 ('dial22.lloyd.com', 4),
 ('www-a1.proxy.aol.com', 6661),
 ('dave.dev1.ihub.com', 4),
 ('brandt.xensei.com', 80),
 ('dnet018.sat.texas.net', 71),
 ('166.79.67.111', 17),
 ('dynip38.efn.org', 17),
 ('piweba1y.prodigy.com', 12825),
 ('oahu-53.u.aloha.net', 7)]

#### Resultado

In [9]:
Q1 = hosts.count()
Q1

137979

### 2. O total de erros 404.

In [10]:
error404 = rdd.filter(lambda line: " 404 -"  in line or " 404 0" in line)

error404.cache()

PythonRDD[13] at RDD at PythonRDD.scala:53

#### Resultado

In [11]:
Q2 = error404.count()
Q2

20901

### 3. Os 5 URLs que mais causaram erro 404.

In [12]:
top5urls = rdd.filter(lambda line: line.split(" ")[0] and " 404 -"  in line or " 404 0" in line) \
    .map(lambda line: line.split(" ")[0]) \
    .map(lambda value: (value, 1)) \
    .reduceByKey(lambda a,b: a+b) \
    .sortBy(lambda x: x[1], ascending=False)

top5urls.cache()

PythonRDD[25] at RDD at PythonRDD.scala:53

#### Resultado

In [13]:
Q3 = top5urls.take(5)
Q3

[('hoohoo.ncsa.uiuc.edu', 251),
 ('piweba3y.prodigy.com', 157),
 ('jbiagioni.npt.nuwc.navy.mil', 132),
 ('piweba1y.prodigy.com', 114),
 ('www-d4.proxy.aol.com', 91)]

### 4. Quantidade de erros 404 por dia.

In [14]:
def func404(line):
    try: 
        code = line.split(" ")[-2]
        if code == "404": 
            return True
    except: 
        pass
    return False

error404_jul95 = jul95.filter(func404) \
    .map(lambda line: line.split("[")[1].split(':')[0]) \
    .map(lambda value: (value, 1)) \
    .reduceByKey(lambda a,b: a+b) \
    .sortBy(lambda x: x[0])

error404_aug95 = aug95.filter(lambda line: line.split("[")[1].split(":")[0] and " 404 -"  in line or " 404 0" in line ) \
    .map(lambda line: line.split("[")[1].split(':')[0]) \
    .map(lambda value: (value, 1)) \
    .reduceByKey(lambda a,b: a+b) \
    .sortBy(lambda x: x[0])

error404daily = error404_jul95 + error404_aug95

error404daily.cache()

UnionRDD[49] at union at <unknown>:0

#### Resultado

In [15]:
Q4 = error404daily.collect()
Q4

[('01/Jul/1995', 316),
 ('02/Jul/1995', 291),
 ('03/Jul/1995', 474),
 ('04/Jul/1995', 359),
 ('05/Jul/1995', 497),
 ('06/Jul/1995', 640),
 ('07/Jul/1995', 570),
 ('08/Jul/1995', 302),
 ('09/Jul/1995', 348),
 ('10/Jul/1995', 398),
 ('11/Jul/1995', 471),
 ('12/Jul/1995', 471),
 ('13/Jul/1995', 532),
 ('14/Jul/1995', 413),
 ('15/Jul/1995', 254),
 ('16/Jul/1995', 257),
 ('17/Jul/1995', 406),
 ('18/Jul/1995', 465),
 ('19/Jul/1995', 639),
 ('20/Jul/1995', 428),
 ('21/Jul/1995', 334),
 ('22/Jul/1995', 192),
 ('23/Jul/1995', 233),
 ('24/Jul/1995', 328),
 ('25/Jul/1995', 461),
 ('26/Jul/1995', 336),
 ('27/Jul/1995', 336),
 ('28/Jul/1995', 94),
 ('01/Aug/1995', 243),
 ('03/Aug/1995', 304),
 ('04/Aug/1995', 346),
 ('05/Aug/1995', 236),
 ('06/Aug/1995', 373),
 ('07/Aug/1995', 537),
 ('08/Aug/1995', 391),
 ('09/Aug/1995', 279),
 ('10/Aug/1995', 315),
 ('11/Aug/1995', 263),
 ('12/Aug/1995', 196),
 ('13/Aug/1995', 216),
 ('14/Aug/1995', 287),
 ('15/Aug/1995', 327),
 ('16/Aug/1995', 259),
 ('17/Aug/19

### 5. O total de bytes retornados.

In [16]:
def funcbytes(line):
    try: 
        nbytes = int(line.split(" ")[-1])
        return True
    except: 
        pass
    return False

totalbytes = rdd.filter(funcbytes) \
    .map(lambda line: line.split(" ")[-1]) \
    .map(lambda num: int(num)) \
    .reduce(lambda a,b: a+b)

#### Resultado

In [17]:
Q5 = totalbytes
Q5

65524314915

### Resumo das respostas:

In [18]:
print("1. Número de hosts únicos.\n", Q1, "\n")

print("2. O total de erros 404.\n", Q2, "\n")

print("3. Os 5 URLs que mais causaram erro 404.\n")
Q3_df = pd.DataFrame(Q3, columns=["URLs","Qtd. erros 404"])
print(Q3_df)
print("\n")

print("4. Quantidade de erros 404 por dia.\n")
Q4_df = pd.DataFrame(Q4, columns=["Dia","Qtd. erros 404"])
print(Q4_df)
print("\n")

print("5. O total de bytes retornados.\n", Q5, "\n")

1. Número de hosts únicos.
 137979 

2. O total de erros 404.
 20901 

3. Os 5 URLs que mais causaram erro 404.

                          URLs  Qtd. erros 404
0         hoohoo.ncsa.uiuc.edu             251
1         piweba3y.prodigy.com             157
2  jbiagioni.npt.nuwc.navy.mil             132
3         piweba1y.prodigy.com             114
4         www-d4.proxy.aol.com              91


4. Quantidade de erros 404 por dia.

            Dia  Qtd. erros 404
0   01/Jul/1995             316
1   02/Jul/1995             291
2   03/Jul/1995             474
3   04/Jul/1995             359
4   05/Jul/1995             497
5   06/Jul/1995             640
6   07/Jul/1995             570
7   08/Jul/1995             302
8   09/Jul/1995             348
9   10/Jul/1995             398
10  11/Jul/1995             471
11  12/Jul/1995             471
12  13/Jul/1995             532
13  14/Jul/1995             413
14  15/Jul/1995             254
15  16/Jul/1995             257
16  17/Jul/1995       

### Encerrando a sessão

In [19]:
spark.stop()

#### Qualquer dúvida, favor entrar em contato:
#### leoleonardo1996@hotmail.com
#### (11) 98783-3949