# Data processing course assignments
---

In [1]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import asc, lit, udf
from pyspark.sql.types import *
import re

In [2]:
conf = SparkConf().setAppName('Assignment').setMaster('local')
spark = SparkContext(conf=conf)

## Ejercicio 1.
#### Leer el archivo data/containers.csv y contar el número de líneas.
---

In [3]:
path = "/home/jovyan/work/data_processing_course-master/assignments/data/containers.csv"
csvdata = spark.textFile(path) #.option("header", "false").load(path)

In [4]:
csvdata.take(2)

['ship_imo;ship_name;country;departure;container_id;container_type;container_group;net_weight;gross_weight;owner;declared;contact;customs_ok',
 'AMC1861710;Jayden;BD;201602183;FCUK1755843;4960;28VH;44804866.62;2240243.33;Streich-Wilkinson;Music, Tools, Automotive & Health;octavia@stammbednar.name;true']

In [5]:
csvdata.count()

614

## Ejercicio 2.
#### Leer el archivo data/containers.csv y filtrar aquellos contenedores cuyo ship_imo es DEJ1128330 y el grupo del contenedor es 22P1. Guardar los resultados en un archivo de texto en resultados/resutado_2.
---

In [6]:
print(type(csvdata))
datardd = csvdata
print(type(datardd))

datasplit = datardd.map(lambda row: row.split(";"))

datasplit.take(2)

<class 'pyspark.rdd.RDD'>
<class 'pyspark.rdd.RDD'>


[['ship_imo',
  'ship_name',
  'country',
  'departure',
  'container_id',
  'container_type',
  'container_group',
  'net_weight',
  'gross_weight',
  'owner',
  'declared',
  'contact',
  'customs_ok'],
 ['AMC1861710',
  'Jayden',
  'BD',
  '201602183',
  'FCUK1755843',
  '4960',
  '28VH',
  '44804866.62',
  '2240243.33',
  'Streich-Wilkinson',
  'Music, Tools, Automotive & Health',
  'octavia@stammbednar.name',
  'true']]

In [7]:
datasplit.map(lambda row: row[0]).take(10)

['ship_imo',
 'AMC1861710',
 'POG1615575',
 'SQH1155999',
 'JCI1797526',
 'MBV1836745',
 'GYR1192020',
 'GLV1922612',
 'NLH1771681',
 'FUS1202266']

In [8]:
datasplit.filter(lambda row: row[6] == '22P1' and row[0] == 'DEJ1128330').count()

2

In [9]:
datasplit.filter(lambda row: row[6] == '22P1' and row[0] == 'DEJ1128330').take(2)

[['DEJ1128330',
  'Tiara',
  'GP',
  '2016021818',
  'GYFD1228113',
  '20PF',
  '22P1',
  '51503716.88',
  '5150371.69',
  'Armstrong-Goldner',
  'Automotive, Sports, Games & Clothing',
  'caria@cronin.io',
  'true'],
 ['DEJ1128330',
  'Tiara',
  'GP',
  '2016021818',
  'MBPF1909627',
  '24H2',
  '22P1',
  '37266600.88',
  '1863330.04',
  'Lehner-Hamill',
  'Jewelery, Automotive, Games & Electronics',
  'phoebe@volkman.net',
  'true']]

Guardando los datos como archivo de texto

In [10]:
x1 = datasplit.filter(lambda row: row[6] == '22P1' and row[0] == 'DEJ1128330')
x1.saveAsTextFile("/home/jovyan/work/data_processing_course-master/assignments/data/1.csv")

## Ejercicio 3.
#### Leer el archivo data/containers.csv y convertir a formato Parquet. Recuerda que puedes hacer uso de la funcion parse_container en helpers.py tal y como vimos en clase. Guarda los resultados en resultados/resultado_3.
---

In [34]:
path = "/home/jovyan/work/data_processing_course-master/assignments/data/containers.csv" #Usamos el mismo path creado previamente

In [35]:
sqlContext = SQLContext(spark)

In [38]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header = "true", sep = ';').load(path)

In [47]:
df.first()

Row(ship_imo='AMC1861710', ship_name='Jayden', country='BD', departure='201602183', container_id='FCUK1755843', container_type='4960', container_group='28VH', net_weight='44804866.62', gross_weight='2240243.33', owner='Streich-Wilkinson', declared='Music, Tools, Automotive & Health', contact='octavia@stammbednar.name', customs_ok='true')

In [42]:
df.write.parquet("ejercicios_3.parquet")

## Ejercicio 4.
#### Lee el archivo de Parquet guardado en el ejercicio 3 y filtra los barcos que tienen al menos un contenedor donde la columna customs_ok es igual a false. Extrae una lista con los identificadores de barco, ship_imo, sin duplicados y ordenados alfabéticamente, en formato json.
---

In [48]:
data4 = sqlContext.read.parquet("ejercicios_3.parquet")
data4.registerTempTable("parquetFile")

In [50]:
sqlContext.sql("SELECT DISTINCT ship_imo FROM parquetFile WHERE customs_ok = 'false'").show()

+----------+
|  ship_imo|
+----------+
|KSP1096387|
|GYR1192020|
|JET1053895|
|SQH1155999|
|NLH1771681|
|JCI1797526|
|GEU1548633|
|AEY1108363|
|IWE1254579|
|AMC1861710|
|POG1615575|
|MBV1836745|
|GLV1922612|
|YZX1455509|
|TCU1641123|
|JMP1637582|
|DEJ1128330|
|RYP1117603|
|FUS1202266|
|NCZ1777367|
+----------+



In [52]:
data4 = sqlContext.sql("SELECT DISTINCT ship_imo FROM parquetFile WHERE customs_ok = 'false'")

In [54]:
sqlContext.sql("SELECT DISTINCT ship_imo FROM parquetFile WHERE customs_ok = 'false'").count()

20

In [55]:
data4.sort(asc("ship_imo")).collect()

[Row(ship_imo='AEY1108363'),
 Row(ship_imo='AMC1861710'),
 Row(ship_imo='DEJ1128330'),
 Row(ship_imo='FUS1202266'),
 Row(ship_imo='GEU1548633'),
 Row(ship_imo='GLV1922612'),
 Row(ship_imo='GYR1192020'),
 Row(ship_imo='IWE1254579'),
 Row(ship_imo='JCI1797526'),
 Row(ship_imo='JET1053895'),
 Row(ship_imo='JMP1637582'),
 Row(ship_imo='KSP1096387'),
 Row(ship_imo='MBV1836745'),
 Row(ship_imo='NCZ1777367'),
 Row(ship_imo='NLH1771681'),
 Row(ship_imo='POG1615575'),
 Row(ship_imo='RYP1117603'),
 Row(ship_imo='SQH1155999'),
 Row(ship_imo='TCU1641123'),
 Row(ship_imo='YZX1455509')]

In [58]:
data4.sort(asc("ship_imo")).toJSON("data_processing_course-master/assignments/resultados/resultado_4")

MapPartitionsRDD[176] at toJavaRDD at NativeMethodAccessorImpl.java:0