# Data Formats

In [1]:
# Making sure to link pyspark to the right Spark folder with findspark
import findspark
from functools import wraps
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
findspark.init('/opt/spark')

In [2]:
conf = SparkConf().setAppName("data-formats")
sc = SparkContext(conf=conf)

In [3]:
spark = SparkSession(sc)

In [4]:
! hadoop fs -put ../datasets/20230515_000000.jsonl

23/08/10 00:54:33 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
put: `20230515_000000.jsonl': File exists


### JSONL

In [5]:
! hadoop fs -ls -h /user/root

23/08/10 00:54:35 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 7 items
drwxr-xr-x   - root supergroup          0 2023-08-10 00:54 /user/root/.sparkStaging
-rw-r--r--   2 root supergroup     46.8 M 2023-08-09 23:48 /user/root/20230515_000000.jsonl
drwxr-xr-x   - root supergroup          0 2023-08-09 23:53 /user/root/20230515_000000.parquet
drwxr-xr-x   - root supergroup          0 2023-08-09 23:45 /user/root/bus-api-small
drwxr-xr-x   - root supergroup          0 2023-08-10 00:39 /user/root/csv
drwxr-xr-x   - root supergroup          0 2023-08-10 00:43 /user/root/orc
drwxr-xr-x   - root supergroup          0 2023-08-10 00:38 /user/root/parquet


## Conversão pré-carregamento

In [6]:
# datahora         =>   integer
# datahoraenvio    =>   integer
# datahoraservidor =>   integer
# latitude         =>   float/double/numeric
# longitude        =>   float/double/numeric
# linha            =>   string
# ordem            =>   string
# velocidade       =>   integer

schema = StructType([
    StructField("datahora", IntegerType(), True),
    StructField("datahoraenvio", IntegerType(), True),
    StructField("datahoraservidor", IntegerType(), True),
    StructField("latitude", FloatType(), True),
    StructField("longitude", FloatType(), True),
    StructField("linha", StringType(), True),
    StructField("ordem", StringType(), True),
    StructField("velocidade", IntegerType(), True),
])

In [7]:
%%time
df = spark.read.schema(schema).json("/user/root/20230515_000000.jsonl")

CPU times: user 2.08 ms, sys: 1.59 ms, total: 3.67 ms
Wall time: 899 ms


In [8]:
df.printSchema()

root
 |-- datahora: integer (nullable = true)
 |-- datahoraenvio: integer (nullable = true)
 |-- datahoraservidor: integer (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- linha: string (nullable = true)
 |-- ordem: string (nullable = true)
 |-- velocidade: integer (nullable = true)



In [9]:
df.head()

Row(datahora=None, datahoraenvio=None, datahoraservidor=None, latitude=None, longitude=None, linha=None, ordem=None, velocidade=None)

Não funciona o carregamento passando o schema no caso de jsonl!<br/>
Valores são carregados como `None`

## Conversão pós-carregamento

In [10]:
%%time
df = spark.read.json("/user/root/20230515_000000.jsonl")

CPU times: user 1.72 ms, sys: 855 µs, total: 2.58 ms
Wall time: 764 ms


In [11]:
df.printSchema()

root
 |-- datahora: string (nullable = true)
 |-- datahoraenvio: string (nullable = true)
 |-- datahoraservidor: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- linha: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- ordem: string (nullable = true)
 |-- velocidade: string (nullable = true)



In [12]:
df.head()

Row(datahora='1684119592000', datahoraenvio='1684119600000', datahoraservidor='1684119620000', latitude='-22,87652', linha='LECD36', longitude='-43,36818', ordem='C51641', velocidade='37')

Valores todos carregados como `string`

Realizando a conversão

In [13]:
df = df.withColumn(
    "datahora", col("datahora").cast("long")
).withColumn(
    "datahoraenvio", col("datahoraenvio").cast("long")
).withColumn(
    "datahoraservidor", col("datahoraservidor").cast("long")
).withColumn(
    "latitude", regexp_replace(col("latitude"), ",", ".").cast("double")
).withColumn(
    "longitude", regexp_replace(col("longitude"), ",", ".").cast("double")
).withColumn(
    "velocidade", col("velocidade").cast("integer")
)

In [14]:
df.printSchema()

root
 |-- datahora: long (nullable = true)
 |-- datahoraenvio: long (nullable = true)
 |-- datahoraservidor: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- linha: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- ordem: string (nullable = true)
 |-- velocidade: integer (nullable = true)



In [15]:
df.head()

Row(datahora=1684119592000, datahoraenvio=1684119600000, datahoraservidor=1684119620000, latitude=-22.87652, linha='LECD36', longitude=-43.36818, ordem='C51641', velocidade=37)

Escrevendo em apenas um arquivo Parquet<br/>
Lembrando que devemos passar um diretório como Path para escrita

### Parquet

In [16]:
! hadoop fs -rm -r /user/root/parquet

23/08/10 00:54:40 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/10 00:54:40 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /user/root/parquet


In [17]:
%%time
df.coalesce(1).write.parquet("/user/root/parquet")

CPU times: user 1.86 ms, sys: 1.98 ms, total: 3.84 ms
Wall time: 1.52 s


In [18]:
! hadoop fs -ls -h /user/root/parquet

23/08/10 00:54:43 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
-rw-r--r--   2 root supergroup          0 2023-08-10 00:54 /user/root/parquet/_SUCCESS
-rw-r--r--   2 root supergroup      2.2 M 2023-08-10 00:54 /user/root/parquet/part-00000-affdba43-db1e-49a5-ab5f-40c1074d870b-c000.snappy.parquet


46.8 MB vs 2.2 MB

In [19]:
%%time
dfp = spark.read.parquet("/user/root/parquet")

CPU times: user 2.81 ms, sys: 1.42 ms, total: 4.23 ms
Wall time: 123 ms


In [20]:
dfp.printSchema()

root
 |-- datahora: long (nullable = true)
 |-- datahoraenvio: long (nullable = true)
 |-- datahoraservidor: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- linha: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- ordem: string (nullable = true)
 |-- velocidade: integer (nullable = true)



## Exportando de Parquet para demais tipos

### CSV

In [21]:
! hadoop fs -rm -r /user/root/csv

23/08/10 00:54:44 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/10 00:54:44 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /user/root/csv


In [22]:
%%time
dfp.coalesce(1).write.option("header", True).csv("/user/root/csv")

CPU times: user 4.32 ms, sys: 1.65 ms, total: 5.97 ms
Wall time: 678 ms


In [23]:
! hadoop fs -ls -h /user/root/csv

23/08/10 00:54:46 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
-rw-r--r--   2 root supergroup          0 2023-08-10 00:54 /user/root/csv/_SUCCESS
-rw-r--r--   2 root supergroup     17.2 M 2023-08-10 00:54 /user/root/csv/part-00000-778e7551-736b-4ae7-b961-e2dcebc59372-c000.csv


In [24]:
%%time
df = spark.read.option("header", True).csv("/user/root/csv")

CPU times: user 3.35 ms, sys: 1.15 ms, total: 4.49 ms
Wall time: 315 ms


In [25]:
df.printSchema()

root
 |-- datahora: string (nullable = true)
 |-- datahoraenvio: string (nullable = true)
 |-- datahoraservidor: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- linha: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- ordem: string (nullable = true)
 |-- velocidade: string (nullable = true)



### Avro

In [26]:
! hadoop fs -ls -h /user/root/avro

23/08/10 00:54:48 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
ls: `/user/root/avro': No such file or directory


In [27]:
dfp.coalesce(1).write.avro("/user/root/avro")

AttributeError: 'DataFrameWriter' object has no attribute 'avro'

Carregar dependência da biblioteca `Spark Avro`

### ORC

In [32]:
! hadoop fs -rm -r /user/root/orc

23/08/10 00:55:43 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/10 00:55:43 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.
Deleted /user/root/orc


In [33]:
%%time
dfp.coalesce(1).write.orc("/user/root/orc")

CPU times: user 3.63 ms, sys: 2.11 ms, total: 5.74 ms
Wall time: 1.05 s


In [34]:
! hadoop fs -ls -h /user/root/orc

23/08/10 00:55:45 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Found 2 items
-rw-r--r--   2 root supergroup          0 2023-08-10 00:55 /user/root/orc/_SUCCESS
-rw-r--r--   2 root supergroup      3.1 M 2023-08-10 00:55 /user/root/orc/part-00000-b757656e-3dfb-46a5-a766-4a2affbe4019-c000.snappy.orc


In [35]:
%%time
df = spark.read.orc("/user/root/orc")

CPU times: user 5.03 ms, sys: 3.62 ms, total: 8.65 ms
Wall time: 48.5 ms


In [36]:
df.printSchema()

root
 |-- datahora: long (nullable = true)
 |-- datahoraenvio: long (nullable = true)
 |-- datahoraservidor: long (nullable = true)
 |-- latitude: double (nullable = true)
 |-- linha: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- ordem: string (nullable = true)
 |-- velocidade: integer (nullable = true)



### Exemplo para debugging

In [38]:
! hadoop fs -cat /user/root/csv/*.csv | head -n 2

23/08/10 00:56:02 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
datahora,datahoraenvio,datahoraservidor,latitude,linha,longitude,ordem,velocidade
1684119592000,1684119600000,1684119620000,-22.87652,LECD36,-43.36818,C51641,37
cat: Unable to write to output stream.
