In [1]:
# Widen the classic-notebook container so wide DataFrame `.show()` tables
# are not wrapped. `display`/`HTML` are imported from the public
# `IPython.display` module; `IPython.core.display` is a deprecated import
# location for `display`.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os

# Ask spark-submit to pull the MySQL JDBC connector so the JDBC cells below
# can reach the database. Must be set before the JVM is started.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages mysql:mysql-connector-java:8.0.23 pyspark-shell"

# Location of the Spark distribution. An already-exported SPARK_HOME wins so
# the notebook runs on other machines; the original local path is the fallback.
SPARK_HOME = os.environ.get('SPARK_HOME', '/home/jose/Frameworks/spark-3.0.2-bin-hadoop2.7')

In [3]:
import findspark
findspark.init(SPARK_HOME)

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Local-mode Spark configuration: use every core of this machine and cap
# executor/driver memory and the size of results collected to the driver.
config = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('Spark Base')
    .setAll([
        ('spark.executor.memory', '2G'),
        ('spark.driver.memory', '2G'),
        ('spark.driver.maxResultSize', '1G'),
    ])
)

# getOrCreate() keeps this cell re-runnable: constructing SparkContext(conf=...)
# directly raises when a context is already active in the kernel, whereas the
# builder returns the existing session instead.
spark = SparkSession.builder.config(conf=config).getOrCreate()

# Keep `sc` available for cells that use the low-level context.
sc = spark.sparkContext

spark

## CSV

In [4]:
# Read the forest-fires CSV, taking column names from the header row and
# letting Spark infer column types.
# The separator option is spelled `sep`; the previous key `delimeter` was a
# typo that Spark silently ignored (the read only worked because the default
# separator is also a comma).
csv_df = spark.read \
              .options(sep=',') \
              .csv('../datasets/forestfires.csv', header=True, inferSchema=True)
csv_df.show()
csv_df.count()

+---+---+-----+---+----+-----+-----+----+----+---+----+----+----+
|  X|  Y|month|day|FFMC|  DMC|   DC| ISI|temp| RH|wind|rain|area|
+---+---+-----+---+----+-----+-----+----+----+---+----+----+----+
|  7|  5|  mar|fri|86.2| 26.2| 94.3| 5.1| 8.2| 51| 6.7| 0.0| 0.0|
|  7|  4|  oct|tue|90.6| 35.4|669.1| 6.7|18.0| 33| 0.9| 0.0| 0.0|
|  7|  4|  oct|sat|90.6| 43.7|686.9| 6.7|14.6| 33| 1.3| 0.0| 0.0|
|  8|  6|  mar|fri|91.7| 33.3| 77.5| 9.0| 8.3| 97| 4.0| 0.2| 0.0|
|  8|  6|  mar|sun|89.3| 51.3|102.2| 9.6|11.4| 99| 1.8| 0.0| 0.0|
|  8|  6|  aug|sun|92.3| 85.3|488.0|14.7|22.2| 29| 5.4| 0.0| 0.0|
|  8|  6|  aug|mon|92.3| 88.9|495.6| 8.5|24.1| 27| 3.1| 0.0| 0.0|
|  8|  6|  aug|mon|91.5|145.4|608.2|10.7| 8.0| 86| 2.2| 0.0| 0.0|
|  8|  6|  sep|tue|91.0|129.5|692.6| 7.0|13.1| 63| 5.4| 0.0| 0.0|
|  7|  5|  sep|sat|92.5| 88.0|698.6| 7.1|22.8| 40| 4.0| 0.0| 0.0|
|  7|  5|  sep|sat|92.5| 88.0|698.6| 7.1|17.8| 51| 7.2| 0.0| 0.0|
|  7|  5|  sep|sat|92.8| 73.2|713.0|22.6|19.3| 38| 4.0| 0.0| 0.0|
|  6|  5| 

517

In [5]:
# Persist the DataFrame as CSV part-files, replacing any previous output
# in the target directory.
csv_df.write.mode('overwrite').csv('../datasets/write-csv')

## Parquet

In [6]:
# Load the Parquet dataset (the schema is taken from the Parquet files).
parquet_df = spark.read.parquet('../datasets/olympic-history/athletes_events')

# Preview the first rows, then report the total row count as the cell output.
parquet_df.show()
parquet_df.count()

+---+--------------------+---+---+------+------+--------------+---+-----------+----+------+-----------+--------------------+--------------------+-----+
| ID|                Name|Sex|Age|Height|Weight|          Team|NOC|      Games|Year|Season|       City|               Sport|               Event|Medal|
+---+--------------------+---+---+------+------+--------------+---+-----------+----+------+-----------+--------------------+--------------------+-----+
|  1|           A Dijiang|  M| 24|   180|    80|         China|CHN|1992 Summer|1992|Summer|  Barcelona|          Basketball|Basketball Men's ...|   NA|
|  2|            A Lamusi|  M| 23|   170|    60|         China|CHN|2012 Summer|2012|Summer|     London|                Judo|Judo Men's Extra-...|   NA|
|  3| Gunnar Nielsen Aaby|  M| 24|    NA|    NA|       Denmark|DEN|1920 Summer|1920|Summer|  Antwerpen|            Football|Football Men's Fo...|   NA|
|  4|Edgar Lindenau Aabye|  M| 34|    NA|    NA|Denmark/Sweden|DEN|1900 Summer|1900|Summ

271116

In [7]:
# Write the DataFrame back out as Parquet, replacing any previous output
# in the target directory.
parquet_df.write.mode('overwrite').parquet('../datasets/write-parquet')

## Database

In [63]:
# Setup the JDBC connection.
# Credentials are taken from the MYSQL_USER / MYSQL_PASSWORD environment
# variables when set, so real secrets need not be committed with the notebook;
# the fallbacks are the local development defaults.
# The driver class is the Connector/J 8.x name; `com.mysql.jdbc.Driver` is the
# legacy alias and is deprecated in the 8.0.23 connector loaded above.
jdbc_url = "jdbc:mysql://localhost:3306/loja"
connection_properties = {
      "user" : os.environ.get("MYSQL_USER", "root"),
      "password" : os.environ.get("MYSQL_PASSWORD", "123"),
      "driver" : "com.mysql.cj.jdbc.Driver"
    }

# Create a query
query = """
            SELECT * FROM bairro
        """

# Run the query. It is wrapped as an aliased sub-select because the `table`
# argument is used where a table name would appear in the generated SQL.
bh_bairros_df = spark.read \
                     .jdbc(url=jdbc_url, 
                           table=f"({query}) AS t", 
                           properties=connection_properties)

bh_bairros_df.show()
# Only a cell's last expression is displayed, so print the count explicitly
# instead of computing it and discarding the result.
print(bh_bairros_df.count())
bh_bairros_df.printSchema()

+---+--------------------+--------+------------+-------------------+-------------------+
| id|                nome|  regiao|taxa_entrega|         created_at|         updated_at|
+---+--------------------+--------+------------+-------------------+-------------------+
|  1|Conjunto Ademar M...|Barreiro|        5.99|2018-05-21 22:08:33|2018-07-22 01:28:08|
|  2|Conjunto Águas Cl...|Barreiro|         4.0|2018-05-21 22:08:33|2018-05-21 22:08:33|
|  3|    Vila Alta Tensão|Barreiro|         4.0|2018-05-21 22:08:33|2018-05-21 22:08:33|
|  4|        Vila Antenas|Barreiro|         4.0|2018-05-21 22:08:33|2018-05-21 22:08:33|
|  5|            Araguaia|Barreiro|         4.0|2018-05-21 22:08:33|2018-05-21 22:08:33|
|  6|Conjunto Átila de...|Barreiro|         4.0|2018-05-21 22:08:33|2018-05-21 22:08:33|
|  7| Vila Átila de Paiva|Barreiro|         4.0|2018-05-21 22:08:33|2018-05-21 22:08:33|
|  8|       B. Indústrias|Barreiro|         4.0|2018-05-21 22:08:33|2018-05-21 22:08:33|
|  9|            Barr

In [65]:
from pyspark.sql import functions as F

# `bh_bairros_df` is a lazy view over the `bairro` table. Overwriting that same
# table with truncate=true empties it *before* Spark pulls the rows to insert,
# so writing the lazy DataFrame directly would leave the table empty.
# Materialise the rows first and cut the lineage back to the JDBC source.
bairros_snapshot_df = bh_bairros_df.localCheckpoint(eager=True)

# "truncate" keeps the existing table definition (TRUNCATE instead of
# DROP/CREATE) and re-inserts the snapshot rows.
bairros_snapshot_df.write \
                   .mode("overwrite") \
                   .option("truncate", "true") \
                   .jdbc(url=jdbc_url, 
                         table="bairro",
                         properties=connection_properties)

bh_bairros_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- nome: string (nullable = true)
 |-- regiao: string (nullable = true)
 |-- taxa_entrega: double (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- updated_at: timestamp (nullable = true)

