<a href="https://colab.research.google.com/github/juliaschuck/usePyspark/blob/main/usePySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalação de dependências

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [3]:
import os

# Tell Spark where the Java 8 JDK installed by the setup cell lives.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# SPARK_HOME must name the Spark 2.4.4 distribution that this notebook actually
# downloads and unpacks; no spark-3.1.2 directory is ever created here.
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"


In [4]:
import findspark

# Register the unpacked Spark distribution (path relative to the working
# directory where the archive was extracted) so that pyspark can be imported.
findspark.init(spark_home='spark-2.4.4-bin-hadoop2.7')

In [5]:
from pyspark.sql import SparkSession

# Build the session used by every cell below (or reuse one that already exists):
# "local" master, application name "Colab", Spark UI on port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# Lê o arquivo JSON em um DataFrame

In [6]:
# Load the books JSON file into a Spark DataFrame.
df = spark.read.json(path="/content/sample_data/books.json")

In [7]:
# Print the DataFrame's column names, types and nullability as a tree.
df.printSchema()

root
 |-- author: string (nullable = true)
 |-- edition: string (nullable = true)
 |-- price: double (nullable = true)
 |-- title: string (nullable = true)
 |-- year_written: long (nullable = true)



# Mostra tabela com dados

In [8]:
# Preview the first 4 rows without truncating long cell values.
df.show(n=4, truncate=False)

+---------------+--------------+-----+----------------+------------+
|author         |edition       |price|title           |year_written|
+---------------+--------------+-----+----------------+------------+
|Austen, Jane   |Penguin       |18.2 |Northanger Abbey|1814        |
|Tolstoy, Leo   |Penguin       |12.7 |War and Peace   |1865        |
|Tolstoy, Leo   |Penguin       |13.5 |Anna Karenina   |1875        |
|Woolf, Virginia|Harcourt Brace|25.0 |Mrs. Dalloway   |1925        |
+---------------+--------------+-----+----------------+------------+
only showing top 4 rows



In [9]:
# Same 4-row, untruncated preview as the previous cell.
df.show(n=4, truncate=False)


+---------------+--------------+-----+----------------+------------+
|author         |edition       |price|title           |year_written|
+---------------+--------------+-----+----------------+------------+
|Austen, Jane   |Penguin       |18.2 |Northanger Abbey|1814        |
|Tolstoy, Leo   |Penguin       |12.7 |War and Peace   |1865        |
|Tolstoy, Leo   |Penguin       |13.5 |Anna Karenina   |1875        |
|Woolf, Virginia|Harcourt Brace|25.0 |Mrs. Dalloway   |1925        |
+---------------+--------------+-----+----------------+------------+
only showing top 4 rows



In [10]:
# Count the rows in the DataFrame; the value is displayed as the cell result.
df.count()

13

In [11]:
# Project only the title, price and year columns and display them
# using show()'s default row limit and truncation.
(
    df
    .select("title", "price", "year_written")
    .show()
)

+--------------------+-----+------------+
|               title|price|year_written|
+--------------------+-----+------------+
|    Northanger Abbey| 18.2|        1814|
|       War and Peace| 12.7|        1865|
|       Anna Karenina| 13.5|        1875|
|       Mrs. Dalloway| 25.0|        1925|
|           The Hours|12.35|        1999|
|    Huckleberry Finn| 5.76|        1865|
|         Bleak House| 5.75|        1870|
|          Tom Sawyer| 7.75|        1862|
| A Room of One's Own| 29.0|        1922|
|        Harry Potter|19.95|        2000|
|One Hundred Years...| 14.0|        1967|
|Hamlet, Prince of...| 7.95|        1603|
|   Lord of the Rings|27.45|        1937|
+--------------------+-----+------------+



In [12]:
# Keep books written after 1950 that cost more than 10 and have a title.
df_filtered = df.filter(
    "year_written > 1950 AND price > 10 AND title IS NOT NULL"
)

# Display up to 10 of the matching rows without truncating long titles.
(
    df_filtered
    .select("title", "price", "year_written")
    .show(n=10, truncate=False)
)

+-----------------------------+-----+------------+
|title                        |price|year_written|
+-----------------------------+-----+------------+
|The Hours                    |12.35|1999        |
|Harry Potter                 |19.95|2000        |
|One Hundred Years of Solitude|14.0 |1967        |
+-----------------------------+-----+------------+



In [13]:
# Among the filtered books, list the distinct (title, year_written) pairs
# whose title contains "Harry Potter".
(
    df_filtered
    .select("title", "year_written")
    .filter("title LIKE '%Harry Potter%'")
    .distinct()
    .show()
)

+------------+------------+
|       title|year_written|
+------------+------------+
|Harry Potter|        2000|
+------------+------------+



# PySpark SQL Functions


In [14]:
# Import the functions module under an alias rather than `from ... import max`,
# which would rebind Python's built-in max() for every later cell.
from pyspark.sql import functions as F

# Aggregate the filtered books down to a single row holding the highest price,
# then take the first column of the first collected row as a plain Python value.
maxValue = df_filtered.agg(F.max("price")).collect()[0][0]
print("maxValue: ",maxValue)


maxValue:  19.95
