In [1]:
!pip3 install pyspark
!pip3 install findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=6cf826b7878787b4f1e506e8afebe1ff63fa72402c82347d91b1b34f1bd998bb
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [4]:
import findspark
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [6]:
# Build the SparkSession for this notebook under the application name
# 'app_spark'; getOrCreate() hands back an already-active session if one exists.
spark = (
    SparkSession.builder
    .appName('app_spark')
    .getOrCreate()
)

### DataFrames

In [7]:
# No schema argument is passed, so Spark infers the column names and types
# from the tuples themselves.
df1 = spark.createDataFrame(
    [
        ('Pedro', 10),
        ('Maria', 15),
        ('Higor', 22),
    ]
)
df1.show()

### Criando Schema

In [13]:
# Column names and types are declared up front as a DDL-formatted string.
schema = 'id INT, nome STRING'
# One inner list per row, in the same order as the schema's columns.
dados = [
    [1, 'Pedro'],
    [2, 'Maria'],
]

df2 = spark.createDataFrame(data=dados, schema=schema)
df2.show()

+---+-----+
| id| nome|
+---+-----+
|  1|Pedro|
|  2|Maria|
+---+-----+



### Utilizando agregações

In [14]:
from pyspark.sql.functions import sum

In [16]:
# Sales sample: product name plus quantity sold. 'Caneta' appears twice on
# purpose-built rows, which the aggregation cell further down adds together.
schema2 = 'produtos STRING, qtd_vendida INT'
vendas = [
    ['Caneta', 10],
    ['Lápis', 30],
    ['Caneta', 30],
]
df3 = spark.createDataFrame(data=vendas, schema=schema2)
df3.show()

+--------+-----------+
|produtos|qtd_vendida|
+--------+-----------+
|  Caneta|         10|
|   Lápis|         30|
|  Caneta|         30|
+--------+-----------+



In [20]:
# Total quantity sold per product.
# `sum` is the Spark aggregate imported from pyspark.sql.functions in the cell
# above, not Python's builtin of the same name.
agrupado = (
    df3
    .groupBy('produtos')
    .agg(sum('qtd_vendida'))
)
agrupado.show()

+--------+----------------+
|produtos|sum(qtd_vendida)|
+--------+----------------+
|  Caneta|              40|
|   Lápis|              30|
+--------+----------------+



### Selecionar colunas

In [21]:
# Project only the product-name column and print the result.
(
    df3
    .select('produtos')
    .show()
)

+--------+
|produtos|
+--------+
|  Caneta|
|   Lápis|
|  Caneta|
+--------+



### Criando cálculos (expressões)

In [22]:
from pyspark.sql.functions import expr

In [26]:
# Show both original columns next to a derived one: the quantity sold
# multiplied by 0.2, evaluated by Spark from a SQL expression string.
df3.select(
    'produtos',
    'qtd_vendida',
    expr('qtd_vendida * 0.2'),
).show()

+--------+-----------+-------------------+
|produtos|qtd_vendida|(qtd_vendida * 0.2)|
+--------+-----------+-------------------+
|  Caneta|         10|                2.0|
|   Lápis|         30|                6.0|
|  Caneta|         30|                6.0|
+--------+-----------+-------------------+



### Checar schema e colunas

In [29]:
# Display the DataFrame's schema object (field names, types and nullability).
df3.schema

StructType([StructField('produtos', StringType(), True), StructField('qtd_vendida', IntegerType(), True)])

In [30]:
# Display just the column names as a plain Python list.
df3.columns

['produtos', 'qtd_vendida']

### Coletando dados de csv

In [31]:
from pyspark.sql.types import *

In [35]:
# Explicit DDL schema for the CSV; note that `data` is declared as STRING here.
csv_schema = (
    'id INT, nome STRING, status STRING, cidade STRING, vendas INT, data STRING'
)
# The file is read as headerless (header=False), so the first line is data.
# NOTE(review): the output recorded for this cell starts at id 2, while the
# inferSchema load of the same file shows a row with id 1 - re-run to confirm
# the recorded output is not stale.
despachantes = spark.read.csv(
    '/content/despachantes.csv',
    schema=csv_schema,
    header=False,
)
despachantes.show()

+---+-------------------+------+-------------+------+----------+
| id|               nome|status|       cidade|vendas|      data|
+---+-------------------+------+-------------+------+----------+
|  2|    Deolinda Vilela| Ativo|Novo Hamburgo|    34|2020-03-05|
|  3|   Emídio Dornelles| Ativo| Porto Alegre|    34|2020-02-05|
|  4|Felisbela Dornelles| Ativo| Porto Alegre|    36|2020-02-05|
|  5|     Graça Ornellas| Ativo| Porto Alegre|    12|2020-02-05|
|  6|   Matilde Rebouças| Ativo| Porto Alegre|    22|2019-01-05|
|  7|    Noêmia   Orriça| Ativo|  Santa Maria|    45|2019-10-05|
|  8|      Roque Vásquez| Ativo| Porto Alegre|    65|2020-03-05|
|  9|      Uriel Queiroz| Ativo| Porto Alegre|    54|2018-05-05|
| 10|   Viviana Sequeira| Ativo| Porto Alegre|     0|2020-09-05|
+---+-------------------+------+-------------+------+----------+



In [39]:
# Same file again, this time through the generic loader with the column types
# inferred by Spark instead of being declared by hand.
despachantes_auto_schema = spark.read.load(
    '/content/despachantes.csv',
    format='csv',
    sep=',',
    header=False,
    inferSchema=True,
)
despachantes_auto_schema.show()

+---+-------------------+-----+-------------+---+-------------------+
|_c0|                _c1|  _c2|          _c3|_c4|                _c5|
+---+-------------------+-----+-------------+---+-------------------+
|  1|   Carminda Pestana|Ativo|  Santa Maria| 23|2020-08-11 00:00:00|
|  2|    Deolinda Vilela|Ativo|Novo Hamburgo| 34|2020-03-05 00:00:00|
|  3|   Emídio Dornelles|Ativo| Porto Alegre| 34|2020-02-05 00:00:00|
|  4|Felisbela Dornelles|Ativo| Porto Alegre| 36|2020-02-05 00:00:00|
|  5|     Graça Ornellas|Ativo| Porto Alegre| 12|2020-02-05 00:00:00|
|  6|   Matilde Rebouças|Ativo| Porto Alegre| 22|2019-01-05 00:00:00|
|  7|    Noêmia   Orriça|Ativo|  Santa Maria| 45|2019-10-05 00:00:00|
|  8|      Roque Vásquez|Ativo| Porto Alegre| 65|2020-03-05 00:00:00|
|  9|      Uriel Queiroz|Ativo| Porto Alegre| 54|2018-05-05 00:00:00|
| 10|   Viviana Sequeira|Ativo| Porto Alegre|  0|2020-09-05 00:00:00|
+---+-------------------+-----+-------------+---+-------------------+



### Comparando os schemas

In [41]:
# Schema of the frame loaded with the hand-written DDL string (csv_schema).
despachantes.schema

StructType([StructField('id', IntegerType(), True), StructField('nome', StringType(), True), StructField('status', StringType(), True), StructField('cidade', StringType(), True), StructField('vendas', IntegerType(), True), StructField('data', StringType(), True)])

In [40]:
# Schema of the frame loaded with inferSchema=True, for comparison with the
# explicit one; in the recorded output the last column is inferred as a timestamp.
despachantes_auto_schema.schema

StructType([StructField('_c0', IntegerType(), True), StructField('_c1', StringType(), True), StructField('_c2', StringType(), True), StructField('_c3', StringType(), True), StructField('_c4', IntegerType(), True), StructField('_c5', TimestampType(), True)])