# Rodar PySpark no Colab:

[Iniciando um projeto Spark no Google Colab](https://www.alura.com.br/artigos/iniciando-projeto-spark-no-colab)

# Documentação PySpark
[Documentação](https://spark.apache.org/docs/latest/api/python/reference/pyspark.html)

In [1]:
!pip3 install pyspark
!pip3 install findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 281.4/281.4 MB 3.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 199.7/199.7 KB 18.6 MB/s eta 0:00:00
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=8fa55d90a8e094a486141092f3a040ad11ebd5fa374494e8aeaadebbd6b034b2
  Stored in directory: /root/.cache/pip/wheels/6c/e3/9b/0525ce8a69478916513509d43693511463c6468db0de237c86
Successfully built pyspark
Installing collected packages: py4j, pyspa

In [2]:
import findspark
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

In [None]:
# Disabled alternative entry point: builds a SparkSession with a local master
# ('local[*]'), an application name and the Spark UI port set to 4050.
# Kept commented out; this notebook creates a SparkContext directly instead.
# spark = SparkSession.builder \
#     .master('local[*]') \
#     .appName('Iniciando com Spark') \
#     .config('spark.ui.port', '4050') \
#     .getOrCreate()

In [3]:
# Let findspark locate the Spark installation before the context is created.
findspark.init()

# Application-level configuration; only the application name is set here.
conf = SparkConf().setAppName("app")

# getOrCreate() reuses the active SparkContext when this cell is re-run.
# Calling SparkContext(conf=conf) a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# RDD

In [None]:
# Distribute the integers 1 through 10 as an RDD.
numeros = sc.parallelize(list(range(1, 11)))

### Mostrar 5 primeiros valores

In [None]:
# Bring the first 5 elements of the RDD back to the driver as a Python list.
numeros.take(5)

[1, 2, 3, 4, 5]

### Mostrar 5 maiores valores

In [None]:
# Return the 5 largest elements, ordered from largest to smallest.
numeros.top(5)

[10, 9, 8, 7, 6]

### Quantidade de registros

In [None]:
# Number of elements in the RDD.
numeros.count()

10

### Média dos valores

In [None]:
# Arithmetic mean of the elements (a float).
numeros.mean()

5.5

### Criando um filtro

In [None]:
# Keep only the elements strictly greater than 2; nothing is computed until
# an action such as collect() is called on the result.
filtro = numeros.filter(lambda numero: 2 < numero)

In [None]:
# Materialise the filtered RDD on the driver as a Python list.
filtro.collect()

[3, 4, 5, 6, 7, 8, 9, 10]

### Utilizando o map - Semelhante ao do Python

In [None]:
# Apply a function to every element: here each number is doubled.
multiplicacao = numeros.map(lambda numero: 2 * numero)

In [None]:
# Materialise the doubled values on the driver as a Python list.
multiplicacao.collect()

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

### União de RDDs

In [None]:
# Second RDD holding the integers 11 through 15.
numeros2 = sc.parallelize(list(range(11, 16)))

# union() concatenates both RDDs; duplicates, if any, would be kept.
uniao = numeros.union(numeros2)
uniao.collect()

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

### Pegando as chaves dos pares (chave, valor)

In [12]:
# Pair RDD of (customer id, purchase amount) tuples.
id_cliente_valor_compra = sc.parallelize(
    [
        ('cliente1', 200),
        ('cliente2', 300),
        ('cliente3', 120),
        ('cliente4', 250),
        ('cliente5', 78),
    ]
)

# keys() projects each pair onto its first element (the customer id).
id_cliente = id_cliente_valor_compra.keys()
id_cliente.collect()

['cliente1', 'cliente2', 'cliente3', 'cliente4', 'cliente5']

### Pegando os valores da tupla

In [10]:
# values() projects each (key, value) pair onto its second element
# (the purchase amount).
valor_compra = id_cliente_valor_compra.values()
# Bring the amounts back to the driver as a Python list.
valor_compra.collect()

[200, 300, 120, 250, 78]

### Quantidade de valores por chave

In [13]:
# Count how many pairs exist for each key; the result is returned to the
# driver as a dictionary (key -> count), not as an RDD.
id_cliente_valor_compra.countByKey()

defaultdict(int,
            {'cliente1': 1,
             'cliente2': 1,
             'cliente3': 1,
             'cliente4': 1,
             'cliente5': 1})

### Realizando operações com os valores

In [14]:
# mapValues() transforms only the value of each pair and leaves the key
# untouched; here every purchase amount is doubled.
# NOTE(review): the name `soma` ("sum") is misleading because the values are
# multiplied by 2, not summed. It is kept so any later reference still works.
soma = id_cliente_valor_compra.mapValues(lambda compra: 2 * compra)
soma.collect()

[('cliente1', 400),
 ('cliente2', 600),
 ('cliente3', 240),
 ('cliente4', 500),
 ('cliente5', 156)]

### Realizando joins

In [16]:
# Pair RDD of (customer id, amount paid); only two customers have a payment.
qtd_paga_por_cliente = sc.parallelize(
    [
        ('cliente2', 55),
        ('cliente4', 400),
    ]
)

In [18]:
# Inner join by key: only keys present in both RDDs survive, and each result
# is (key, (value from the left RDD, value from the right RDD)).
resultado = id_cliente_valor_compra.join(qtd_paga_por_cliente)
resultado.collect()

[('cliente2', (300, 55)), ('cliente4', (250, 400))]

In [19]:
# subtractByKey() keeps the pairs from the left RDD whose key does not appear
# in the right RDD, i.e. customers with a purchase but no payment record.
sem_debito = id_cliente_valor_compra.subtractByKey(qtd_paga_por_cliente)
# The collected order is not guaranteed to match the input order.
sem_debito.collect()

[('cliente5', 78), ('cliente1', 200), ('cliente3', 120)]