# **PySpark no Google Colab**
---

#### Configurando o Google Colab para habilitar o uso do PySpark

In [1]:
# Install the headless OpenJDK 8 JDK; -qq and the /dev/null redirect keep apt output out of the cell
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [3]:
# Download do Apache Spark 3.1.2
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz

In [4]:
# Extract the Apache Spark 3.1.2 archive into the current working directory
!tar xf spark-3.1.2-bin-hadoop3.2.tgz

In [6]:
# Delete the compressed Apache Spark 3.1.2 archive
!rm -rf spark-3.1.2-bin-hadoop3.2.tgz

In [None]:
# Instala os módulos FindSpark e PySpark
!pip install -q findspark
!pip install -q pyspark

[K     |████████████████████████████▉   | 253.2 MB 1.2 MB/s eta 0:00:23

#### Configurando o ambiente para uso do PySpark

In [19]:
# Importa os módulos
import os
import findspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [8]:
# Set the JAVA_HOME and SPARK_HOME environment variables for this process
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.2-bin-hadoop3.2",
    }
)

In [12]:
# Initialise findspark, then get (or create) a Spark session that runs locally
# using all available cores ("local[*]")
findspark.init()
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

#### PySpark pronto para uso, divirta-se!

In [11]:
# Load the Anscombe sample file as a DataFrame; the multiLine option lets the
# JSON reader parse a document that spans several lines
dataset = (
    spark.read
    .format("json")
    .option("multiLine", True)
    .load("sample_data/anscombe.json")
)

['Series', 'X', 'Y']

In [None]:
# Display the list of column names of the loaded dataset
dataset.columns

In [None]:
# Print the first 10 rows of the dataset
dataset.show(10)

In [None]:
# Print the dataset's schema (column names and types)
dataset.printSchema()

In [None]:
# Average X and Y within each "Series" group and sort the result by series
dataset_agrupado = (
    dataset
    .groupBy("Series")
    .agg(
        F.avg("X").alias("X_agrupado"),
        F.avg("Y").alias("Y_agrupado"),
    )
    .orderBy("Series")
)

# Print the aggregated rows
dataset_agrupado.show()

In [None]:
# Print the execution plan Spark built for the grouped/ordered DataFrame
dataset_agrupado.explain()

In [13]:
# Small in-memory DataFrame: three rows of floats under columns "A" and "B"
df = spark.createDataFrame(
    [(1.0, 4.0), (2.0, 5.0), (3.0, 6.0)],
    ["A", "B"],
)

In [14]:
# Print the DataFrame as a text table
df.show()

+---+---+
|  A|  B|
+---+---+
|1.0|4.0|
|2.0|5.0|
|3.0|6.0|
+---+---+



In [16]:
# Single-column ("cpf") DataFrame of strings; each row holds several
# CPF-formatted numbers separated by a comma or by spaces.
# Note: this rebinds `df`, replacing the A/B DataFrame created earlier.
df = spark.createDataFrame(
    [
        ("864.754.453-33,565.878.787-43",),
        ("565.878.787-43 864.754.453-33 565.878.787-43",),
        ("333.444.555-66 222.222.222-33",),
    ],
    ["cpf"],
)

In [20]:
# Print up to 10 rows without truncating long string values
df.show(n=10, truncate=False)

+--------------------------------------------+
|cpf                                         |
+--------------------------------------------+
|864.754.453-33,565.878.787-43               |
|565.878.787-43 864.754.453-33 565.878.787-43|
|333.444.555-66 222.222.222-33               |
+--------------------------------------------+



In [21]:
# Compare three regex checks on each "cpf" string:
#   reg1 - number of characters left after removing every match of a loose
#          pattern (digit groups of any length joined by '.', '.', '-')
#   reg2 - same measurement using the strict 3.3.3-2 digit layout
#   reg3 - whether the string contains at least one strictly formatted match
# What remains after the replacement is only the separators between numbers.
(
    df.select(
        df["cpf"],
        F.length(
            F.regexp_replace(df["cpf"], r'\d+\.\d+\.\d+\-\d+', '')
        ).alias('reg1'),
        F.length(
            F.regexp_replace(df["cpf"], r'\d{3}\.\d{3}\.\d{3}\-\d{2}', '')
        ).alias('reg2'),
        df["cpf"].rlike(r'\d{3}\.\d{3}\.\d{3}\-\d{2}').alias('reg3'),
    )
    .show(n=10, truncate=False)
)

+--------------------------------------------+----+----+----+
|cpf                                         |reg1|reg2|reg3|
+--------------------------------------------+----+----+----+
|864.754.453-33,565.878.787-43               |1   |1   |true|
|565.878.787-43 864.754.453-33 565.878.787-43|2   |2   |true|
|333.444.555-66 222.222.222-33               |1   |1   |true|
+--------------------------------------------+----+----+----+



In [23]:
# Single-column ("url") DataFrame of sample URLs: with and without a port,
# a trailing slash, and a query string
df2 = spark.createDataFrame(
    [
        ("http://site.com:8080/",),
        ("http://localhost.com:8080/?pub=200",),
        ("http://server.com:1234",),
        ("http://server.com",),
    ],
    ["url"],
)

In [24]:
# Print up to 10 rows without truncating long string values
df2.show(n=10, truncate=False)

+----------------------------------+
|url                               |
+----------------------------------+
|http://site.com:8080/             |
|http://localhost.com:8080/?pub=200|
|http://server.com:1234            |
|http://server.com                 |
+----------------------------------+



In [25]:
# Evaluate four URL patterns with rlike (true when the pattern matches
# somewhere in the string):
#   reg1 - scheme + "//" followed by one or more WHITESPACE characters
#   reg2 - scheme + "//" + host.tld, with an optional ":port"
#   reg3 - same as reg2 but the ":port" part is required
#   reg4 - only the scheme + "//" prefix
# NOTE(review): reg1 uses \s+ (whitespace), so it is false for every sample
# URL; \S+ (non-whitespace) was probably intended - confirm before changing,
# since the recorded output reflects the current pattern.
(
    df2.select(
        df2["url"],
        df2["url"].rlike(r'https?:[\/]{2}\s+').alias('reg1'),
        df2["url"].rlike(
            r'https?:[\/]{2}([a-zA-Z0-9]+\.[a-zA-Z]{2,4})(:[0-9]+)?'
        ).alias('reg2'),
        df2["url"].rlike(
            r'https?:[\/]{2}([a-zA-Z0-9]+\.[a-zA-Z]{2,4})(:[0-9]+)'
        ).alias('reg3'),
        df2["url"].rlike(r'https?:\/{2}').alias('reg4'),
    )
    .show(n=10, truncate=False)
)

+----------------------------------+-----+----+-----+----+
|url                               |reg1 |reg2|reg3 |reg4|
+----------------------------------+-----+----+-----+----+
|http://site.com:8080/             |false|true|true |true|
|http://localhost.com:8080/?pub=200|false|true|true |true|
|http://server.com:1234            |false|true|true |true|
|http://server.com                 |false|true|false|true|
+----------------------------------+-----+----+-----+----+

