# **PySpark no Google Colab**
---

#### Configurando o Google Colab para habilitar o uso do PySpark

In [1]:
# Install the headless OpenJDK 8 JDK; apt's standard output is discarded
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [2]:
# Download do Apache Spark 3.1.2
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz

In [3]:
# Extract the Apache Spark 3.1.2 archive into the current directory
!tar xf spark-3.1.2-bin-hadoop3.2.tgz

In [4]:
# Delete the Apache Spark 3.1.2 archive now that it has been extracted
!rm -rf spark-3.1.2-bin-hadoop3.2.tgz

In [5]:
# Instala os módulos FindSpark e PySpark
!pip install -q findspark
!pip install -q pyspark==3.1.2

[K     |████████████████████████████████| 281.3 MB 41 kB/s 
[K     |████████████████████████████████| 198 kB 38.0 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


#### Configurando o ambiente para uso do PySpark

In [6]:
# Importa os módulos
import os
import findspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [7]:
# Set the Java and Spark home environment variables to the paths used above
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.1.2-bin-hadoop3.2",
    }
)

In [8]:
# Initialise findspark, then get (or create) a Spark session on the local[*] master
findspark.init()
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

#### PySpark pronto para uso, divirta-se!

In [9]:
# Read the Anscombe sample file as JSON with the multiLine option enabled
dataframe = (
    spark.read
    .format("json")
    .option("multiLine", True)
    .load("sample_data/anscombe.json")
)

In [10]:
# Display the column names of the loaded DataFrame
dataframe.columns

['Series', 'X', 'Y']

In [11]:
# Print the first 10 rows of the loaded DataFrame
dataframe.show(n=10)

+------+----+-----+
|Series|   X|    Y|
+------+----+-----+
|     I|10.0| 8.04|
|     I| 8.0| 6.95|
|     I|13.0| 7.58|
|     I| 9.0| 8.81|
|     I|11.0| 8.33|
|     I|14.0| 9.96|
|     I| 6.0| 7.24|
|     I| 4.0| 4.26|
|     I|12.0|10.84|
|     I| 7.0| 4.81|
+------+----+-----+
only showing top 10 rows



In [12]:
# Print the column names, types and nullability of the DataFrame
dataframe.printSchema()

root
 |-- Series: string (nullable = true)
 |-- X: double (nullable = true)
 |-- Y: double (nullable = true)



In [13]:
# Average X and Y within each Series value, ordered by Series
dataframe_agrupado = (
    dataframe
    .groupBy("Series")
    .agg(
        F.avg("X").alias("X_agrupado"),
        F.avg("Y").alias("Y_agrupado"),
    )
    .orderBy("Series")
)

dataframe_agrupado.show()

In [14]:
# Print the physical plan Spark will use to compute the grouped averages
dataframe_agrupado.explain()

== Physical Plan ==
*(3) Sort [Series#0 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(Series#0 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#38]
   +- *(2) HashAggregate(keys=[Series#0], functions=[avg(X#1), avg(Y#2)])
      +- Exchange hashpartitioning(Series#0, 200), ENSURE_REQUIREMENTS, [id=#34]
         +- *(1) HashAggregate(keys=[Series#0], functions=[partial_avg(X#1), partial_avg(Y#2)])
            +- FileScan json [Series#0,X#1,Y#2] Batched: false, DataFilters: [], Format: JSON, Location: InMemoryFileIndex[file:/content/sample_data/anscombe.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<Series:string,X:double,Y:double>




In [15]:
# Build a small DataFrame with float columns A and B from in-memory tuples
df = spark.createDataFrame(
    [(1.0, 4.0), (2.0, 5.0), (3.0, 6.0)],
    ["A", "B"],
)

In [16]:
# Print the rows of the A/B DataFrame
df.show()

+---+---+
|  A|  B|
+---+---+
|1.0|4.0|
|2.0|5.0|
|3.0|6.0|
+---+---+



In [17]:
# Rebind `df` to a single-column DataFrame of strings that each hold several
# CPF-formatted numbers separated by a comma or spaces
df = spark.createDataFrame(
    [
        ('864.754.453-33,565.878.787-43',),
        ('565.878.787-43 864.754.453-33 565.878.787-43',),
        ('333.444.555-66 222.222.222-33',),
    ],
    ["cpf"],
)

In [18]:
# Print up to 10 rows without truncating the cell contents
df.show(n=10, truncate=False)

+--------------------------------------------+
|cpf                                         |
+--------------------------------------------+
|864.754.453-33,565.878.787-43               |
|565.878.787-43 864.754.453-33 565.878.787-43|
|333.444.555-66 222.222.222-33               |
+--------------------------------------------+



In [19]:
# Compare a loose and a strict CPF pattern on each row:
#   reg1 / reg2 - length of the text left after every match is replaced by ''
#   reg3        - whether the row contains at least one strict-pattern match
df.select(
    df.cpf,
    F.length(F.regexp_replace(df.cpf, r'\d+\.\d+\.\d+\-\d+', '')).alias('reg1'),
    F.length(F.regexp_replace(df.cpf, r'\d{3}\.\d{3}\.\d{3}\-\d{2}', '')).alias('reg2'),
    df.cpf.rlike(r'\d{3}\.\d{3}\.\d{3}\-\d{2}').alias('reg3'),
).show(n=10, truncate=False)

+--------------------------------------------+----+----+----+
|cpf                                         |reg1|reg2|reg3|
+--------------------------------------------+----+----+----+
|864.754.453-33,565.878.787-43               |1   |1   |true|
|565.878.787-43 864.754.453-33 565.878.787-43|2   |2   |true|
|333.444.555-66 222.222.222-33               |1   |1   |true|
+--------------------------------------------+----+----+----+



In [20]:
# Single-column DataFrame of sample URLs, with and without port, path and query
df2 = spark.createDataFrame(
    [
        ('http://site.com:8080/',),
        ('http://localhost.com:8080/?pub=200',),
        ('http://server.com:1234',),
        ('http://server.com',),
    ],
    ["url"],
)

In [21]:
# Print up to 10 URL rows without truncating the cell contents
df2.show(n=10, truncate=False)

+----------------------------------+
|url                               |
+----------------------------------+
|http://site.com:8080/             |
|http://localhost.com:8080/?pub=200|
|http://server.com:1234            |
|http://server.com                 |
+----------------------------------+



In [22]:
# Test each URL against four patterns with rlike (match anywhere in the string):
#   reg1 - scheme, "//", then one or more whitespace characters
#   reg2 - scheme, "//", host.tld, optional ":port"
#   reg3 - scheme, "//", host.tld, mandatory ":port"
#   reg4 - scheme and "//" only
# NOTE(review): reg1 ends in `\s+` (whitespace), so it matches none of the
# sample URLs; `\S+` (non-whitespace) may have been intended - confirm.
df2.select(
    df2.url,
    df2.url.rlike(r'https?:[\/]{2}\s+').alias('reg1'),
    df2.url.rlike(r'https?:[\/]{2}([a-zA-Z0-9]+\.[a-zA-Z]{2,4})(:[0-9]+)?').alias('reg2'),
    df2.url.rlike(r'https?:[\/]{2}([a-zA-Z0-9]+\.[a-zA-Z]{2,4})(:[0-9]+)').alias('reg3'),
    df2.url.rlike(r'https?:\/{2}').alias('reg4'),
).show(n=10, truncate=False)

+----------------------------------+-----+----+-----+----+
|url                               |reg1 |reg2|reg3 |reg4|
+----------------------------------+-----+----+-----+----+
|http://site.com:8080/             |false|true|true |true|
|http://localhost.com:8080/?pub=200|false|true|true |true|
|http://server.com:1234            |false|true|true |true|
|http://server.com                 |false|true|false|true|
+----------------------------------+-----+----+-----+----+

