In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType
import pyspark.sql.functions as F

In [2]:

# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Iniciando com Spark")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/19 23:28:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/19 23:28:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/01/19 23:28:33 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


### Lendo Csv

In [3]:
# Read the countries CSV: first row is the header, fields are comma-separated,
# single quotes delimit quoted values, and column types are inferred.
path_countries = '../../datalake/transient/departments/countries'
df_countries = (
    spark.read
    .format('csv')
    .options(header=True, sep=",", quote="\'", inferSchema=True)
    .load(path_countries)
)
#transient\csv\olist

In [4]:
# Read the regions CSV: first row is the header, fields are comma-separated,
# single quotes delimit quoted values, and column types are inferred.
path_regions = '../../datalake/transient/departments/regions'
df_regions = (
    spark.read
    .format('csv')
    .options(header=True, sep=",", quote="\'", inferSchema=True)
    .load(path_regions)
)

In [5]:
# Read the jobs CSV: first row is the header, fields are comma-separated,
# single quotes delimit quoted values, and column types are inferred.
path_jobs = '../../datalake/transient/departments/jobs'
df_jobs = (
    spark.read
    .format('csv')
    .options(header=True, sep=",", quote="\'", inferSchema=True)
    .load(path_jobs)
)

In [6]:
# Read the employees CSV: first row is the header, fields are comma-separated,
# single quotes delimit quoted values, and column types are inferred.
path_employees = '../../datalake/transient/departments/employees'
df_employees = (
    spark.read
    .format('csv')
    .options(header=True, sep=",", quote="\'", inferSchema=True)
    .load(path_employees)
)

In [7]:
# Peek at the first two countries to check the columns that were read.
df_countries.show(n=2)

+----------+------------+---------+
|country_id|country_name|region_id|
+----------+------------+---------+
|        AR|   Argentina|        2|
|        AU|   Australia|        3|
+----------+------------+---------+
only showing top 2 rows



In [8]:
# Display up to 20 regions.
df_regions.show(n=20)

+---------+--------------------+
|region_id|         region_name|
+---------+--------------------+
|        1|              Europe|
|        2|            Americas|
|        3|                Asia|
|        4|Middle East and A...|
+---------+--------------------+



In [9]:
# Hand-written extra regions, built in memory rather than read from a file.
data = [
    (97, "Terra Média"),
    (98, "Westeros"),
    (98, "Esteros"),
    (100, "Sistema Solar"),
]

# Explicit schema: region_id | region_name, both nullable.
schema = StructType([
    StructField("region_id", IntegerType(), True),
    StructField("region_name", StringType(), True),
])

df_region2 = spark.createDataFrame(data, schema)

In [10]:
# Hand-written extra countries, built in memory rather than read from a file.
# country_id is a string here because df_countries holds string codes such as
# 'AR' in that column; declaring it as an integer would make a union of the two
# frames depend on implicit int/string coercion, whose outcome varies with
# Spark's ANSI setting.
data = [
    ("50", "Valfenda", 91),
    ("51", "Kings Landing", 98),
    ("51", "Terra", 101),
]

# Explicit schema: country_id | country_name | region_id, all nullable.
schema = StructType([
    StructField("country_id", StringType(), True),
    StructField("country_name", StringType(), True),
    StructField("region_id", IntegerType(), True),
])

df_countries2 = spark.createDataFrame(data=data, schema=schema)

### **Union** ###
Podemos unir dataframes que tenham o mesmo schema; o efeito é o mesmo de empilhar os dataframes.<br>


In [11]:
# Stack the two region frames. unionByName pairs columns by name instead of by
# position, so a difference in column order cannot silently misalign the data.
df_regions3 = df_regions.unionByName(df_region2)

In [12]:
# Display the stacked regions (up to the default 20 rows).
df_regions3.show(n=20)

                                                                                

+---------+--------------------+
|region_id|         region_name|
+---------+--------------------+
|        1|              Europe|
|        2|            Americas|
|        3|                Asia|
|        4|Middle East and A...|
|       97|         Terra Média|
|       98|            Westeros|
|       98|             Esteros|
|      100|       Sistema Solar|
+---------+--------------------+



In [13]:
# Stack the two country frames. unionByName pairs columns by name instead of by
# position, so a difference in column order cannot silently misalign the data.
df_countries3 = df_countries.unionByName(df_countries2)

In [14]:
# Display up to 30 of the stacked countries.
df_countries3.show(n=30)

+----------+--------------------+---------+
|country_id|        country_name|region_id|
+----------+--------------------+---------+
|        AR|           Argentina|        2|
|        AU|           Australia|        3|
|        BE|             Belgium|        1|
|        BR|              Brazil|        2|
|        CA|              Canada|        2|
|        CH|         Switzerland|        1|
|        CN|               China|        3|
|        DE|             Germany|        1|
|        DK|             Denmark|        1|
|        EG|               Egypt|        4|
|        FR|              France|        1|
|        HK|            HongKong|        3|
|        IL|              Israel|        4|
|        IN|               India|        3|
|        IT|               Italy|        1|
|        JP|               Japan|        3|
|        KW|              Kuwait|        4|
|        MX|              Mexico|        2|
|        NG|             Nigeria|        4|
|        NL|         Netherlands

### **Join** ###
Outra possibilidade muito utilizada é a **junção** ou **join** entre dataframes. A junção necessita que os dataframes envolvidos tenham um campo em comum, semelhante à relação de chaves primárias e estrangeiras do SQL tradicional.<br>
**Tipos de junções**;<br>
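• inner - Junção padrão; só retorna as linhas cuja chave exista em todos os dataframes envolvidos;<br>
• left - Sempre retorna os elementos do dataframe da esquerda; caso não haja correspondência no dataframe da direita, as colunas desse dataframe aparecem como nulas;<br>
• full - Retorna os elementos dos dois dataframes; onde não houver correspondência, as colunas do outro dataframe aparecem como nulas;<br>
• anti - Retorna apenas os elementos do dataframe da esquerda que não sejam encontrados no dataframe da direita;<br>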

### **Inner Join** ###
Junção padrão: só retorna as linhas cuja chave exista nos dois dataframes. Como em toda junção, os dataframes envolvidos precisam ter um campo em comum, semelhante à relação de chaves primárias e estrangeiras do SQL tradicional.<br>

In [15]:
# Inner join: pair each region with the countries that carry the same region_id.
condicao = df_regions3["region_id"] == df_countries3["region_id"]
df_join = df_regions3.join(other=df_countries3, on=condicao, how='inner')

In [16]:
# Display up to 30 rows of the inner join.
df_join.show(n=30)



+---------+--------------------+----------+--------------------+---------+
|region_id|         region_name|country_id|        country_name|region_id|
+---------+--------------------+----------+--------------------+---------+
|        1|              Europe|        BE|             Belgium|        1|
|        1|              Europe|        CH|         Switzerland|        1|
|        1|              Europe|        DE|             Germany|        1|
|        1|              Europe|        DK|             Denmark|        1|
|        1|              Europe|        FR|              France|        1|
|        1|              Europe|        IT|               Italy|        1|
|        1|              Europe|        NL|         Netherlands|        1|
|        1|              Europe|        UK|      United Kingdom|        1|
|        2|            Americas|        AR|           Argentina|        2|
|        2|            Americas|        BR|              Brazil|        2|
|        2|            Am

                                                                                

### **Left Join** ###
Sempre retorna os elementos do dataframe da esquerda; caso os elementos correspondentes do dataframe da direita não sejam encontrados, as colunas desse dataframe aparecem como nulas.<br>

In [17]:
# Left join: every region is kept, matched to countries on region_id.
condicao = df_regions3["region_id"] == df_countries3["region_id"]
df_join = df_regions3.join(other=df_countries3, on=condicao, how='left')

In [18]:
# Display up to 30 rows of the left join.
df_join.show(n=30)

+---------+--------------------+----------+--------------------+---------+
|region_id|         region_name|country_id|        country_name|region_id|
+---------+--------------------+----------+--------------------+---------+
|        1|              Europe|        UK|      United Kingdom|        1|
|        1|              Europe|        NL|         Netherlands|        1|
|        1|              Europe|        IT|               Italy|        1|
|        1|              Europe|        FR|              France|        1|
|        1|              Europe|        DK|             Denmark|        1|
|        1|              Europe|        DE|             Germany|        1|
|        1|              Europe|        CH|         Switzerland|        1|
|        1|              Europe|        BE|             Belgium|        1|
|        3|                Asia|        SG|           Singapore|        3|
|        3|                Asia|        JP|               Japan|        3|
|        3|              

### **Full Join** ###
Retorna os elementos dos dois dataframes; caso os elementos correspondentes do outro dataframe não sejam encontrados, as colunas desse dataframe aparecem como nulas.<br>

In [19]:
# Full join: regions and countries from both sides are kept, matched on region_id.
condicao = df_regions3["region_id"] == df_countries3["region_id"]
df_join = df_regions3.join(other=df_countries3, on=condicao, how='full')

In [20]:
# Display up to 30 rows of the full join.
df_join.show(n=30)

+---------+--------------------+----------+--------------------+---------+
|region_id|         region_name|country_id|        country_name|region_id|
+---------+--------------------+----------+--------------------+---------+
|        1|              Europe|        BE|             Belgium|        1|
|        1|              Europe|        CH|         Switzerland|        1|
|        1|              Europe|        DE|             Germany|        1|
|        1|              Europe|        DK|             Denmark|        1|
|        1|              Europe|        FR|              France|        1|
|        1|              Europe|        IT|               Italy|        1|
|        1|              Europe|        NL|         Netherlands|        1|
|        1|              Europe|        UK|      United Kingdom|        1|
|        2|            Americas|        AR|           Argentina|        2|
|        2|            Americas|        BR|              Brazil|        2|
|        2|            Am

### **Anti Join** ###
Retorna apenas os elementos do dataframe da esquerda que não sejam encontrados no dataframe da direita.<br>

In [21]:
# Anti join: keep only the regions whose region_id has no match in df_countries.
condicao = df_regions3["region_id"] == df_countries["region_id"]
df_join = df_regions3.join(other=df_countries, on=condicao, how='anti')

In [22]:
# Display the regions left over by the anti join (up to the default 20 rows).
df_join.show(n=20)

+---------+-------------+
|region_id|  region_name|
+---------+-------------+
|       97|  Terra Média|
|       98|     Westeros|
|       98|      Esteros|
|      100|Sistema Solar|
+---------+-------------+



In [23]:
### Pivot

In [24]:
# Monthly figures in long format, one (Month, Indicator, Amount) tuple per row.
# Profit is a flat 100.0 for months 1-12; Revenue has no entry for month 11.
data = [(month, "Profit", 100.0) for month in range(1, 13)] + [
    (1, "Revenue", 500.0),
    (2, "Revenue", 500.0),
    (3, "Revenue", 500.0),
    (4, "Revenue", 500.0),
    (5, "Revenue", 555.0),
    (6, "Revenue", 777.0),
    (7, "Revenue", 800.0),
    (8, "Revenue", 900.0),
    (9, "Revenue", 1000.0),
    (10, "Revenue", 300.0),
    (12, "Revenue", 400.0),
]

# Explicit schema: Month | Indicator | Amount, all nullable.
schema = StructType([
    StructField("Month", IntegerType(), True),
    StructField("Indicator", StringType(), True),
    StructField("Amount", FloatType(), True),
])

df_profit = spark.createDataFrame(data, schema)

In [25]:

# Display the long-format figures (up to the default 20 rows).
df_profit.show(n=20)

+-----+---------+------+
|Month|Indicator|Amount|
+-----+---------+------+
|    1|   Profit| 100.0|
|    2|   Profit| 100.0|
|    3|   Profit| 100.0|
|    4|   Profit| 100.0|
|    5|   Profit| 100.0|
|    6|   Profit| 100.0|
|    7|   Profit| 100.0|
|    8|   Profit| 100.0|
|    9|   Profit| 100.0|
|   10|   Profit| 100.0|
|   11|   Profit| 100.0|
|   12|   Profit| 100.0|
|    1|  Revenue| 500.0|
|    2|  Revenue| 500.0|
|    3|  Revenue| 500.0|
|    4|  Revenue| 500.0|
|    5|  Revenue| 555.0|
|    6|  Revenue| 777.0|
|    7|  Revenue| 800.0|
|    8|  Revenue| 900.0|
+-----+---------+------+
only showing top 20 rows



In [26]:
# Pivot to wide format: one row per Indicator, one column per Month value,
# each cell holding the summed Amount. The pivot values are not listed here,
# so the columns come from the Month values present in the data.
df_pivot = (
    df_profit
    .groupBy("Indicator")
    .pivot("Month")
    .sum("Amount")
)
df_pivot.printSchema()
df_pivot.show(truncate=False)

root
 |-- Indicator: string (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double (nullable = true)
 |-- 3: double (nullable = true)
 |-- 4: double (nullable = true)
 |-- 5: double (nullable = true)
 |-- 6: double (nullable = true)
 |-- 7: double (nullable = true)
 |-- 8: double (nullable = true)
 |-- 9: double (nullable = true)
 |-- 10: double (nullable = true)
 |-- 11: double (nullable = true)
 |-- 12: double (nullable = true)

+---------+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+-----+-----+
|Indicator|1    |2    |3    |4    |5    |6    |7    |8    |9     |10   |11   |12   |
+---------+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+-----+-----+
|Profit   |100.0|100.0|100.0|100.0|100.0|100.0|100.0|100.0|100.0 |100.0|100.0|100.0|
|Revenue  |500.0|500.0|500.0|500.0|555.0|777.0|800.0|900.0|1000.0|300.0|null |400.0|
+---------+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+-----+-----+



In [27]:
# Same pivot, but with the twelve Month values to turn into columns given explicitly.
months = list(range(1, 13))
df_pivot = (
    df_profit
    .groupBy("Indicator")
    .pivot("Month", values=months)
    .sum("Amount")
)
df_pivot.printSchema()
df_pivot.show(truncate=False)

root
 |-- Indicator: string (nullable = true)
 |-- 1: double (nullable = true)
 |-- 2: double (nullable = true)
 |-- 3: double (nullable = true)
 |-- 4: double (nullable = true)
 |-- 5: double (nullable = true)
 |-- 6: double (nullable = true)
 |-- 7: double (nullable = true)
 |-- 8: double (nullable = true)
 |-- 9: double (nullable = true)
 |-- 10: double (nullable = true)
 |-- 11: double (nullable = true)
 |-- 12: double (nullable = true)

+---------+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+-----+-----+
|Indicator|1    |2    |3    |4    |5    |6    |7    |8    |9     |10   |11   |12   |
+---------+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+-----+-----+
|Profit   |100.0|100.0|100.0|100.0|100.0|100.0|100.0|100.0|100.0 |100.0|100.0|100.0|
|Revenue  |500.0|500.0|500.0|500.0|555.0|777.0|800.0|900.0|1000.0|300.0|null |400.0|
+---------+-----+-----+-----+-----+-----+-----+-----+-----+------+-----+-----+-----+



In [28]:
### Unpivot

In [29]:
# NOTE(review): expr is imported here but not used in this cell.
from pyspark.sql.functions import expr

# Melt the twelve month columns ('1' .. '12') back into long format:
# Indicator stays as the id column, the column label goes to 'Month' and
# the cell value goes to 'Amount'.
unPivotDF = df_pivot.unpivot(
    ids=['Indicator'],
    values=[str(month) for month in range(1, 13)],
    variableColumnName='Month',
    valueColumnName='Amount',
)

unPivotDF.show(truncate=False)


+---------+-----+------+
|Indicator|Month|Amount|
+---------+-----+------+
|Profit   |1    |100.0 |
|Profit   |2    |100.0 |
|Profit   |3    |100.0 |
|Profit   |4    |100.0 |
|Profit   |5    |100.0 |
|Profit   |6    |100.0 |
|Profit   |7    |100.0 |
|Profit   |8    |100.0 |
|Profit   |9    |100.0 |
|Profit   |10   |100.0 |
|Profit   |11   |100.0 |
|Profit   |12   |100.0 |
|Revenue  |1    |500.0 |
|Revenue  |2    |500.0 |
|Revenue  |3    |500.0 |
|Revenue  |4    |500.0 |
|Revenue  |5    |555.0 |
|Revenue  |6    |777.0 |
|Revenue  |7    |800.0 |
|Revenue  |8    |900.0 |
+---------+-----+------+
only showing top 20 rows



### Agregações

count()	Use groupBy() count() to return the number of rows for each group. <br>
mean()	Returns the mean of values for each group. <br>
max()	Returns the maximum of values for each group. <br>
min()	Returns the minimum of values for each group. <br>
sum()	Returns the total for values for each group. <br>
avg()	Returns the average for values for each group. <br>
agg()	Using groupBy() agg() function, we can calculate more than one aggregate at a time. <br>

In [32]:
# Total number of rows in the unpivoted frame (2 indicators x 12 month columns).
unPivotDF.count()

24

In [33]:
# Average Amount per Indicator.
(
    unPivotDF
    .groupBy('Indicator')
    .mean('Amount')
    .show()
)

+---------+-----------+
|Indicator|avg(Amount)|
+---------+-----------+
|   Profit|      100.0|
|  Revenue|      612.0|
+---------+-----------+



In [34]:
# Total Amount per Indicator.
(
    unPivotDF
    .groupBy('Indicator')
    .sum('Amount')
    .show()
)

+---------+-----------+
|Indicator|sum(Amount)|
+---------+-----------+
|   Profit|     1200.0|
|  Revenue|     6732.0|
+---------+-----------+



In [35]:
# Largest Amount per Indicator.
(
    unPivotDF
    .groupBy('Indicator')
    .max('Amount')
    .show()
)

+---------+-----------+
|Indicator|max(Amount)|
+---------+-----------+
|   Profit|      100.0|
|  Revenue|     1000.0|
+---------+-----------+



In [37]:
# Smallest Amount per Indicator.
(
    unPivotDF
    .groupBy('Indicator')
    .min('Amount')
    .show()
)

+---------+-----------+
|Indicator|min(Amount)|
+---------+-----------+
|   Profit|      100.0|
|  Revenue|      300.0|
+---------+-----------+



In [38]:
# Keep the grouped (not yet aggregated) object so later cells can call agg() on it.
df_grouped = unPivotDF.groupBy('Indicator')

In [39]:
from pyspark.sql import types as T, functions as F

In [40]:
# Gather the Amount values of each Indicator into a single array column.
df_grouped.agg(F.collect_list('Amount')).show(truncate=False)

+---------+------------------------------------------------------------------------------------+
|Indicator|collect_list(Amount)                                                                |
+---------+------------------------------------------------------------------------------------+
|Profit   |[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]|
|Revenue  |[500.0, 500.0, 500.0, 500.0, 555.0, 777.0, 800.0, 900.0, 1000.0, 300.0, 400.0]      |
+---------+------------------------------------------------------------------------------------+



In [41]:
# Display the jobs table (up to the default 20 rows).
df_jobs.show(n=20)

+------+--------------------+----------+----------+
|job_id|           jod_title|min_salary|max_salary|
+------+--------------------+----------+----------+
|     1|   Public Accountant|    4200.0|    9000.0|
|     2|  Accounting Manager|    8200.0|   16000.0|
|     3|Administration As...|    3000.0|    6000.0|
|     4|           President|   20000.0|   40000.0|
|     5|Administration Vi...|   15000.0|   30000.0|
|     6|          Accountant|    4200.0|    9000.0|
|     7|     Finance Manager|    8200.0|   16000.0|
|     8|Human Resources R...|    4000.0|    9000.0|
|     9|          Programmer|    4000.0|   10000.0|
|    10|   Marketing Manager|    9000.0|   15000.0|
|    11|Marketing Represe...|    4000.0|    9000.0|
|    12|Public Relations ...|    4500.0|   10500.0|
|    13|    Purchasing Clerk|    2500.0|    5500.0|
|    14|  Purchasing Manager|    8000.0|   15000.0|
|    15|       Sales Manager|   10000.0|   20000.0|
|    16|Sales Representative|    6000.0|   12000.0|
|    17|    

In [42]:
# Enrich each employee with the title and salary band of their job.
# 'jod_title' is spelled exactly as the column appears in the jobs data.
condicao = df_employees["job_id"] == df_jobs["job_id"]
df_jobs_joined = (
    df_jobs
    .join(df_employees, condicao, 'inner')
    .select(
        df_employees['*'],
        df_jobs['jod_title'],
        df_jobs['min_salary'],
        df_jobs['max_salary'],
    )
)

In [43]:
# Display the employees with their job details (up to the default 20 rows).
df_jobs_joined.show(n=20)

+-----------+-----------+----------+--------------------+------------+----------+------+-------+----------+-------------+--------------------+----------+----------+
|employee_id| first_name| last_name|               email|phone_number| hire_date|job_id| salary|manager_id|department_id|           jod_title|min_salary|max_salary|
+-----------+-----------+----------+--------------------+------------+----------+------+-------+----------+-------------+--------------------+----------+----------+
|        100|     Steven|      King|steven.king@sqltu...|515.123.4567|1987-06-17|     4|24000.0|      NULL|            9|           President|   20000.0|   40000.0|
|        101|      Neena|   Kochhar|neena.kochhar@sql...|515.123.4568|1989-09-21|     5|17000.0|       100|            9|Administration Vi...|   15000.0|   30000.0|
|        102|        Lex|   De Haan|lex.de haan@sqltu...|515.123.4569|1993-01-13|     5|17000.0|       100|            9|Administration Vi...|   15000.0|   30000.0|
|        1

In [44]:
# Compute several aggregates per Indicator in one agg() call.
# The Spark functions are reached through the F alias imported at the top of
# the notebook. Importing sum and max by bare name from pyspark.sql.functions
# would overwrite Python's built-in sum() and max() for every cell run afterwards.
unPivotDF.groupBy('Indicator').agg(
    F.sum('Amount').alias('sum'),
    F.mean('Amount').alias('mean'),
).show()

+---------+------+-----+
|Indicator|   sum| mean|
+---------+------+-----+
|   Profit|1200.0|100.0|
|  Revenue|6732.0|612.0|
+---------+------+-----+

