In [99]:
import os
import re
import subprocess

from pyspark.sql import SparkSession, dataframe, Row
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType
from pyspark.sql import HiveContext
from pyspark.sql.functions import *
from pyspark.sql import functions as f
from pyspark.sql.functions import col,trim,ltrim,rtrim,when,regexp_replace,concat_ws, lit, sha2

In [100]:
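#Spark session with Hive support.
#NOTE(review): kept commented out, so `spark` is presumably injected by the notebook kernel - uncomment when running on a kernel that does not provide it.
#spark = SparkSession.builder.master("local[*]")\
#    .enableHiveSupport()\
#    .getOrCreate()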

In [101]:
# Start of the data treatment: load every column of desafio_curso.tbl_clientes.
df_clientes = spark.sql("select * from desafio_curso.tbl_clientes")

In [102]:
# Convert the code/identifier columns of the customers table to integers.
for nome_coluna in ("address_number", "business_unit", "customerkey", "division", "region_code"):
    df_clientes = df_clientes.withColumn(nome_coluna, col(nome_coluna).cast(IntegerType()))

In [103]:
# Replace blank line_of_business values (empty or made only of spaces) with the
# 'N/I' placeholder. The whole value is tested after trimming, so a run of
# spaces inside or after a real value is never rewritten, and blanks of any
# width collapse to a single 'N/I'. Nulls are left as they are.
df_clientes = df_clientes.withColumn(
    'line_of_business',
    when(trim(col('line_of_business')) == '', lit('N/I')).otherwise(col('line_of_business'))
)

In [104]:
# Row count of the customers DataFrame at this point.
df_clientes.count()

684

In [105]:
# Drop fully duplicated rows, then count again to compare with the previous total.
df_clientes = df_clientes.dropDuplicates()
df_clientes.count()

684

In [106]:
# Inspect the customers schema after the integer casts.
df_clientes.printSchema()

root
 |-- address_number: integer (nullable = true)
 |-- business_family: string (nullable = true)
 |-- business_unit: integer (nullable = true)
 |-- customer: string (nullable = true)
 |-- customerkey: integer (nullable = true)
 |-- customer_type: string (nullable = true)
 |-- division: integer (nullable = true)
 |-- line_of_business: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- region_code: integer (nullable = true)
 |-- regional_sales_mgr: string (nullable = true)
 |-- search_type: string (nullable = true)
 |-- dt_foto: string (nullable = true)



In [107]:
# Register the customers DataFrame as the temp view 'tb_clientes' for use in Spark SQL.
df_clientes.createOrReplaceTempView('tb_clientes')

In [108]:
# Load every column of desafio_curso.tbl_divisao.
df_divisao = spark.sql("select * from desafio_curso.tbl_divisao")

In [109]:
# Convert the division code to integer.
df_divisao = df_divisao.withColumn("division", df_divisao["division"].cast(IntegerType()))

In [110]:
# Register the divisions DataFrame as the temp view 'tb_divisao' for use in Spark SQL.
df_divisao.createOrReplaceTempView('tb_divisao')

In [111]:
# Load every column of desafio_curso.tbl_endereco.
df_endereco = spark.sql("select * from desafio_curso.tbl_endereco")

In [112]:
# Inspect the raw address schema before any cast.
df_endereco.printSchema()

root
 |-- address_number: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- customer_address_1: string (nullable = true)
 |-- customer_address_2: string (nullable = true)
 |-- customer_address_3: string (nullable = true)
 |-- customer_address_4: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- dt_foto: string (nullable = true)



In [113]:
# Convert the address number from string to integer.
df_endereco = df_endereco.withColumn("address_number", df_endereco["address_number"].cast(IntegerType()))

In [114]:
# Normalize blank text fields of the address table to the 'N/I' placeholder.
# Each string column is tested as a whole after trimming spaces, so empty and
# space-only values of any width become exactly 'N/I', while values that merely
# contain a run of spaces are left untouched. Non-string columns pass through.
colunas_texto = [nome for nome, tipo in df_endereco.dtypes if tipo == "string"]
df_endereco = df_endereco.select([
    when(trim(col(c)) == "", lit("N/I")).otherwise(col(c)).alias(c) if c in colunas_texto else col(c)
    for c in df_endereco.columns
])
# Remaining nulls in string columns get the same placeholder.
df_endereco = df_endereco.na.fill("N/I")

In [115]:
# Register the address DataFrame as the temp view 'tb_endereco' for use in Spark SQL.
df_endereco.createOrReplaceTempView('tb_endereco')

In [116]:
# Load every column of desafio_curso.tbl_regiao.
df_regiao = spark.sql("select * from desafio_curso.tbl_regiao")

In [117]:
# Inspect the raw region schema before any cast.
df_regiao.printSchema()

root
 |-- region_code: string (nullable = true)
 |-- region_name: string (nullable = true)
 |-- dt_foto: string (nullable = true)



In [118]:
# Convert the region code from string to integer.
df_regiao = df_regiao.withColumn("region_code", df_regiao["region_code"].cast(IntegerType()))

In [119]:
# Register the region DataFrame as the temp view 'tb_regiao' for use in Spark SQL.
df_regiao.createOrReplaceTempView('tb_regiao')

In [120]:
# Load every column of desafio_curso.tbl_vendas.
df_vendas = spark.sql("select * from desafio_curso.tbl_vendas")

In [121]:
# Inspect the raw sales schema before any cast.
df_vendas.printSchema()

root
 |-- actual_delivery_date: string (nullable = true)
 |-- customerkey: string (nullable = true)
 |-- datekey: string (nullable = true)
 |-- discount_amount: string (nullable = true)
 |-- invoice_date: string (nullable = true)
 |-- invoice_number: string (nullable = true)
 |-- item_class: string (nullable = true)
 |-- item_number: string (nullable = true)
 |-- item: string (nullable = true)
 |-- line_number: string (nullable = true)
 |-- list_price: string (nullable = true)
 |-- order_number: string (nullable = true)
 |-- promised_delivery_date: string (nullable = true)
 |-- sales_amount: string (nullable = true)
 |-- sales_amount_based_on_list_price: string (nullable = true)
 |-- sales_cost_amount: string (nullable = true)
 |-- sales_margin_amount: string (nullable = true)
 |-- sales_price: string (nullable = true)
 |-- sales_quantity: string (nullable = true)
 |-- sales_rep: string (nullable = true)
 |-- u_m: string (nullable = true)
 |-- dt_foto: string (nullable = true)



In [122]:
# Convert the sales columns from string to their working types:
# identifiers and quantities to integer, monetary measures to double.
df_vendas = (
    df_vendas
    .withColumn("customerkey", col("customerkey").cast(IntegerType()))
    .withColumn("discount_amount", col("discount_amount").cast(DoubleType()))
    .withColumn("invoice_number", col("invoice_number").cast(IntegerType()))
    .withColumn("item_number", col("item_number").cast(IntegerType()))
    # line_number is cast from its own column; reading item_number here would
    # silently overwrite every line number with a copy of the item number.
    .withColumn("line_number", col("line_number").cast(IntegerType()))
    .withColumn("list_price", col("list_price").cast(DoubleType()))
    .withColumn("order_number", col("order_number").cast(IntegerType()))
    .withColumn("sales_amount", col("sales_amount").cast(DoubleType()))
    .withColumn("sales_amount_based_on_list_price", col("sales_amount_based_on_list_price").cast(DoubleType()))
    .withColumn("sales_cost_amount", col("sales_cost_amount").cast(DoubleType()))
    .withColumn("sales_margin_amount", col("sales_margin_amount").cast(DoubleType()))
    .withColumn("sales_price", col("sales_price").cast(DoubleType()))
    .withColumn("sales_quantity", col("sales_quantity").cast(IntegerType()))
    .withColumn("sales_rep", col("sales_rep").cast(IntegerType()))
)

# Reorder the columns and re-express the four date columns, parsed with the
# 'dd/MM/yyyy' pattern, as 'yyyy-MM-dd HH:mm:ss' strings (from_unixtime default).
df_vendas = df_vendas.select('discount_amount',
                             'invoice_number',
                             'item_class',
                             'item_number',
                             'item',
                             'line_number',
                             'list_price',
                             'order_number',
                             'sales_amount',
                             'sales_amount_based_on_list_price',
                             'sales_cost_amount',
                             'sales_margin_amount',
                             'sales_price',
                             'sales_quantity',
                             'sales_rep',
                             'u_m',
                             'customerkey',
                             'dt_foto',
                             from_unixtime(unix_timestamp('actual_delivery_date', 'dd/MM/yyyy')).alias('actual_delivery_date'),
                             from_unixtime(unix_timestamp('invoice_date', 'dd/MM/yyyy')).alias('invoice_date'),
                             from_unixtime(unix_timestamp('promised_delivery_date', 'dd/MM/yyyy')).alias('promised_delivery_date'),
                             from_unixtime(unix_timestamp('datekey', 'dd/MM/yyyy')).alias('datekey')
                            )


In [123]:
# Handle missing values: numeric nulls become 0, empty strings become null,
# and every remaining null string is labelled 'N/I'.
df_vendas = df_vendas.na.fill(value=0)
df_vendas = df_vendas.select(
    [when(col(nome) == "", None).otherwise(col(nome)).alias(nome) for nome in df_vendas.columns]
).na.fill("N/I")

# Convert the four date columns to timestamps.
for coluna_data in ("datekey", "promised_delivery_date", "invoice_date", "actual_delivery_date"):
    df_vendas = df_vendas.withColumn(coluna_data, to_timestamp(col(coluna_data)))

In [124]:
# Inspect the sales schema after the casts and timestamp conversion.
df_vendas.printSchema()

root
 |-- discount_amount: double (nullable = true)
 |-- invoice_number: integer (nullable = true)
 |-- item_class: string (nullable = false)
 |-- item_number: integer (nullable = true)
 |-- item: string (nullable = false)
 |-- line_number: integer (nullable = true)
 |-- list_price: double (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- sales_amount: double (nullable = true)
 |-- sales_amount_based_on_list_price: double (nullable = true)
 |-- sales_cost_amount: double (nullable = true)
 |-- sales_margin_amount: double (nullable = true)
 |-- sales_price: double (nullable = true)
 |-- sales_quantity: integer (nullable = true)
 |-- sales_rep: integer (nullable = true)
 |-- u_m: string (nullable = false)
 |-- customerkey: integer (nullable = true)
 |-- dt_foto: string (nullable = false)
 |-- actual_delivery_date: timestamp (nullable = true)
 |-- invoice_date: timestamp (nullable = true)
 |-- promised_delivery_date: timestamp (nullable = true)
 |-- datekey: timestamp (nu

In [125]:
# Register the sales DataFrame as the temp view 'tb_vendas' for use in Spark SQL.
df_vendas.createOrReplaceTempView('tb_vendas')

In [126]:
# Build the wide staging query joining all the data.
# Sales rows are inner-joined to customers, regions and divisions (rows without
# a match are dropped); addresses are left-joined, so a customer without an
# address row is kept with null address fields.
query='''
SELECT    c.customerkey
          ,c.customer
          ,c.customer_type
          ,c.business_family
          ,c.business_unit
          ,c.division
          ,d.division_name
          ,c.line_of_business
          ,c.phone
          ,c.region_code
          ,r.region_name
          ,c.regional_sales_mgr
          ,c.search_type
          ,v.datekey
          ,v.actual_delivery_date
          ,v.discount_amount
          ,v.invoice_date
          ,v.invoice_number
          ,v.item_class
          ,v.item_number
          ,v.item
          ,v.line_number
          ,v.list_price
          ,v.order_number
          ,v.promised_delivery_date
          ,v.sales_amount
          ,v.sales_amount_based_on_list_price
          ,v.sales_cost_amount
          ,v.sales_margin_amount
          ,v.sales_price
          ,v.sales_quantity
          ,v.sales_rep
          ,v.u_m
          ,e.address_number
          ,e.city
          ,e.country
          ,e.customer_address_1
          ,e.customer_address_2
          ,e.customer_address_3
          ,e.customer_address_4
          ,e.state
          ,e.zip_code
          ,e.dt_foto
FROM      tb_vendas v
          INNER JOIN tb_clientes c ON v.customerkey == c.customerkey
          INNER JOIN tb_regiao r ON c.region_code == r.region_code
          INNER JOIN tb_divisao d ON c.division == d.division
          LEFT JOIN tb_endereco e ON c.address_number == e.address_number
'''

In [127]:
# Run the staging query against the registered temp views.
df_stage = spark.sql(query)

In [128]:
# Inspect the schema of the joined staging DataFrame.
df_stage.printSchema()

root
 |-- customerkey: integer (nullable = true)
 |-- customer: string (nullable = true)
 |-- customer_type: string (nullable = true)
 |-- business_family: string (nullable = true)
 |-- business_unit: integer (nullable = true)
 |-- division: integer (nullable = true)
 |-- division_name: string (nullable = true)
 |-- line_of_business: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- region_code: integer (nullable = true)
 |-- region_name: string (nullable = true)
 |-- regional_sales_mgr: string (nullable = true)
 |-- search_type: string (nullable = true)
 |-- datekey: timestamp (nullable = true)
 |-- actual_delivery_date: timestamp (nullable = true)
 |-- discount_amount: double (nullable = true)
 |-- invoice_date: timestamp (nullable = true)
 |-- invoice_number: integer (nullable = true)
 |-- item_class: string (nullable = false)
 |-- item_number: integer (nullable = true)
 |-- item: string (nullable = false)
 |-- line_number: integer (nullable = true)
 |-- list_price:

In [129]:
# Derive year, month, day of month and quarter from the invoice date.
for nome_parte, extrair in (('Ano', year), ('Mes', month), ('Dia', dayofmonth), ('Trimestre', quarter)):
    df_stage = df_stage.withColumn(nome_parte, extrair(col('invoice_date')))

# Turn empty strings into nulls, then label every null string value as 'N/I'.
df_stage = df_stage.select(
    [when(col(nome) == "", None).otherwise(col(nome)).alias(nome) for nome in df_stage.columns]
).na.fill("N/I")

In [130]:
# Inspect the staging schema after adding the calendar columns and filling blanks.
df_stage.printSchema()

root
 |-- customerkey: integer (nullable = true)
 |-- customer: string (nullable = false)
 |-- customer_type: string (nullable = false)
 |-- business_family: string (nullable = false)
 |-- business_unit: integer (nullable = true)
 |-- division: integer (nullable = true)
 |-- division_name: string (nullable = false)
 |-- line_of_business: string (nullable = false)
 |-- phone: string (nullable = false)
 |-- region_code: integer (nullable = true)
 |-- region_name: string (nullable = false)
 |-- regional_sales_mgr: string (nullable = false)
 |-- search_type: string (nullable = false)
 |-- datekey: timestamp (nullable = true)
 |-- actual_delivery_date: timestamp (nullable = true)
 |-- discount_amount: double (nullable = true)
 |-- invoice_date: timestamp (nullable = true)
 |-- invoice_number: integer (nullable = true)
 |-- item_class: string (nullable = false)
 |-- item_number: integer (nullable = true)
 |-- item: string (nullable = false)
 |-- line_number: integer (nullable = true)
 |-- li

In [131]:
# Generate the SHA-256 surrogate keys used by the DW tables.
# Each key hashes its source columns joined with '|'.
df_stage = (
    df_stage
    .withColumn('key_cliente', f.sha2(f.col("customerkey").cast(StringType()), 256))
    .withColumn('key_tempo', f.sha2(f.concat_ws('|', 'invoice_date', 'Ano', 'Mes', 'Dia'), 256))
    .withColumn('key_localidade', f.sha2(f.concat_ws('|', 'division', 'region_code', 'address_number'), 256))
)

In [132]:
# Register the staging DataFrame as the temp view 'tb_stage' for the dimension/fact queries.
df_stage.createOrReplaceTempView('tb_stage')

In [133]:
# Preview the staging DataFrame (show() prints the first 20 rows by default).
df_stage.show()

+-----------+-------------+-------------+---------------+-------------+--------+-------------+----------------+------------+-----------+-------------+------------------+-----------+-------------------+--------------------+---------------+-------------------+--------------+----------+-----------+--------------------+-----------+----------+------------+----------------------+------------+--------------------------------+-----------------+-------------------+-----------+--------------+---------+---+--------------+----+-------+------------------+------------------+------------------+------------------+-----+--------+-------+----+---+---+---------+--------------------+--------------------+--------------------+
|customerkey|     customer|customer_type|business_family|business_unit|division|division_name|line_of_business|       phone|region_code|  region_name|regional_sales_mgr|search_type|            datekey|actual_delivery_date|discount_amount|       invoice_date|invoice_number|item_class|i

In [134]:
# DIM_CLIENTES: customer dimension - the distinct combinations of the customer
# key and its descriptive attributes found in tb_stage.
dim_clientes = spark.sql('''
    SELECT DISTINCT key_cliente
        ,business_family
        ,customer 
        ,customer_type 
        ,line_of_business
        ,regional_sales_mgr
        ,search_type
    FROM tb_stage    
''')

In [140]:
# Build the time dimension: the distinct combinations of the time key, the
# invoice date and its derived calendar parts found in tb_stage.
dim_tempo = spark.sql('''
    SELECT DISTINCT key_tempo
        ,invoice_date
        ,Ano 
        ,Mes 
        ,Dia
        ,Trimestre
    FROM tb_stage    
''')

# Keep only the date part of invoice_date in the dimension.
dim_tempo = dim_tempo.withColumn('invoice_date',to_date('invoice_date'))

+--------------------+------------+----+---+---+---------+
|           key_tempo|invoice_date| Ano|Mes|Dia|Trimestre|
+--------------------+------------+----+---+---+---------+
|af1d8d847c7392684...|  2017-11-15|2017| 11| 15|        4|
|ab481df23dbaca220...|  2017-05-18|2017|  5| 18|        2|
|1d96767689980f9be...|  2018-05-20|2018|  5| 20|        2|
|272b0a31a575e7303...|  2018-10-07|2018| 10|  7|        4|
|9885b50a9c653332d...|  2017-03-26|2017|  3| 26|        1|
|5b91624741fa462cb...|  2017-08-03|2017|  8|  3|        3|
|073cfdabd303c6404...|  2017-09-01|2017|  9|  1|        3|
|4dbb2767bc9fd7194...|  2018-04-02|2018|  4|  2|        2|
|d232970174946dbcb...|  2018-02-15|2018|  2| 15|        1|
|2b1713e1d7ab07a57...|  2018-11-14|2018| 11| 14|        4|
|0ccdf0cbae67895e8...|  2017-06-04|2017|  6|  4|        2|
|fb01884f3c351c217...|  2018-03-26|2018|  3| 26|        1|
|68972f41b22cc8cd9...|  2018-11-12|2018| 11| 12|        4|
|2ad3453dbd4dc4231...|  2018-01-01|2018|  1|  1|        

In [136]:
# Build the location dimension: the distinct combinations of the location key
# and its division/region/address attributes found in tb_stage.
dim_localidade = spark.sql('''
    SELECT DISTINCT key_localidade
        ,division_name
        ,region_name 
        ,country 
        ,state
        ,city
        ,zip_code
    FROM tb_stage    
''')

In [137]:
# Build the sales fact table: one row per (customer, time, location) key triple
# with the number of distinct invoices, total quantity, revenue, cost and
# revenue minus cost. GROUP BY already yields one row per key triple, so no
# DISTINCT is needed on the result.
ft_vendas = spark.sql('''
    SELECT key_cliente
        ,key_tempo
        ,key_localidade
        ,count(distinct invoice_number) qty_vendas
        ,sum(sales_quantity) quantity
        ,sum(sales_amount) amount
        ,sum(sales_cost_amount) cost
        ,sum(sales_amount - sales_cost_amount) total_amount
    FROM tb_stage    
    GROUP BY key_cliente
        ,key_tempo
        ,key_localidade
''')

In [147]:
def criar_csv(df, name, hdfs_dir="/datalake/gold", local_dir="/input/gold"):
    """Write `df` as a single ';'-separated CSV on HDFS and copy it locally.

    The DataFrame is coalesced to one partition and saved (overwriting) under
    ``<hdfs_dir>/<name>``; the resulting ``*.csv`` part file is then fetched
    with ``hdfs dfs -get`` to ``<local_dir>/<name>.csv``.

    Args:
        df: DataFrame to export.
        name: Dataset name; used as the HDFS sub-directory and local file stem.
        hdfs_dir: HDFS base directory for the export.
        local_dir: Local directory receiving the CSV (created if missing).

    Raises:
        subprocess.CalledProcessError: if the `hdfs dfs -get` copy fails.
        FileNotFoundError: if the `hdfs` executable is not available.
    """
    hdfs_path = hdfs_dir.rstrip("/") + "/" + name
    local_path = os.path.join(local_dir, name + ".csv")

    df.coalesce(1).write\
        .format('csv')\
        .option('header',True)\
        .mode('overwrite')\
        .option('sep',';')\
        .save(hdfs_path)

    # `hdfs dfs -get` does not overwrite an existing local file, so a previous
    # export must be removed first or a re-run would keep the stale CSV.
    os.makedirs(local_dir, exist_ok=True)
    if os.path.exists(local_path):
        os.remove(local_path)

    # Argument list (no shell): the glob is expanded by the HDFS client itself,
    # and check=True surfaces a failed copy instead of ignoring it.
    subprocess.run(["hdfs", "dfs", "-get", hdfs_path + "/*.csv", local_path], check=True)
    
# Export the three dimensions and the fact table, one CSV each.
for nome_arquivo, tabela in (
    ('dim_tempo', dim_tempo),
    ('dim_localidade', dim_localidade),
    ('dim_clientes', dim_clientes),
    ('ft_vendas', ft_vendas),
):
    criar_csv(tabela, nome_arquivo)
