In [5]:
#Cria sessão Spark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.functions as F
import pandas as pd
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from pyspark.sql.window import Window


In [2]:
#Instância Spark
spark = SparkSession.builder.appName('Curso Pyspark') \
         .config('spark.sql.repl.eagerEval.enabled', True) \
         .getOrCreate()      

In [3]:
# lendo um arquivos parquet e atribuindo a variaveis
compras = spark.read.parquet('./DATASETS/COMPRAS.parquet').select('id', 'data', 'cd_livro', 'cd_cliente').alias('compras')
livros = spark.read.parquet('./DATASETS/LIVROS.parquet').select('id', 'data_lancamento', 'preco').alias('livros')
autores = spark.read.parquet('./DATASETS/AUTORES.parquet').alias('autores')


In [4]:
df = compras.join(livros, compras.cd_livro == livros.id).join(autores, livros.id == autores.id).drop('livros.id', 'autores.id')
df

id,data,cd_livro,cd_cliente,id.1,data_lancamento,preco,id.2,titulo,autor
12389,2021-07-24,30334762,3339828,30334762,2013-05-08,123.47,30334762,Em Busca do Tempo...,Marcel Proust
12534,2021-07-15,14347542,7799936,14347542,2007-04-05,182.43,14347542,Fogo Morto,José Lins do Rego
12574,2020-05-19,10325500,6273720,10325500,2000-09-09,30.62,10325500,A Obscena Senhora D,Hilda Hilst
12675,2021-07-11,35940339,649001,35940339,2014-08-11,213.6,35940339,Adeus às Armas,Ernest Hemingway
13457,2021-10-26,21006591,4312106,21006591,2019-02-24,96.84,21006591,Paulicéia Desvair...,Mário de Andrade
13478,2020-05-10,19488257,670483,19488257,2013-05-19,17.11,19488257,O Ódio que Você S...,Angie Thomas
13562,2021-10-28,35940339,1275633,35940339,2014-08-11,213.6,35940339,Adeus às Armas,Ernest Hemingway
13679,2020-10-19,30144651,1261140,30144651,2011-04-14,242.2,30144651,Zero,Ignácio De Loyola...
13687,2020-05-29,21923195,4098904,21923195,2015-02-22,38.89,21923195,O Ex-Mágico,Murilo Rubião
13796,2021-10-15,16778973,4024706,16778973,2017-06-23,99.81,16778973,Crime e Castigo,Fiódor Dostoiévski


In [15]:
# ordena por ids
window1 = Window.orderBy('compras.id')
# ordena por ordem de compras (cada vez que o cliente mudar a contagem recomeca)
window2 = Window.partitionBy('cd_cliente').orderBy('data')
# para cada autor saber a ordem de lancamentos do livro
window3 = Window.partitionBy('autor').orderBy('data_lancamento')


In [20]:
(
        df
        .withColumn('num_linha', F.row_number().over(window1))
        .withColumn('ordem_compra', F.row_number().over(window2))
        .dropDuplicates(['cd_livro', 'autor'])
        .withColumn('ordem_lancamento', F.row_number().over(window3))
        # O total acumulado de valor de compras acumulado pelos clientes
        .withColumn('total_acumulado_cliente', F.round(F.sum('preco').over(window2), 2))
)

id,data,cd_livro,cd_cliente,id.1,data_lancamento,preco,id.2,titulo,autor,num_linha,ordem_compra,ordem_lancamento,total_acumulado_cliente
184365,2020-10-31,100520231,1010444,100520231,2009-08-17,35.55,100520231,Galáxias,Haroldo de Campos,3330,1,1,35.55
162408,2021-06-16,36059407,1010444,36059407,2021-01-20,222.98,36059407,As Histórias Comp...,Franz Kafka,2874,2,2,258.53
629783,2021-09-14,16581063,1010444,16581063,2013-05-16,145.25,16581063,Macunaíma – O Her...,Mário de Andrade,11521,3,1,403.78
594608,2021-10-11,30099528,1010444,30099528,2000-02-06,208.39,30099528,Retrato do Artist...,James Joyce,10824,4,1,612.17
561293,2021-11-02,30144651,1010444,30144651,2011-04-14,242.2,30144651,Zero,Ignácio De Loyola...,10118,5,1,854.37
648950,2022-01-15,22112497,1010444,22112497,2021-01-06,10.84,22112497,Mrs Dalloway,Virginia Woolf,11896,6,1,865.21
184567,2022-03-29,20414016,1010444,20414016,2011-06-09,212.1,20414016,Vidas Secas,Graciliano Ramos,3335,7,1,1077.31
26947,2022-05-04,36030824,1010444,36030824,2005-01-26,226.36,36030824,Tremor de Terra,Luiz Vilela,317,8,1,1303.67
472081,2022-06-19,26925428,1010444,26925428,2011-09-04,193.19,26925428,"Sing, Unburied, S...",Jesmyn Ward,8520,9,1,1496.86
27189,2022-06-26,12489208,1010444,12489208,2011-10-30,160.53,12489208,Triste Fim de Pol...,Lima Barreto,322,10,1,1657.39
