# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# `spark` is the SparkSession exposed by the GlueContext.
spark = glueContext.spark_session
# A Job handle is created here; none of the cells visible in this notebook
# go on to call job.init() or job.commit().
job = Job(glueContext)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, explode
from pyspark.sql.types import IntegerType, StructType, StructField, StringType, ArrayType

In [None]:
# NOTE(review): `spark` is already bound from glueContext.spark_session in the
# session setup cell. getOrCreate() presumably hands back that same existing
# session, in which case the appName given here has no practical effect --
# confirm, and consider dropping this cell.
spark = SparkSession.builder.appName("dataeng-modulo-2").getOrCreate()

In [None]:
# Sample records: one tuple per student, holding the student's name and a
# list of {"curso": <course name>, "nota": <grade>} entries.
data = [
    (
        "João",
        [
            {"curso": "Matemática", "nota": 85},
            {"curso": "História", "nota": 90},
        ],
    ),
    (
        "Maria",
        [
            {"curso": "Matemática", "nota": 95},
            {"curso": "História", "nota": 80},
        ],
    ),
]

In [None]:
# Schema for the sample rows: a student name plus an array of
# (course, grade) structs. Every field is declared nullable.
schema = StructType(
    fields=[
        StructField(name="nome", dataType=StringType(), nullable=True),
        StructField(
            name="cursos",
            dataType=ArrayType(
                elementType=StructType(
                    fields=[
                        StructField(name="curso", dataType=StringType(), nullable=True),
                        StructField(name="nota", dataType=IntegerType(), nullable=True),
                    ]
                )
            ),
            nullable=True,
        ),
    ]
)

In [None]:
# Build the base DataFrame from the in-memory rows using the explicit schema.
df = spark.createDataFrame(data=data, schema=schema)


In [None]:
# Print the DataFrame without truncating long cell values, so the nested
# `cursos` array is shown in full.
df.show(truncate=False)

# 2. Explodindo o Array para Linhas Individuais

In [None]:
# Turn each element of the `cursos` array into its own row, then flatten the
# resulting struct so the output has the columns nome, curso and nota.
df_exploded = (
    df
    .select("nome", explode(col("cursos")).alias("curso"))
    .select("nome", col("curso.curso"), col("curso.nota"))
)
df_exploded.show()

# 3. Definindo uma UDF para Calcular um Bônus na Nota


In [None]:
@udf(IntegerType())
def calcular_bonus(nota):
    """Return the grade with a fixed 5-point bonus added.

    Registered as a Spark UDF that yields an integer column.

    Args:
        nota: The grade for one row, or None when the value is null.

    Returns:
        ``nota + 5``, or None when ``nota`` is None.
    """
    # The schema declares `nota` as nullable; without this guard a null grade
    # would raise TypeError (None + 5) inside the Python worker and fail the job.
    if nota is None:
        return None
    return nota + 5

In [None]:
# Add a `nota_bonus` column holding the result of the bonus UDF applied to
# each row's grade.
df_bonus = df_exploded.withColumn(
    "nota_bonus",
    calcular_bonus(col("nota")),
)
df_bonus.show()


# 4. Aplicação de Pivot

In [None]:
# One row per student and one column per distinct course; each cell holds the
# maximum bonus-adjusted grade for that student/course pair.
df_pivot_bonus = (
    df_bonus
    .groupBy("nome")
    .pivot("curso")
    .agg({"nota_bonus": "max"})
)
df_pivot_bonus.show()

# 5. Rollup para Agregações Hierárquicas


In [None]:
# Average grade rolled up over (nome, curso), sorted by the two grouping
# columns.
df_rollup = (
    df_exploded
    .rollup("nome", "curso")
    .agg({"nota": "avg"})
    .orderBy("nome", "curso")
)
df_rollup.show()

# 6. Cube para Agregações Multidimensionais


In [None]:
# Average grade cubed over (nome, curso), sorted by the two grouping columns.
df_cube = (
    df_exploded
    .cube("nome", "curso")
    .agg({"nota": "avg"})
    .orderBy("nome", "curso")
)
df_cube.show()

# Encerrando a SparkSession


In [None]:
# Stop the SparkSession; cells that use `spark` cannot be re-run after this
# without starting it again.
# NOTE(review): this presumably does not end the Glue interactive session
# itself, which may stay alive until the %idle_timeout configured in the setup
# cell (2880) expires -- confirm whether the %stop_session magic should be used
# here instead.
spark.stop()