## Clube do Livro - Spark: The Definitive Guide
### Episódio 5

In [0]:
# Retail data for 2010-12-01, read from the /databricks-datasets mount.
path = "/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv"

# Read the CSV with its header row as column names and let Spark infer the
# column types (see the printed schema below).
df = spark.read.csv(path, header=True, inferSchema=True)

df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [0]:
from pyspark.sql.functions import lit, col

# Two constant columns built from literals. Every row of df receives the same
# pair of values, so distinct() collapses the result to a single row.
literal_columns = [
    lit(1).alias('id'),
    lit('cabreira').alias('Nome'),
]

df.select(*literal_columns).distinct().display()



id,Nome
1,cabreira


In [0]:
from pyspark.sql.functions import col, instr

# Earlier exploration, kept disabled for reference:
#display(
#    df.selectExpr('InvoiceNo', 'Description')
#        .where("InvoiceNo = 536365")
#)

# UnitPrice is inferred as a double, so it is compared with a numeric literal.
# The previous string literal '600' only worked through an implicit cast.
priceFilter = col('UnitPrice') > 600

# instr returns the 1-based position of the substring (0 when it is absent),
# so ">= 1" keeps the rows whose Description contains "POSTAGE".
descriptionFilter = instr(col('Description'), "POSTAGE") >= 1

# Keep a row when either condition holds.
display(
    df.selectExpr(
        'InvoiceNo',
        'Description',
        'UnitPrice'
    )
    .where(priceFilter | descriptionFilter)
)





InvoiceNo,Description,UnitPrice
536370,POSTAGE,18.0
536403,POSTAGE,15.0
536527,POSTAGE,18.0
536544,DOTCOM POSTAGE,569.77
536592,DOTCOM POSTAGE,607.49


In [0]:
# Summary statistics (count, mean, stddev, min, max) for the columns of df.
summary_stats = df.describe()
display(summary_stats)

summary,InvoiceNo,StockCode,Description,Quantity,UnitPrice,CustomerID,Country
count,3108,3108,3098,3108.0,3108.0,1968.0,3108
mean,536516.684944841,27834.304044117645,,8.627413127413128,4.151946589446603,15661.388719512195,
stddev,72.89447869788873,17407.897548583845,,26.371821677029203,15.638659854603892,1854.449699689363,
min,536365,10002,4 PURPLE FLOCK DINNER CANDLES,-24.0,0.0,12431.0,Australia
max,C536548,POST,ZINC WILLIE WINKIE CANDLE STICK,600.0,607.49,18229.0,United Kingdom


In [0]:
from pyspark.sql.functions import monotonically_increasing_id

# Add a generated 'id' column, then show it next to InvoiceNo for ten rows.
with_row_id = df.withColumn('id', monotonically_increasing_id())
id_preview = with_row_id.select('id', 'InvoiceNo').limit(10)

display(id_preview)

id,InvoiceNo
0,536365
1,536365
2,536365
3,536365
4,536365
5,536365
6,536365
7,536366
8,536366
9,536367


In [0]:
# InvoiceNo is a string column, so the filter compares it with a string
# literal. Comparing with the bare number 536365 made Spark cast every
# InvoiceNo value (including non-numeric ones such as 'C536548') first.
# The filter also runs before the projection, because the projection below
# does not keep InvoiceNo.
first_invoice_rows = df.where("InvoiceNo = '536365'")

# initCap capitalises each word of Description; the three trim variants are
# applied to fixed literals so their effect is visible in every row.
display(
    first_invoice_rows.selectExpr(
        'Description',
        'initCap(Description) AS newCol',
        'ltrim("  LTRIM.  ") AS ltrim',
        'rtrim(".    RTRIM.   ") AS rtrim',
        'trim(".    TRIM.    ") AS trim'
    )
)

Description,newCol,ltrim,rtrim,trim
WHITE HANGING HEART T-LIGHT HOLDER,White Hanging Heart T-light Holder,LTRIM.,. RTRIM.,. TRIM.
WHITE METAL LANTERN,White Metal Lantern,LTRIM.,. RTRIM.,. TRIM.
CREAM CUPID HEARTS COAT HANGER,Cream Cupid Hearts Coat Hanger,LTRIM.,. RTRIM.,. TRIM.
KNITTED UNION FLAG HOT WATER BOTTLE,Knitted Union Flag Hot Water Bottle,LTRIM.,. RTRIM.,. TRIM.
RED WOOLLY HOTTIE WHITE HEART.,Red Woolly Hottie White Heart.,LTRIM.,. RTRIM.,. TRIM.
SET 7 BABUSHKA NESTING BOXES,Set 7 Babushka Nesting Boxes,LTRIM.,. RTRIM.,. TRIM.
GLASS STAR FROSTED T-LIGHT HOLDER,Glass Star Frosted T-light Holder,LTRIM.,. RTRIM.,. TRIM.


In [0]:
# Alternative string transformations, left disabled; only the regexp_extract
# example at the bottom of this cell runs.
#display(
#    df.selectExpr(
#        "Description",
#        """
#        regexp_replace(
#            Description,
#            'BLACK|WHITE|RED',
#            'COLOR'
#        ) AS newDescription"""
#    )
#    .where("InvoiceNo = '536365'")
#)


#display(
#    df.selectExpr(
#        "Description",
#        """translate(
#            Description,
#            'LEET',
#            '1337'
#        ) AS newDescription"""
#    ).where("InvoiceNo = '536365'")
#)

# InvoiceNo is a string column, so it is compared with a string literal, and
# the filter runs before the projection because the projection drops InvoiceNo.
regex_demo_rows = df.where("InvoiceNo = '536365'")

# Pull the first colour word (BLACK, WHITE or RED) out of Description; rows
# without any of those words come back with an empty value.
display(
    regex_demo_rows.selectExpr(
        "Description",
        """regexp_extract(
            Description,
            '(BLACK|WHITE|RED)'
        ) AS newDescription"""
    )
)


Description,newDescription
WHITE HANGING HEART T-LIGHT HOLDER,WHITE
WHITE METAL LANTERN,WHITE
CREAM CUPID HEARTS COAT HANGER,
KNITTED UNION FLAG HOT WATER BOTTLE,
RED WOOLLY HOTTIE WHITE HEART.,RED
SET 7 BABUSHKA NESTING BOXES,
GLASS STAR FROSTED T-LIGHT HOLDER,


In [0]:
from pyspark.sql.functions import current_date, current_timestamp

# Ten rows (ids 0-9), each stamped with the current date and the current
# timestamp; used by the date-arithmetic cell that follows.
ten_ids = spark.range(10)
with_today = ten_ids.withColumn('today', current_date())
dateDF = with_today.withColumn('now', current_timestamp())

display(dateDF)

id,today,now
0,2024-12-14,2024-12-14T19:10:03.504Z
1,2024-12-14,2024-12-14T19:10:03.504Z
2,2024-12-14,2024-12-14T19:10:03.504Z
3,2024-12-14,2024-12-14T19:10:03.504Z
4,2024-12-14,2024-12-14T19:10:03.504Z
5,2024-12-14,2024-12-14T19:10:03.504Z
6,2024-12-14,2024-12-14T19:10:03.504Z
7,2024-12-14,2024-12-14T19:10:03.504Z
8,2024-12-14,2024-12-14T19:10:03.504Z
9,2024-12-14,2024-12-14T19:10:03.504Z


In [0]:
# Date arithmetic and parsing, expressed as SQL snippets over dateDF.
date_math = (
    dateDF.selectExpr(
        # Five days before / after 'today'.
        "date_sub(today, 5) AS date_sub",
        "date_add(today, 5) AS date_add",
        # Days between today+5 and today.
        """
        date_diff(
            date_add(today,5),
            today
        ) AS date_diff
        """,
        # Months between today+365 and today.
        """
        months_between(
            date_add(today,365),
            today
        ) AS months_between
        """,
        # Parse a fixed string into a date using an explicit pattern.
        """
        to_date(
            '2024-12-11',
            'yyyy-MM-dd'
        ) AS castWithFormat
        """,
        # Same string and pattern, parsed into a timestamp instead.
        """
        to_timestamp(
            '2024-12-11',
            'yyyy-MM-dd'
        ) AS timestamp
        """
    )
)

display(date_math)

date_sub,date_add,date_diff,months_between,castWithFormat,timestamp
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z
2024-12-09,2024-12-19,5,12.0,2024-12-11,2024-12-11T00:00:00Z


In [0]:
 # One-row query showing the null helpers side by side: a typed null,
 # ifnull with a fallback, nullif on two equal values, and coalesce.
 null_handling_query = """
            SELECT 
                CAST(null AS string)  AS nullCol,
                ifnull(null,"ShowStr") AS ifNull,
                nullif("1","1") AS nullIf,
                coalesce(null,"valor") As coalesce
        """
 display(spark.sql(null_handling_query))

nullCol,ifNull,nullIf,coalesce
,ShowStr,,valor


In [0]:
# Five hand-written rows mixing nulls, empty strings and real values; the
# na.drop / na.fill / na.replace cells below all operate on this frame.
null_samples_query = """
    select 
        null AS col1,
        null AS col2, 
        Null AS col3


    union all

    select 
        "1" as col1, 
        "Cabreira" as col2,
        null AS col3

    union all

    select 
        "2" AS col1,
        "Joao" AS col2, 
        "33" AS col3

    union all

    select  
        "3" as col1, 
        null as col2, 
        "25" as col3

    union all

    select 
        "4" as col1,
        "" as col2,
        "" AS col3


"""

dfnull = spark.sql(null_samples_query)

display(dfnull)

col1,col2,col3
,,
1.0,Cabreira,
2.0,Joao,33.0
3.0,,25.0
4.0,,


In [0]:
# Drop only the rows in which every column is null.
rows_with_some_value = dfnull.na.drop(how='all')
display(rows_with_some_value)

col1,col2,col3
1,Cabreira,
2,Joao,33.0
3,,25.0
4,,


In [0]:
# Drop every row that has a null in at least one column. Empty strings are
# not nulls, so the row made of "" values is kept.
rows_without_nulls = dfnull.na.drop(how='any')
display(rows_without_nulls)

col1,col2,col3
2,Joao,33.0
4,,


In [0]:
# Drop rows that are null in col1 or col2; nulls in col3 are ignored.
display(
    dfnull.na.drop(how='any', subset=['col1', 'col2'])
)

col1,col2,col3
1,Cabreira,
2,Joao,33.0
4,,


In [0]:
# Replace every null with the string 'valor'; empty strings stay untouched.
nulls_filled = dfnull.na.fill(value='valor')
display(nulls_filled)

col1,col2,col3
valor,valor,valor
1,Cabreira,valor
2,Joao,33
3,valor,25
4,,


In [0]:
# Swap empty strings for the word 'blank'; nulls are left as they are.
blanks_replaced = dfnull.na.replace(to_replace=[""], value='blank')
display(blanks_replaced)

col1,col2,col3
,,
1.0,Cabreira,
2.0,Joao,33
3.0,,25
4.0,blank,blank


In [0]:
from pyspark.sql.functions import expr

# Struct column that packs Description and InvoiceNo together.
invoice_struct = expr("""
            struct(
                Description,
                InvoiceNo

            )""")

with_struct = df.withColumn("complex", invoice_struct)

# Show the whole struct and, via dot notation, one of its fields.
display(
    with_struct.select('complex', 'complex.InvoiceNo').limit(10)
)

complex,InvoiceNo
"List(WHITE HANGING HEART T-LIGHT HOLDER, 536365)",536365
"List(WHITE METAL LANTERN, 536365)",536365
"List(CREAM CUPID HEARTS COAT HANGER, 536365)",536365
"List(KNITTED UNION FLAG HOT WATER BOTTLE, 536365)",536365
"List(RED WOOLLY HOTTIE WHITE HEART., 536365)",536365
"List(SET 7 BABUSHKA NESTING BOXES, 536365)",536365
"List(GLASS STAR FROSTED T-LIGHT HOLDER, 536365)",536365
"List(HAND WARMER UNION JACK, 536366)",536366
"List(HAND WARMER RED POLKA DOT, 536366)",536366
"List(ASSORTED COLOUR BIRD ORNAMENT, 536367)",536367


In [0]:
# Array helpers applied to Description split on single spaces.
array_demo = df.selectExpr(
    'Description',
    # The words of Description as an array, and the first of them.
    'split(Description, " ") AS arrayCol',
    'split(Description, " ")[0] AS arrayFirst',
    # How many words there are.
    """
    size(
        split(Description, " ")
        ) AS ArraySize
    """,
    # Whether one of the words is exactly 'WHITE'.
    """
    array_contains(
        split(Description, " "),
        'WHITE'
    ) as containsWhite
    """
).limit(10)

display(array_demo)

Description,arrayCol,arrayFirst,ArraySize,containsWhite
WHITE HANGING HEART T-LIGHT HOLDER,"List(WHITE, HANGING, HEART, T-LIGHT, HOLDER)",WHITE,5,True
WHITE METAL LANTERN,"List(WHITE, METAL, LANTERN)",WHITE,3,True
CREAM CUPID HEARTS COAT HANGER,"List(CREAM, CUPID, HEARTS, COAT, HANGER)",CREAM,5,False
KNITTED UNION FLAG HOT WATER BOTTLE,"List(KNITTED, UNION, FLAG, HOT, WATER, BOTTLE)",KNITTED,6,False
RED WOOLLY HOTTIE WHITE HEART.,"List(RED, WOOLLY, HOTTIE, WHITE, HEART.)",RED,5,True
SET 7 BABUSHKA NESTING BOXES,"List(SET, 7, BABUSHKA, NESTING, BOXES)",SET,5,False
GLASS STAR FROSTED T-LIGHT HOLDER,"List(GLASS, STAR, FROSTED, T-LIGHT, HOLDER)",GLASS,5,False
HAND WARMER UNION JACK,"List(HAND, WARMER, UNION, JACK)",HAND,4,False
HAND WARMER RED POLKA DOT,"List(HAND, WARMER, RED, POLKA, DOT)",HAND,5,False
ASSORTED COLOUR BIRD ORNAMENT,"List(ASSORTED, COLOUR, BIRD, ORNAMENT)",ASSORTED,4,False


In [0]:
# Work on a single row so the effect of explode is easy to read: the output
# has one row per word, each repeating InvoiceNo and the full array.
first_row = df.limit(1)

display(
    first_row
        .selectExpr(
            'InvoiceNo',
            'split(Description," ") as arrayCol',
            """
            explode(
                split(Description, " ")
            ) as explodeCol
            """
        )
)

InvoiceNo,arrayCol,explodeCol
536365,"List(WHITE, HANGING, HEART, T-LIGHT, HOLDER)",WHITE
536365,"List(WHITE, HANGING, HEART, T-LIGHT, HOLDER)",HANGING
536365,"List(WHITE, HANGING, HEART, T-LIGHT, HOLDER)",HEART
536365,"List(WHITE, HANGING, HEART, T-LIGHT, HOLDER)",T-LIGHT
536365,"List(WHITE, HANGING, HEART, T-LIGHT, HOLDER)",HOLDER


In [0]:
# Only rows that have a Description are used for the map example.
described_rows = df.where('Description IS NOT NULL')

# Build a map whose keys are the two column names and whose values are the
# corresponding column values of each row.
display(
    described_rows
        .selectExpr(
            """
            map(
                'Description', Description,
                'InvoiceNo', InvoiceNo
            ) as mapCol
            """
        ).limit(10)
)

mapCol
"Map(Description -> WHITE HANGING HEART T-LIGHT HOLDER, InvoiceNo -> 536365)"
"Map(Description -> WHITE METAL LANTERN, InvoiceNo -> 536365)"
"Map(Description -> CREAM CUPID HEARTS COAT HANGER, InvoiceNo -> 536365)"
"Map(Description -> KNITTED UNION FLAG HOT WATER BOTTLE, InvoiceNo -> 536365)"
"Map(Description -> RED WOOLLY HOTTIE WHITE HEART., InvoiceNo -> 536365)"
"Map(Description -> SET 7 BABUSHKA NESTING BOXES, InvoiceNo -> 536365)"
"Map(Description -> GLASS STAR FROSTED T-LIGHT HOLDER, InvoiceNo -> 536365)"
"Map(Description -> HAND WARMER UNION JACK, InvoiceNo -> 536366)"
"Map(Description -> HAND WARMER RED POLKA DOT, InvoiceNo -> 536366)"
"Map(Description -> ASSORTED COLOUR BIRD ORNAMENT, InvoiceNo -> 536367)"


In [0]:
from pyspark.sql.functions import to_json, from_json
from pyspark.sql.types import *

# A parenthesised column list in SQL yields a struct of those columns; the
# first ten rows are kept as the input for the JSON cells that follow.
struct_projection = "(InvoiceNo, Description) AS structCol"
jsonDF = df.selectExpr(struct_projection).limit(10)

display(jsonDF)

structCol
"List(536365, WHITE HANGING HEART T-LIGHT HOLDER)"
"List(536365, WHITE METAL LANTERN)"
"List(536365, CREAM CUPID HEARTS COAT HANGER)"
"List(536365, KNITTED UNION FLAG HOT WATER BOTTLE)"
"List(536365, RED WOOLLY HOTTIE WHITE HEART.)"
"List(536365, SET 7 BABUSHKA NESTING BOXES)"
"List(536365, GLASS STAR FROSTED T-LIGHT HOLDER)"
"List(536366, HAND WARMER UNION JACK)"
"List(536366, HAND WARMER RED POLKA DOT)"
"List(536367, ASSORTED COLOUR BIRD ORNAMENT)"


In [0]:


# Serialise the struct column to a JSON string and keep both side by side.
json_text = to_json(col("structCol")).alias('stringCol')
jsonDF = jsonDF.select("structCol", json_text)

display(jsonDF)

structCol,stringCol
"List(536365, WHITE HANGING HEART T-LIGHT HOLDER)","{""InvoiceNo"":""536365"",""Description"":""WHITE HANGING HEART T-LIGHT HOLDER""}"
"List(536365, WHITE METAL LANTERN)","{""InvoiceNo"":""536365"",""Description"":""WHITE METAL LANTERN""}"
"List(536365, CREAM CUPID HEARTS COAT HANGER)","{""InvoiceNo"":""536365"",""Description"":""CREAM CUPID HEARTS COAT HANGER""}"
"List(536365, KNITTED UNION FLAG HOT WATER BOTTLE)","{""InvoiceNo"":""536365"",""Description"":""KNITTED UNION FLAG HOT WATER BOTTLE""}"
"List(536365, RED WOOLLY HOTTIE WHITE HEART.)","{""InvoiceNo"":""536365"",""Description"":""RED WOOLLY HOTTIE WHITE HEART.""}"
"List(536365, SET 7 BABUSHKA NESTING BOXES)","{""InvoiceNo"":""536365"",""Description"":""SET 7 BABUSHKA NESTING BOXES""}"
"List(536365, GLASS STAR FROSTED T-LIGHT HOLDER)","{""InvoiceNo"":""536365"",""Description"":""GLASS STAR FROSTED T-LIGHT HOLDER""}"
"List(536366, HAND WARMER UNION JACK)","{""InvoiceNo"":""536366"",""Description"":""HAND WARMER UNION JACK""}"
"List(536366, HAND WARMER RED POLKA DOT)","{""InvoiceNo"":""536366"",""Description"":""HAND WARMER RED POLKA DOT""}"
"List(536367, ASSORTED COLOUR BIRD ORNAMENT)","{""InvoiceNo"":""536367"",""Description"":""ASSORTED COLOUR BIRD ORNAMENT""}"


In [0]:
# Explicit schema for parsing stringCol: both fields are read as strings.
schema = StructType([
    StructField(field_name, StringType())
    for field_name in ('InvoiceNo', 'Description')
])

# Parse the JSON text back into a struct using that schema.
parsed_json = from_json(col("stringCol"), schema).alias("newJson")

display(
    jsonDF.select("structCol", "stringCol", parsed_json)
)

structCol,stringCol,newJson
"List(536365, WHITE HANGING HEART T-LIGHT HOLDER)","{""InvoiceNo"":""536365"",""Description"":""WHITE HANGING HEART T-LIGHT HOLDER""}","List(536365, WHITE HANGING HEART T-LIGHT HOLDER)"
"List(536365, WHITE METAL LANTERN)","{""InvoiceNo"":""536365"",""Description"":""WHITE METAL LANTERN""}","List(536365, WHITE METAL LANTERN)"
"List(536365, CREAM CUPID HEARTS COAT HANGER)","{""InvoiceNo"":""536365"",""Description"":""CREAM CUPID HEARTS COAT HANGER""}","List(536365, CREAM CUPID HEARTS COAT HANGER)"
"List(536365, KNITTED UNION FLAG HOT WATER BOTTLE)","{""InvoiceNo"":""536365"",""Description"":""KNITTED UNION FLAG HOT WATER BOTTLE""}","List(536365, KNITTED UNION FLAG HOT WATER BOTTLE)"
"List(536365, RED WOOLLY HOTTIE WHITE HEART.)","{""InvoiceNo"":""536365"",""Description"":""RED WOOLLY HOTTIE WHITE HEART.""}","List(536365, RED WOOLLY HOTTIE WHITE HEART.)"
"List(536365, SET 7 BABUSHKA NESTING BOXES)","{""InvoiceNo"":""536365"",""Description"":""SET 7 BABUSHKA NESTING BOXES""}","List(536365, SET 7 BABUSHKA NESTING BOXES)"
"List(536365, GLASS STAR FROSTED T-LIGHT HOLDER)","{""InvoiceNo"":""536365"",""Description"":""GLASS STAR FROSTED T-LIGHT HOLDER""}","List(536365, GLASS STAR FROSTED T-LIGHT HOLDER)"
"List(536366, HAND WARMER UNION JACK)","{""InvoiceNo"":""536366"",""Description"":""HAND WARMER UNION JACK""}","List(536366, HAND WARMER UNION JACK)"
"List(536366, HAND WARMER RED POLKA DOT)","{""InvoiceNo"":""536366"",""Description"":""HAND WARMER RED POLKA DOT""}","List(536366, HAND WARMER RED POLKA DOT)"
"List(536367, ASSORTED COLOUR BIRD ORNAMENT)","{""InvoiceNo"":""536367"",""Description"":""ASSORTED COLOUR BIRD ORNAMENT""}","List(536367, ASSORTED COLOUR BIRD ORNAMENT)"


In [0]:
# Get the schema on the fly - helps in case of schema evolution.
# schema_of_json_agg is evaluated over stringCol and the value in the first
# column of the first collected row is used as the schema for from_json.
schema_rows = jsonDF.selectExpr("schema_of_json_agg(stringCol)").collect()
schemaOnTheFly = schema_rows[0][0]

parsed_with_inferred_schema = from_json(
    col('stringCol'),
    schemaOnTheFly
).alias('newJson')

display(
    jsonDF.select("structCol", "stringCol", parsed_with_inferred_schema)
)

structCol,stringCol,newJson
"List(536365, WHITE HANGING HEART T-LIGHT HOLDER)","{""InvoiceNo"":""536365"",""Description"":""WHITE HANGING HEART T-LIGHT HOLDER""}","List(WHITE HANGING HEART T-LIGHT HOLDER, 536365)"
"List(536365, WHITE METAL LANTERN)","{""InvoiceNo"":""536365"",""Description"":""WHITE METAL LANTERN""}","List(WHITE METAL LANTERN, 536365)"
"List(536365, CREAM CUPID HEARTS COAT HANGER)","{""InvoiceNo"":""536365"",""Description"":""CREAM CUPID HEARTS COAT HANGER""}","List(CREAM CUPID HEARTS COAT HANGER, 536365)"
"List(536365, KNITTED UNION FLAG HOT WATER BOTTLE)","{""InvoiceNo"":""536365"",""Description"":""KNITTED UNION FLAG HOT WATER BOTTLE""}","List(KNITTED UNION FLAG HOT WATER BOTTLE, 536365)"
"List(536365, RED WOOLLY HOTTIE WHITE HEART.)","{""InvoiceNo"":""536365"",""Description"":""RED WOOLLY HOTTIE WHITE HEART.""}","List(RED WOOLLY HOTTIE WHITE HEART., 536365)"
"List(536365, SET 7 BABUSHKA NESTING BOXES)","{""InvoiceNo"":""536365"",""Description"":""SET 7 BABUSHKA NESTING BOXES""}","List(SET 7 BABUSHKA NESTING BOXES, 536365)"
"List(536365, GLASS STAR FROSTED T-LIGHT HOLDER)","{""InvoiceNo"":""536365"",""Description"":""GLASS STAR FROSTED T-LIGHT HOLDER""}","List(GLASS STAR FROSTED T-LIGHT HOLDER, 536365)"
"List(536366, HAND WARMER UNION JACK)","{""InvoiceNo"":""536366"",""Description"":""HAND WARMER UNION JACK""}","List(HAND WARMER UNION JACK, 536366)"
"List(536366, HAND WARMER RED POLKA DOT)","{""InvoiceNo"":""536366"",""Description"":""HAND WARMER RED POLKA DOT""}","List(HAND WARMER RED POLKA DOT, 536366)"
"List(536367, ASSORTED COLOUR BIRD ORNAMENT)","{""InvoiceNo"":""536367"",""Description"":""ASSORTED COLOUR BIRD ORNAMENT""}","List(ASSORTED COLOUR BIRD ORNAMENT, 536367)"
