# PySpark: Data Transformation

In [None]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.truncate', 1000)

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql.window import Window

import numpy as np
import pandas as pd

## 1. Miscellaneous techniques

### 1.1. Mapping
Mapping in PySpark requires using the `udf()` function, which allows a Python function to work on PySpark dataframes.

In [1]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [2]:
dfCar = pd.read_csv('../data/cars.csv')
dfCar = spark.createDataFrame(dfCar.astype(str)).replace('nan', None)
dfCar = dfCar.withColumn('price', F.col('price').cast('float'))
dfCar.limit(5)

manufacturer,model,type,min_price,price,max_price,mpg_city,mpg_highway,airbags,drive_train,cylinders,engine_size,horsepower,rpm,rev_per_mile,man_trans_avail,fuel_tank_capacity,passengers,length,wheelbase,width,turn_circle,rear_seat_room,luggage_room,weight,origin,make
Chevrolet,Cavalier,Compact,8.5,13.4,18.3,25,36,,Front,4.0,2.2,110,5200,2380,Yes,15.2,5,182,101,66,38,25.0,13.0,2490,USA,Chevrolet Cavalier
Chevrolet,Corsica,Compact,11.4,11.4,11.4,25,34,Driver only,Front,4.0,2.2,110,5200,2665,Yes,15.6,5,184,103,68,39,26.0,14.0,2785,USA,Chevrolet Corsica
Chevrolet,Camaro,Sporty,13.4,15.1,16.8,19,28,Driver & Passenger,Rear,6.0,3.4,160,4600,1805,Yes,15.5,4,193,101,74,43,25.0,13.0,3240,USA,Chevrolet Camaro
Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,4.0,2.2,110,5200,2595,No,16.5,6,198,108,71,40,28.5,16.0,3195,USA,Chevrolet Lumina
Chevrolet,Lumina_APV,Van,14.7,16.3,18.0,18,23,,Front,6.0,3.8,170,4800,1690,No,20.0,7,178,110,74,44,30.5,,3715,USA,Chevrolet Lumina_APV


In [3]:
dfCar.approxQuantile('price', [0.25, 0.5, 0.75], relativeError=0)

[11.600000381469727, 15.899999618530273, 18.799999237060547]

In [4]:
def tmp_getPriceLevel(price):
    if price < 11.6:
        return 'very low'
    elif price < 15.9:
        return 'low'
    elif price < 18.8:
        return 'high'
    else:
        return 'very high'

getPriceLevel = F.udf(lambda price: tmp_getPriceLevel(price))

In [5]:
dfCar.select('price', getPriceLevel('price').alias('price_level')).limit(5)

price,price_level
13.4,low
11.4,very low
15.1,low
15.9,low
16.3,high


### 1.2. Window functions

In [6]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [7]:
data = (
    ('James', 'Sales', 3000),
    ('Harry', 'Sales', 3500),
    ('Ash', 'Sales', 3000),
    ('Michael', 'Sales', 4600),
    ('Robert', 'Sales', 4100),
    ('Maria', 'Finance', 3000),
    ('Wayne', 'Sales', 3000),
    ('Scott', 'Finance', 3300),
    ('Jen', 'Finance', 3900),
    ('Jeff', 'Marketing', 3000),
    ('Kumar', 'Marketing', 2000),
    ('Saif', 'Sales', 4100))
 
schema = ['employee', 'department', 'salary']

dfSalary = spark.createDataFrame(data, schema)

In [7]:
dfSalary

employee,department,salary
James,Sales,3000
Harry,Sales,3500
Ash,Sales,3000
Michael,Sales,4600
Robert,Sales,4100
Maria,Finance,3000
Wayne,Sales,3000
Scott,Finance,3300
Jen,Finance,3900
Jeff,Marketing,3000


In order to activate window functions, firstly initialize the window.

In [8]:
from pyspark.sql.window import Window
window = Window.partitionBy('department').orderBy('salary')

#### Ranking

In [9]:
dfSalary\
    .withColumn('row_number', F.row_number().over(window))\
    .withColumn('rank', F.rank().over(window))\
    .withColumn('dense_rank', F.dense_rank().over(window))\
    .withColumn('percent_rank', F.round(F.percent_rank().over(window), 2))\
    .withColumn('cume_dist', F.round(F.cume_dist().over(window), 2))\
    .withColumn('ntile_3', F.ntile(3).over(window))

employee,department,salary,row_number,rank,dense_rank,percent_rank,cume_dist,ntile_3
James,Sales,3000,1,1,1,0.0,0.43,1
Ash,Sales,3000,2,1,1,0.0,0.43,1
Wayne,Sales,3000,3,1,1,0.0,0.43,1
Harry,Sales,3500,4,4,2,0.5,0.57,2
Robert,Sales,4100,5,5,3,0.67,0.86,2
Saif,Sales,4100,6,5,3,0.67,0.86,3
Michael,Sales,4600,7,7,4,1.0,1.0,3
Maria,Finance,3000,1,1,1,0.0,0.33,1
Scott,Finance,3300,2,2,2,0.5,0.67,2
Jen,Finance,3900,3,3,3,1.0,1.0,3


#### Shifting

In [10]:
dfSalary\
    .withColumn('lag_1', F.lag('salary', 1).over(window))\
    .withColumn('lead_2', F.lead('salary', 2).over(window))

employee,department,salary,lag_1,lead_2
James,Sales,3000,,3000.0
Ash,Sales,3000,3000.0,3500.0
Wayne,Sales,3000,3000.0,4100.0
Harry,Sales,3500,3000.0,4100.0
Robert,Sales,4100,3500.0,4600.0
Saif,Sales,4100,4100.0,
Michael,Sales,4600,4100.0,
Maria,Finance,3000,,3900.0
Scott,Finance,3300,3000.0,
Jen,Finance,3900,3300.0,


#### Aggregating

In [11]:
dfSalary\
    .withColumn('cumsum', F.sum('salary').over(window))

employee,department,salary,cumsum
James,Sales,3000,9000
Ash,Sales,3000,9000
Wayne,Sales,3000,9000
Harry,Sales,3500,12500
Robert,Sales,4100,20700
Saif,Sales,4100,20700
Michael,Sales,4600,25300
Maria,Finance,3000,3000
Scott,Finance,3300,6300
Jen,Finance,3900,10200


## 2. Pivot table

### 2.1. Unpivot

In [12]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [13]:
data = (
    ('red', 1000, 1200, 1500),
    ('green', 1500, 1500, 1575),
    ('blue', 2000, 2200, 2000)
)

schema = ['color', 'small', 'medium', 'large']

dfProduct = spark.createDataFrame(data, schema)

In [13]:
dfProduct

color,small,medium,large
red,1000,1200,1500
green,1500,1500,1575
blue,2000,2200,2000


In [14]:
dfProduct.select('color', F.expr('stack(3, "small", small, "medium", medium, "large", large) as (size, price)'))

color,size,price
red,small,1000
red,medium,1200
red,large,1500
green,small,1500
green,medium,1500
green,large,1575
blue,small,2000
blue,medium,2200
blue,large,2000


### 2.2. Pivot table

In [15]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [16]:
dfSupermarket = spark.read.csv('../data/supermarket_sales.csv', header=True)
dfSupermarket.limit(5)

invoice_id,brand,city,customer_type,gender,product_line,unit_price,quantity,tax,date,time,payment,cost,gross_margin_percentage,profit,rating
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,01/05/2019,13:08,Ewallet,522.83,4.761904762,26.1415,9.1
226-31-3081,C,Naypyitaw,Normal,Female,Electronic access...,15.28,5,3.82,03/08/2019,10:29,Cash,76.4,4.761904762,3.82,9.6
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,03/03/2019,13:23,Credit card,324.31,4.761904762,16.2155,7.4
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,1/27/2019,20:33,Ewallet,465.76,4.761904762,23.288,8.4
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,02/08/2019,10:37,Ewallet,604.17,4.761904762,30.2085,5.3


In [17]:
dfSupermarket\
    .groupBy('product_line')\
    .pivot('city')\
    .agg(F.round(F.mean('profit'), 2))

product_line,Mandalay,Naypyitaw,Yangon
Home and lifestyle,16.71,14.7,16.42
Fashion accessories,12.61,15.79,15.25
Health and beauty,17.95,15.22,12.76
Electronic access...,14.76,16.42,14.54
Food and beverages,14.49,17.15,14.09
Sports and travel,15.35,16.68,15.64


#### Multivariate pivoting

In [18]:
dfSupermarket\
    .withColumn('info', F.concat(F.col('gender'), F.lit(', '), F.col('customer_type')))\
    .groupBy('product_line')\
    .pivot('info')\
    .agg(F.round(F.mean('profit'), 2))

product_line,"Female, Member","Female, Normal","Male, Member","Male, Normal"
Home and lifestyle,17.46,19.05,14.21,13.84
Fashion accessories,15.32,14.88,13.68,14.03
Health and beauty,13.3,14.26,19.33,13.95
Electronic access...,15.17,15.5,14.78,15.38
Food and beverages,18.3,16.57,13.02,13.03
Sports and travel,15.55,15.34,15.31,16.98


### 2.3. Crosstab
Crosstab is a special case of pivot table, in which `count` is selected as the aggregate function.

In [19]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [20]:
dfSupermarket = spark.read.csv('../data/supermarket_sales.csv', header=True)
dfSupermarket.limit(5)

invoice_id,brand,city,customer_type,gender,product_line,unit_price,quantity,tax,date,time,payment,cost,gross_margin_percentage,profit,rating
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,01/05/2019,13:08,Ewallet,522.83,4.761904762,26.1415,9.1
226-31-3081,C,Naypyitaw,Normal,Female,Electronic access...,15.28,5,3.82,03/08/2019,10:29,Cash,76.4,4.761904762,3.82,9.6
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,03/03/2019,13:23,Credit card,324.31,4.761904762,16.2155,7.4
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,1/27/2019,20:33,Ewallet,465.76,4.761904762,23.288,8.4
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,02/08/2019,10:37,Ewallet,604.17,4.761904762,30.2085,5.3


In [21]:
dfSupermarket.crosstab('city', 'payment')

city_payment,Cash,Credit card,Ewallet
Naypyitaw,124,98,106
Mandalay,110,109,113
Yangon,110,104,126


## 3. Combining datasets

### 3.1. Union
PySpark supports two union methods:
- `DataFrame.union()`: union using the current order of columns
- `DataFrame.unionByName()`: union using column names

In [22]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [23]:
data = (
    (2019, 1, 2500),
    (2019, 2, 3500),
    (2019, 3, 4000),
    (2019, 4, 5000)
)

schema = ['year', 'quarter', 'profit']

dfProfit2019 = spark.createDataFrame(data, schema)

In [23]:
dfProfit2019

year,quarter,profit
2019,1,2500
2019,2,3500
2019,3,4000
2019,4,5000


In [24]:
data = (
    (2020, 1, 2700),
    (2020, 2, 3900),
    (2020, 3, 5000),
    (2020, 4, 8000)
)

schema = ['year', 'quarter', 'profit']

dfProfit2020 = spark.createDataFrame(data, schema)

In [24]:
dfProfit2020

year,quarter,profit
2020,1,2700
2020,2,3900
2020,3,5000
2020,4,8000


In [25]:
dfProfit2019.union(dfProfit2020)

year,quarter,profit
2019,1,2500
2019,2,3500
2019,3,4000
2019,4,5000
2020,1,2700
2020,2,3900
2020,3,5000
2020,4,8000


In [26]:
dfProfit2019.unionByName(dfProfit2020)

year,quarter,profit
2019,1,2500
2019,2,3500
2019,3,4000
2019,4,5000
2020,1,2700
2020,2,3900
2020,3,5000
2020,4,8000


### 3.2. Join

In [27]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

#### Conditional join
Conditional join combines two dataframes but only keeps matching values in both tables. There are 4 types of conditional join: *inner*, *left*, *right* and *outer*.

In [28]:
data = (
    ('Hannah', 1200, 'Allowance'),
    ('James', 3000, 'Basic'),
    ('Gabriel', 700, 'Allowance'),
    ('Smith', 2000, 'Basic'),
    ('Alex', 10000, 'Higher'),
)

schema = ['name', 'income_before_tax', 'tax_band']

dfIncome = spark.createDataFrame(data, schema)

In [28]:
dfIncome

name,income_before_tax,tax_band
Hannah,1200,Allowance
James,3000,Basic
Gabriel,700,Allowance
Smith,2000,Basic
Alex,10000,Higher


In [29]:
data = (
    ('Allowance', 'Up to 12,500', 0.0),
    ('Basic', '12,501 to 50,000', 0.2),
    ('Higher', '50,001 to 150,000', 0.4),
    ('Additional', 'Over 150,000', 0.45),
)

schema = ['band', 'income_range', 'tax_rate']

dfTax = spark.createDataFrame(data, schema)

In [29]:
dfTax

band,income_range,tax_rate
Allowance,"Up to 12,500",0.0
Basic,"12,501 to 50,000",0.2
Higher,"50,001 to 150,000",0.4
Additional,"Over 150,000",0.45


In [30]:
dfIncome.join(dfTax, dfIncome.tax_band==dfTax.band, how='left')

name,income_before_tax,tax_band,band,income_range,tax_rate
Hannah,1200,Allowance,Allowance,"Up to 12,500",0.0
Gabriel,700,Allowance,Allowance,"Up to 12,500",0.0
Alex,10000,Higher,Higher,"50,001 to 150,000",0.4
James,3000,Basic,Basic,"12,501 to 50,000",0.2
Smith,2000,Basic,Basic,"12,501 to 50,000",0.2


In [31]:
dfIncome\
    .withColumnRenamed('tax_band', 'band')\
    .join(dfTax, on='band', how='left')

band,name,income_before_tax,income_range,tax_rate
Allowance,Hannah,1200,"Up to 12,500",0.0
Allowance,Gabriel,700,"Up to 12,500",0.0
Higher,Alex,10000,"50,001 to 150,000",0.4
Basic,James,3000,"12,501 to 50,000",0.2
Basic,Smith,2000,"12,501 to 50,000",0.2


#### Cross join
Cross join performs a Catersian product, returning all combinations of rows in the two dataframes.

In [32]:
dfIncome.crossJoin(dfTax)

name,income_before_tax,tax_band,band,income_range,tax_rate
Hannah,1200,Allowance,Allowance,"Up to 12,500",0.0
Hannah,1200,Allowance,Basic,"12,501 to 50,000",0.2
Hannah,1200,Allowance,Higher,"50,001 to 150,000",0.4
Hannah,1200,Allowance,Additional,"Over 150,000",0.45
James,3000,Basic,Allowance,"Up to 12,500",0.0
James,3000,Basic,Basic,"12,501 to 50,000",0.2
James,3000,Basic,Higher,"50,001 to 150,000",0.4
James,3000,Basic,Additional,"Over 150,000",0.45
Gabriel,700,Allowance,Allowance,"Up to 12,500",0.0
Gabriel,700,Allowance,Basic,"12,501 to 50,000",0.2


## 4. Working with data structures
Arrays is a powerful data type provided in almost all data manipulation packages. It can save a lot of memory as it gathers multiple items into a single rows instead of having too many rows.

### 4.1. Creating array-type columns

In [33]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.truncate', 1000)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

#### Construcing arrays
The `T.struct` function allows contructing complicated data structures.

In [34]:
data = [
    ('Hannah', ['clothes', 'books', 'shoes', 'travelling']),
    ('James', ['sports', 'science', 'comics']),
    ('Wayne', ['travelling', 'music', 'shoes', 'technology']),
    ('Jolie', ['cooking', 'boardgames', 'coffee'])
]

schema = T.StructType([
    T.StructField('customer', T.StringType()),
    T.StructField('interest', T.ArrayType(T.StringType()))
])

dfTmp = spark.createDataFrame(data, schema)

In [34]:
dfTmp

customer,interest
Hannah,"[clothes, books, shoes, travelling]"
James,"[sports, science, comics]"
Wayne,"[travelling, music, shoes, technology]"
Jolie,"[cooking, boardgames, coffee]"


In [35]:
dfTmp.printSchema()

root
 |-- customer: string (nullable = true)
 |-- interest: array (nullable = true)
 |    |-- element: string (containsNull = true)



#### Combining columns

In [36]:
data = [
    ('Hannah', 'clothes', 'books', 'shoes'),
    ('James', 'science', 'sports', 'comics'),
    ('Wayne', 'music', 'shoes', 'technology'),
    ('Jolie', 'cooking', 'boardgames', 'coffee')
]

schema = 'customer string, interest_1 string, interest_2 string, interest_3 string'

dfTmp = spark.createDataFrame(data, schema)
dfTmp.selectExpr('customer', 'ARRAY(interest_1, interest_2, interest_3) AS interest')

customer,interest
Hannah,"[clothes, books, shoes]"
James,"[science, sports, comics]"
Wayne,"[music, shoes, technology]"
Jolie,"[cooking, boardgames, coffee]"


#### Splitting a string

In [37]:
data = [
    ('Hannah', 'clothes; books; shoes; travelling'),
    ('James', 'sports; science; comics'),
    ('Wayne', 'travelling; music; shoes; technology'),
    ('Jolie', 'cooking; boardgames; coffee')
]

schema = 'customer string, interest string'

dfTmp = spark.createDataFrame(data, schema)
dfTmp.withColumn('interest_split', F.split('interest', '; '))
# dfTmp.withColumn('interest_split', F.expr('SPLIT(interest, ";")'))

customer,interest,interest_split
Hannah,clothes; books; shoes; travelling,"[clothes, books, shoes, travelling]"
James,sports; science; comics,"[sports, science, comics]"
Wayne,travelling; music; shoes; technology,"[travelling, music, shoes, technology]"
Jolie,cooking; boardgames; coffee,"[cooking, boardgames, coffee]"


#### Gathering rows
The `collect_list` functions gathers values in a column to form a vectors. However, this function sometime does not preserve the order of rows. The solution for this problem will be discussed later.

In [38]:
data = [
    ('James', 'sports'),
    ('James', 'science'),
    ('James', 'comics'),
    ('Jolie', 'cooking'),
    ('Jolie', 'boardgames'),
    ('Jolie', 'coffee'),
]

schema = 'customer string, interest string'

dfTmp = spark.createDataFrame(data, schema)
dfTmp.groupby('customer').agg(F.collect_list('interest').alias('interest'))
dfTmp.groupby('customer').agg(F.expr('COLLECT_LIST(interest) AS interest'))

customer,interest
James,"[sports, science, comics]"
Jolie,"[cooking, boardgames, coffee]"


### 4.2. Basic array manipulation

In [39]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.truncate', 1000)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

In [40]:
data = [
    ('James', ['sports', 'comics', None, 'travelling']),
    ('Jolie', ['boardgames', 'shopping', 'boardgames', None])
]

schema = T.StructType([
    T.StructField('customer', T.StringType()),
    T.StructField('interest', T.ArrayType(T.StringType()))
])

dfInterest = spark.createDataFrame(data, schema)
dfInterest

customer,interest
James,"[sports, comics, null, travelling]"
Jolie,"[boardgames, shopping, boardgames, null]"


#### Exploding
Exploding is the technique that transform each array into a number of rows equal to the number of array elements. It can be thought as the inverse process of gathering rows.

In [41]:
# using SQL expressions (recommended)
dfInterest.selectExpr('customer', 'EXPLODE(interest) AS interest')

# using PySpark functions
# dfInterest.select('customer', F.explode('interest').alias('interest'))

customer,interest
James,sports
James,comics
James,
James,travelling
Jolie,boardgames
Jolie,shopping
Jolie,boardgames
Jolie,


In [42]:
# using SQL expressions (recommended)
dfInterest.selectExpr('customer', 'POSEXPLODE(interest) AS (rank, interest)')

# using PySpark functions
# dfInterest.select('customer', F.posexplode('interest')).select('customer', F.col('pos').alias('rank'), F.col('col').alias('interest'))

customer,rank,interest
James,0,sports
James,1,comics
James,2,
James,3,travelling
Jolie,0,boardgames
Jolie,1,shopping
Jolie,2,boardgames
Jolie,3,


#### Array join
Array join is the inverse process of splitting a string column into an array column. It works exactly the same as the Python `str.join` method.

In [43]:
dfInterest.selectExpr('customer', 'ARRAY_JOIN(interest, "; ") AS interest')

customer,interest
James,sports; comics; travelling
Jolie,boardgames; shopping; boardgames


In [44]:
dfInterest.selectExpr('customer', 'ARRAY_JOIN(interest, "; ", "nan") AS interest')

customer,interest
James,sports; comics; nan; travelling
Jolie,boardgames; shopping; boardgames; nan


#### Arrays concatenating

In [45]:
dfInterest\
    .withColumn('common_interest', F.expr('ARRAY("shopping", "travelling")'))\
    .withColumn('individual_interest', F.expr('FILTER(interest, x -> x IS NOT NULL)'))\
    .selectExpr(
        'customer',
        'individual_interest',
        'common_interest',
        'CONCAT(individual_interest, common_interest) AS concatenation',
        'ARRAY_INTERSECT(individual_interest, common_interest) AS intersect',
        'ARRAY_UNION(individual_interest, common_interest) AS union',
    )

customer,individual_interest,common_interest,concatenation,intersect,union
James,"[sports, comics, travelling]","[shopping, travelling]","[sports, comics, travelling, shopping, travelling]",[travelling],"[sports, comics, travelling, shopping]"
Jolie,"[boardgames, shopping, boardgames]","[shopping, travelling]","[boardgames, shopping, boardgames, shopping, travelling]",[shopping],"[boardgames, shopping, travelling]"


#### Miscellaneous techniques

In [46]:
dfInterest.selectExpr(
    'interest AS original_array',
    'SHUFFLE(interest) AS shuffled_array',
    'ARRAY_SORT(interest) AS sorted_array',
    'ARRAY_MAX(interest) AS array_max',
    'ARRAY_MIN(interest) AS array_min',
)

original_array,shuffled_array,sorted_array,array_max,array_min
"[sports, comics, null, travelling]","[null, travelling, sports, comics]","[comics, sports, travelling, null]",travelling,comics
"[boardgames, shopping, boardgames, null]","[shopping, boardgames, boardgames, null]","[boardgames, boardgames, shopping, null]",shopping,boardgames


In [47]:
dfInterest.selectExpr(
    'interest AS original_array',
    'SLICE(interest, 1, 2) AS sliced_array',
    'FILTER(interest, x -> x IS NOT NULL) AS notnull_array',
    'ARRAY_DISTINCT(interest) AS distinct_array'
)

original_array,sliced_array,notnull_array,distinct_array
"[sports, comics, null, travelling]","[sports, comics]","[sports, comics, travelling]","[sports, comics, null, travelling]"
"[boardgames, shopping, boardgames, null]","[boardgames, shopping]","[boardgames, shopping, boardgames]","[boardgames, shopping, null]"


### 4.3. Structure type manipulation

In [48]:
import findspark; findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.truncate', 1000)

import pyspark.sql.functions as F
import pyspark.sql.types as T

import numpy as np
import pandas as pd

#### Constructing nested arrays
A `struct` type column can be thought as a group of smaller columns. It should not be confused with `array` data type, whose array size may vary and elements cannot be accessed by name.

In [49]:
data = [
    (('James', None, 'Smith'), 'M', 3100),
    (('Michael', 'Rose', None), 'M', 4300),
    (('Robert', None, 'Williams'), 'M', 1400),
    (('Maria', 'Anne', 'Jones'), 'F', 5500),
    (('Jen', 'Mary', 'Brown'), 'F', 2500)
]

schema = T.StructType([
    T.StructField('name', T.StructType([
        T.StructField('firstname', T.StringType(), True),
        T.StructField('middlename', T.StringType(), True),
        T.StructField('lastname', T.StringType(), True)
    ])),
    T.StructField('gender', T.StringType(), True),
    T.StructField('salary', T.IntegerType(), True)
])

dfCustomer = spark.createDataFrame(data, schema)
dfCustomer

name,gender,salary
"{James, null, Smith}",M,3100
"{Michael, Rose, null}",M,4300
"{Robert, null, Williams}",M,1400
"{Maria, Anne, Jones}",F,5500
"{Jen, Mary, Brown}",F,2500


In [50]:
dfCustomer.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)



In [51]:
# child columns can be accessed easily
dfCustomer.select('name.lastname', 'name.middlename', 'name.lastname')

lastname,middlename,lastname.1
Smith,,Smith
,Rose,
Williams,,Williams
Jones,Anne,Jones
Brown,Mary,Brown


#### Grouping multiple columns

In [52]:
data = [
    ('James', 'Python', 9),
    ('James', 'Tableau', 7),
    ('James', 'SQL', 8),
    ('Jolie', 'R', 6),
    ('Jolie', 'Python', 9),
    ('Jolie', 'Power BI', 5)
]

schema = 'name string, skills string, score int'

dfScore = spark.createDataFrame(data, schema)

In [52]:
dfScore

name,skills,score
James,Python,9
James,Tableau,7
James,SQL,8
Jolie,R,6
Jolie,Python,9
Jolie,Power BI,5


In [53]:
dfScore.selectExpr('name', 'STRUCT(skills, score) AS strength')

name,strength
James,"{Python, 9}"
James,"{Tableau, 7}"
James,"{SQL, 8}"
Jolie,"{R, 6}"
Jolie,"{Python, 9}"
Jolie,"{Power BI, 5}"


In [54]:
dfScore\
    .groupby('name')\
    .agg(F.expr('COLLECT_LIST(STRUCT(score, skills)) AS strength'))\
    .selectExpr('name', 'ARRAY_SORT(strength).skills AS strength')

name,strength
James,"[Tableau, SQL, Python]"
Jolie,"[Power BI, R, Python]"


#### Arrays zipping

In [55]:
data = [
    ('James', ['Python', 'Tableau', 'SQL'], [9, 7, 8]),
    ('Jolie', ['R', 'Python', 'Power BI'], [6, 9, 5])
]

schema = T.StructType([
    T.StructField('name', T.StringType()),
    T.StructField('skills', T.ArrayType(T.StringType())),
    T.StructField('score', T.ArrayType(T.IntegerType()))
])

dfTmp = spark.createDataFrame(data, schema)
dfTmp.selectExpr('name', 'skills', 'score', 'ARRAYS_ZIP(skills, score) AS strength')

name,skills,score,strength
James,"[Python, Tableau, SQL]","[9, 7, 8]","[{Python, 9}, {Tableau, 7}, {SQL, 8}]"
Jolie,"[R, Python, Power BI]","[6, 9, 5]","[{R, 6}, {Python, 9}, {Power BI, 5}]"


### 4.4. Dictionary type manipulation