# Import

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Reuse the active Spark session if there is one; otherwise create it.
spark_session = (
    SparkSession.builder
    .getOrCreate()
)

# Read Data

In [None]:
# Load the pipe-delimited transactions file; the first line is a header and
# column types are inferred from the data.
transactions = (
    spark_session.read
    .option("inferSchema", "true")
    .csv("transactions.csv", header=True, sep="|")
)

In [None]:
# Preview the first three rows of the raw data.
transactions.show(n=3)

+---------+------+-----+------+-----+
|sessionID|itemID|click|basket|order|
+---------+------+-----+------+-----+
|        0| 21310|    1|     0|    0|
|        1| 73018|    1|     0|    0|
|        2| 19194|    1|     0|    0|
+---------+------+-----+------+-----+
only showing top 3 rows



# Data Preprocessing

### Add interaction level (binary, categorical, and weight based)

In [None]:
from pyspark.sql.functions import when,count

In [None]:
# Binary interaction flag: 1 when the session interacted with the item in any
# way (click, basket or order), otherwise 0.
# The columns hold counts that can exceed 1 (e.g. click = 2), so test for
# "> 0" rather than "== 1"; otherwise a row with click = 2 and no basket or
# order would wrongly be flagged 0.
transactions = transactions.withColumn(
    "binary",
    when(
        (transactions["click"] > 0)
        | (transactions["basket"] > 0)
        | (transactions["order"] > 0),
        1,
    ).otherwise(0),
)

In [None]:
# Categorical interaction level: 3 = order, 2 = basket, 1 = click, 0 = none.
# A when() chain returns the first matching branch, so the strongest
# interaction must be tested first; testing click first would label a row
# that was also put in the basket or ordered as a plain click.
# The columns hold counts that can exceed 1, hence "> 0" instead of "== 1".
transactions = transactions.withColumn(
    "categorical",
    when(transactions["order"] > 0, 3)
    .when(transactions["basket"] > 0, 2)
    .when(transactions["click"] > 0, 1)
    .otherwise(0),
)

In [None]:
# Weighted interaction score: each click counts 1, each basket add counts 2
# and each order counts 3.
transactions = transactions.withColumn(
    "weight",
    transactions["click"] * 1
    + transactions["basket"] * 2
    + transactions["order"] * 3,
)

In [None]:
# Inspect the three derived columns next to the raw counts.
transactions.show(n=10)

+---------+------+-----+------+-----+------+-----------+------+
|sessionID|itemID|click|basket|order|binary|categorical|weight|
+---------+------+-----+------+-----+------+-----------+------+
|        0| 21310|    1|     0|    0|     1|          1|     1|
|        1| 73018|    1|     0|    0|     1|          1|     1|
|        2| 19194|    1|     0|    0|     1|          1|     1|
|        3| 40250|    1|     0|    0|     1|          1|     1|
|        4| 46107|    1|     0|    0|     1|          1|     1|
|        5| 34217|    1|     0|    0|     1|          1|     1|
|        6| 31436|    1|     0|    0|     1|          1|     1|
|        7| 14576|    1|     1|    0|     1|          1|     3|
|        7| 17731|    2|     1|    0|     1|          2|     4|
|        8| 58723|    1|     0|    0|     1|          1|     1|
+---------+------+-----+------+-----+------+-----------+------+
only showing top 10 rows



### Split into 3 datasets

In [None]:
# One narrow frame per interaction measure, each keyed by session and item.
transactions_binary = transactions.select(["sessionID", "itemID", "binary"])
transactions_categorical = transactions.select(
    ["sessionID", "itemID", "categorical"]
)
transactions_weight = transactions.select(["sessionID", "itemID", "weight"])

In [None]:
# Preview the first three rows of each of the three frames.
transactions_binary.show(n=3)
transactions_categorical.show(n=3)
transactions_weight.show(n=3)

+---------+------+------+
|sessionID|itemID|binary|
+---------+------+------+
|        0| 21310|     1|
|        1| 73018|     1|
|        2| 19194|     1|
+---------+------+------+
only showing top 3 rows

+---------+------+-----------+
|sessionID|itemID|categorical|
+---------+------+-----------+
|        0| 21310|          1|
|        1| 73018|          1|
|        2| 19194|          1|
+---------+------+-----------+
only showing top 3 rows

+---------+------+------+
|sessionID|itemID|weight|
+---------+------+------+
|        0| 21310|     1|
|        1| 73018|     1|
|        2| 19194|     1|
+---------+------+------+
only showing top 3 rows



### Pivot

In [None]:
from pyspark.sql.functions import sum

In [None]:
# Allow pivots with up to 25,000 distinct pivot values.
spark_session.conf.set("spark.sql.pivotMaxValues", 25_000)

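How to use pivot:
https://stackoverflow.com/questions/46809879/convert-pyspark-groupeddata-object-to-spark-dataframe
1. groupBy = the column whose distinct values become the rows of the result
2. pivot = the column whose distinct values become the columns of the result
3. agg = the value placed in each cell -> .count() can also be used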

In [None]:
# Item x session matrix: one row per itemID, one column per sessionID, each
# cell holding the summed "binary" value for that pair.
pivot_binary = (
    transactions_binary
    .groupBy("itemID")
    .pivot("sessionID")
    .agg(sum("binary"))
)

In [None]:
# Item x session matrix: one row per itemID, one column per sessionID, each
# cell holding the summed "categorical" value for that pair.
pivot_categorical = (
    transactions_categorical
    .groupBy("itemID")
    .pivot("sessionID")
    .agg(sum("categorical"))
)

In [None]:
# Item x session matrix: one row per itemID, one column per sessionID, each
# cell holding the summed "weight" value for that pair.
pivot_weight = (
    transactions_weight
    .groupBy("itemID")
    .pivot("sessionID")
    .agg(sum("weight"))
)

In [None]:
# Item/session pairs with no transaction come out of the pivot as null;
# replace them with 0.
pivot_binary = pivot_binary.na.fill(0)

In [None]:
# Item/session pairs with no transaction come out of the pivot as null;
# replace them with 0.
pivot_categorical = pivot_categorical.na.fill(0)

In [None]:
# Item/session pairs with no transaction come out of the pivot as null;
# replace them with 0.
pivot_weight = pivot_weight.na.fill(0)

In [None]:
# Display the pivoted binary matrix (itemID followed by one column per session).
pivot_binary.show()

+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|itemID|  0|  1|  2|  3|  4|  5|  6|  7|  8|  9| 10| 11| 12| 13| 14| 15| 16| 17| 18| 19| 20| 21| 22| 23| 24| 25| 26| 27| 28| 29| 30| 31| 32| 34| 35| 36| 37| 38| 39| 40| 41| 42| 43| 44| 45| 46| 47| 48| 49| 50| 51| 52| 53| 54| 55| 56| 57| 58| 59| 60| 61| 62| 63| 64| 65| 66| 67| 68| 69| 70| 71| 72| 73| 74| 75| 76| 77|
+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
| 21310|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

### Dimensionality Reduction

In [None]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

In [None]:
# Drop the item key and turn every remaining row of session values into a
# dense MLlib vector.
mat_binary = (
    pivot_binary
    .drop("itemID")
    .rdd
    .map(Vectors.dense)
)

In [None]:
# Drop the item key and turn every remaining row of session values into a
# dense MLlib vector.
mat_categorical = (
    pivot_categorical
    .drop("itemID")
    .rdd
    .map(Vectors.dense)
)

In [None]:
# Drop the item key and turn every remaining row of session values into a
# dense MLlib vector.
mat_weight = (
    pivot_weight
    .drop("itemID")
    .rdd
    .map(Vectors.dense)
)

In [None]:
# Wrap the RDD of vectors in a distributed RowMatrix (the name is rebound
# from the RDD to the matrix).
mat_binary = RowMatrix(rows=mat_binary)

In [None]:
# Wrap the RDD of vectors in a distributed RowMatrix (the name is rebound
# from the RDD to the matrix).
mat_categorical = RowMatrix(rows=mat_categorical)

In [None]:
# Wrap the RDD of vectors in a distributed RowMatrix (the name is rebound
# from the RDD to the matrix).
mat_weight = RowMatrix(rows=mat_weight)

### Principal component analysis (PCA)

In [None]:
# Top 5 principal components of the binary item x session matrix.
pca_binary = mat_binary.computePrincipalComponents(k=5)

In [None]:
# Top 5 principal components of the categorical item x session matrix.
pca_categorical = mat_categorical.computePrincipalComponents(k=5)

In [None]:
# Top 5 principal components of the weighted item x session matrix.
pca_weight = mat_weight.computePrincipalComponents(k=5)

In [None]:
# Project the binary matrix onto its own principal components.
projected_binary = mat_binary.multiply(matrix=pca_binary)

In [None]:
# Project the categorical matrix onto its own principal components.
# The components were computed from mat_categorical, so that is the matrix
# that must be multiplied (not mat_binary).
projected_categorical = mat_categorical.multiply(pca_categorical)

In [None]:
# Project the weighted matrix onto its own principal components.
# The components were computed from mat_weight, so that is the matrix that
# must be multiplied (not mat_binary).
projected_weight = mat_weight.multiply(pca_weight)

In [None]:
# Collect every row of projected_binary to the driver and print them.
print(projected_binary.rows.collect())

[DenseVector([0.0099, -0.0102, -0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, -0.0, 0.0149, -0.0]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, -0.0, 0.0149, -0.0]), DenseVector([0.0099, -0.0102, -0.0, 0.0149, -0.0]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, -0.0]), DenseVector([0.0241, -0.0311, -0.0, 0.0634, 0.044]), DenseVector([0.0241, -0.0311, -0.0, 0.0634, 0.044]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, -0.0, 0.0149, 0.0]), DenseVector([0.0099, -0.0102, 0.0, 0.0149, 0.0]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0462, -0.0997, 0.6325, -0.7536, -0.0]), DenseVector([0.0462, -0.0997, 0.6325, -0.7536, -0.0]), DenseVector([0.0462, -0.0997, 0.6325, -0.7536, -0.0]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0241, -0.0311, -0.0, 0.0634, 0.3063]), DenseVector([0.0241, -0.0311, -0.0, 0.0634, 0.3063]), DenseVector([0.0241, -0.0311, -

In [None]:
# Number of rows in the projected binary matrix.
projected_binary.numRows()

98

In [None]:
# Number of columns in the projected binary matrix.
projected_binary.numCols()

5

In [None]:
# Collect every row of projected_categorical to the driver and print them.
print(projected_categorical.rows.collect())

[DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0306, -0.0, 0.0349, 0.0488, 0.0938]), DenseVector([0.0306, -0.0, 0.0349, 0.0488, 0.0938]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0055, -0.0, 0.0057, 0.0067, 0.0058]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0213, -0.0, 0.0228, 0.0285, 0.0292]), DenseVector([0.0213, -0.0, 0.0228, 0.0285, 0.0292]), DenseVector([0.0213, -0.0, 0.0228, 0.0285, 0.0292]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0123, -0.0, 0.013, 0.0157, 0.0145]), DenseVector([0.0123, -0.0, 0.013, 0.0157, 0.0145]), DenseVector([0.0123,

In [None]:
# Collect every row of projected_weight to the driver and print them.
print(projected_weight.rows.collect())

[DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0184, 0.0948, -0.3078, -0.9436]), DenseVector([0.0, -0.0184, 0.0948, -0.3078, -0.9436]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([-0.0, -0.0016, 0.0032, -0.0034, 0.0044]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([-0.0, -0.0096, 0.0237, -0.0279, 0.0389]), DenseVector([-0.0, -0.0096, 0.0237, -0.0279, 0.0389]), DenseVector([-0.0, -0.0096, 0.0237, -0.0279, 0.0389]), DenseVector([0.0, 0.0, 0.0, 0.0, 0.0]), DenseVector([0.0, -0.007, 0.0154, -0.017, 0.0226]), DenseVector([0.0, -0.007, 0.0154, -0.017, 0.022