In [23]:
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.functions import collect_set
from pyspark.sql.functions import udf

from pyspark.ml.linalg import SparseVector, DenseVector
from pyspark.ml.linalg import Vectors, VectorUDT

In [24]:
# Sample rows: an integer id paired with a three-element dense vector.
# Two rows carry id 1 and two rows carry id 2.
data2 = [
    (row_id, Vectors.dense(values))
    for row_id, values in (
        (1, [2.0, 2.0, 3.0]),
        (1, [2.0, 3.0, 3.0]),
        (2, [3.0, 2.0, 3.0]),
        (2, [3.0, 3.0, 3.0]),
    )
]

# Build the DataFrame with explicit column names, then inspect schema and rows.
df = spark.createDataFrame(data2, schema=["id", "features"])
df.printSchema()
df.show()

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)

+---+-------------+
| id|     features|
+---+-------------+
|  1|[2.0,2.0,3.0]|
|  1|[2.0,3.0,3.0]|
|  2|[3.0,2.0,3.0]|
|  2|[3.0,3.0,3.0]|
+---+-------------+



In [26]:
from pyspark.sql import functions as F
from pyspark.sql import types as T


def sum_array(v):
    """Convert a vector-like value into a plain list of Python floats.

    Despite its name, this function does not sum anything: it returns the
    elements of ``v`` one by one, each coerced with ``float``.

    Args:
        v: An iterable of numeric values (the notebook applies it to the
            ``features`` vector column), or ``None``.

    Returns:
        A list of floats with one entry per element of ``v``, or ``None``
        when ``v`` is ``None``.
    """
    # The 'features' column is nullable, so pass nulls through instead of
    # raising TypeError while iterating over None.
    if v is None:
        return None
    return [float(x) for x in v]


# Register the converter as a Spark UDF whose result is an array of floats.
sum_array_udf = F.udf(sum_array, returnType=T.ArrayType(T.FloatType()))

# Add the converted values as 'features_array' next to the original vector.
df11 = df.withColumn('features_array', sum_array_udf(F.col('features')))

df11.show()
df11.printSchema()


+---+-------------+---------------+
| id|     features| features_array|
+---+-------------+---------------+
|  1|[2.0,2.0,3.0]|[2.0, 2.0, 3.0]|
|  1|[2.0,3.0,3.0]|[2.0, 3.0, 3.0]|
|  2|[3.0,2.0,3.0]|[3.0, 2.0, 3.0]|
|  2|[3.0,3.0,3.0]|[3.0, 3.0, 3.0]|
+---+-------------+---------------+

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)
 |-- features_array: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [27]:
# Define a function that sums the elements of a vector and returns the
# total as a float. NOTE: this redefines (shadows) the earlier `sum_array`.
def sum_array(v):
    """Return the sum of the elements of ``v`` as a Python float.

    Args:
        v: An iterable of numeric values (the notebook applies it to the
            ``features`` vector column), or ``None``.

    Returns:
        ``float(sum(v))``, or ``None`` when ``v`` is ``None``.
    """
    # The 'features' column is nullable, so pass nulls through instead of
    # raising TypeError from sum(None).
    if v is None:
        return None
    return float(sum(v))

# Wrap the Python function as a Spark UDF that yields one float per row.
sum_array_udf = F.udf(sum_array, returnType=T.FloatType())

# Append the per-row vector total as a new column named 'features_array'.
df2 = df.withColumn('features_array', sum_array_udf(F.col('features')))

df2.show()

+---+-------------+--------------+
| id|     features|features_array|
+---+-------------+--------------+
|  1|[2.0,2.0,3.0]|           7.0|
|  1|[2.0,3.0,3.0]|           8.0|
|  2|[3.0,2.0,3.0]|           8.0|
|  2|[3.0,3.0,3.0]|           9.0|
+---+-------------+--------------+



In [16]:
# Group the rows by "id" and gather every 'features' vector of a group into
# a single list column with collect_list.
(
    df2
    .groupBy("id")
    .agg(F.collect_list('features'))
    .show()
)

+---+----------------------+
| id|collect_list(features)|
+---+----------------------+
|  1|  [[2.0,2.0,3.0], [...|
|  2|  [[3.0,2.0,3.0], [...|
+---+----------------------+



In [17]:
# newdf = df.withColumn('features_array', (df[col] for col in df.columns))
# newdf.show()

In [18]:
# df_agg = df.agg(F.array(*[F.avg(F.col('features_array')[i]) for i in range(2)]).alias("averages"))

# df_agg.show()

In [19]:
def vec2array(v):
    """Convert a vector value into a plain list of Python floats.

    Args:
        v: A value accepted by ``Vectors.dense`` (the notebook intends it
            for a vector column), or ``None``.

    Returns:
        A list with one float per element of the vector, or ``None`` when
        ``v`` is ``None``.
    """
    # Pass nulls through rather than handing None to Vectors.dense.
    if v is None:
        return None
    # Re-wrap the input with Vectors.dense before iterating, as the
    # original code did, without rebinding the parameter.
    dense = Vectors.dense(v)
    return [float(x) for x in dense]

# Spark UDF wrapper around vec2array; the result column is an array of floats.
vec2array_udf = F.udf(
    f=vec2array,
    returnType=T.ArrayType(elementType=T.FloatType()),
)

# df = df.withColumn('Vec', vec2array_udf('Vec'))
# n = len(df.select('Vec').first()[0])
#bla = df.agg(F.array(*[F.sum(F.col("Vec")[i]) for i in range(n)]).alias("sum"))
# bla.show(truncate=False)

In [20]:
# df.groupBy("id").sum("features").show()

In [21]:

# Convert an ML vector into an array<float> column value. Null vectors are
# passed through as null instead of raising AttributeError on None.toArray().
udf_to_array = udf(
    lambda v: None if v is None else [float(x) for x in v.toArray()],
    'array<float>',
)

df.withColumn('interactions', udf_to_array('features')).show()

# df.groupBy("id").agg("interactions").sum().show()

+---+-------------+---------------+
| id|     features|   interactions|
+---+-------------+---------------+
|  1|[2.0,2.0,3.0]|[2.0, 2.0, 3.0]|
|  1|[2.0,3.0,3.0]|[2.0, 3.0, 3.0]|
|  2|[3.0,2.0,3.0]|[3.0, 2.0, 3.0]|
|  2|[3.0,3.0,3.0]|[3.0, 3.0, 3.0]|
+---+-------------+---------------+



In [22]:
# df_agg = df.agg(F.array(*[F.avg(F.col('features_array')[i]) for i in range(10)]).alias("averages"))
# df_agg.show()
