# Scratch: min/max aggregation over scalar, array, and vector columns in PySpark

## Datasets

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import max
from pyspark.sql.utils import AnalysisException

In [2]:
# Obtain the Spark session for this notebook. No builder options are set
# here, so whatever session getOrCreate() hands back is used as-is.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Five (a, b) observations. The column-wise maxima are a=4.0 and b=3.0, but
# no single row holds both values -- which is what lets the later cells tell
# a "largest row" result apart from a per-column (element-wise) maximum.
data = [
    [1.0, 3.0],
    [2.0, 2.0],
    [3.0, 1.0],
    [4.0, 1.0],
    [4.0, 0.0],
]

# Base pandas frame: one float column per coordinate.
pdf = pd.DataFrame(data=data, columns=["a", "b"])
pdf

Unnamed: 0,a,b
0,1.0,3.0
1,2.0,2.0
2,3.0,1.0
3,4.0,1.0
4,4.0,0.0


In [3]:
# Collapse every (a, b) row of `pdf` into one two-element Python list and
# store those lists in a single column named `x` (one list per row).
pdf_arr = pd.DataFrame({"x": pdf.values.tolist()})
pdf_arr

Unnamed: 0,x
0,"[1.0, 3.0]"
1,"[2.0, 2.0]"
2,"[3.0, 1.0]"
3,"[4.0, 1.0]"
4,"[4.0, 0.0]"


### Spark DataFrame (Scalar)

In [4]:
# Scalar layout: hand the two-column pandas frame to Spark. The schema
# printout below shows both columns arriving as nullable doubles.
df = spark.createDataFrame(pdf)
df.show()
df.printSchema()

+---+---+
|  a|  b|
+---+---+
|1.0|3.0|
|2.0|2.0|
|3.0|1.0|
|4.0|1.0|
|4.0|0.0|
+---+---+

root
 |-- a: double (nullable = true)
 |-- b: double (nullable = true)



### Spark DataFrame (Array)

In [5]:
# Array layout: the list-valued pandas column `x` becomes a Spark
# array<double> column (see the schema printout below).
df_arr = spark.createDataFrame(pdf_arr)
df_arr.show()
df_arr.printSchema()

+----------+
|         x|
+----------+
|[1.0, 3.0]|
|[2.0, 2.0]|
|[3.0, 1.0]|
|[4.0, 1.0]|
|[4.0, 0.0]|
+----------+

root
 |-- x: array (nullable = true)
 |    |-- element: double (containsNull = true)



### Spark DataFrame (Vector)

In [6]:
from pyspark.ml.functions import array_to_vector

# Vector layout: convert the array<double> column into an ML vector column,
# keeping the column name `x`.
df_vec = df_arr.select(
    array_to_vector("x").alias("x"),
)
df_vec.show()
df_vec.printSchema()

+---------+
|        x|
+---------+
|[1.0,3.0]|
|[2.0,2.0]|
|[3.0,1.0]|
|[4.0,1.0]|
|[4.0,0.0]|
+---------+

root
 |-- x: vector (nullable = true)



## Max (Array)

In [7]:
# NOTE: `max` is pyspark.sql.functions.max (imported in the first cell), not
# the Python builtin. On the array column it returns the greatest whole
# array -- an existing row, [4.0, 1.0] -- rather than the per-position
# maxima, which would be [4.0, 3.0].
max_of_arrays = df_arr.select(max(df_arr["x"]))
max_of_arrays.show()



+----------+
|    max(x)|
+----------+
|[4.0, 1.0]|
+----------+



                                                                                

## Max (Vector)

In [8]:
# Same aggregate on the vector column. As the output shows, the result is
# again a single existing row value ([4.0,1.0]), not a per-dimension maximum.
max_of_vectors = df_vec.select(max(df_vec["x"]))
max_of_vectors.show()

+---------+
|   max(x)|
+---------+
|[4.0,1.0]|
+---------+



## Summarizer (Array)

In [9]:
from pyspark.ml.functions import array_to_vector
from pyspark.ml.stat import Summarizer

In [10]:
# Summarizer configured to report two metrics: "min" and "max".
summarizer = Summarizer.metrics(
    "min",
    "max",
)

In [11]:
# Summarizer.summary() requires an ML vector column; given the array<double>
# column it is rejected with a DATATYPE_MISMATCH AnalysisException. That
# failure is the point of this cell, so catch it and print the message
# instead of letting it abort a "Restart & Run All" before the later cells.
try:
    df_arr.select(summarizer.summary(df_arr.x)).show()
except AnalysisException as exc:
    print(f"AnalysisException: {exc}")

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "aggregate_metrics(x, 1.0)" due to data type mismatch: Parameter 1 requires the "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>" type, however "x" has the type "ARRAY<DOUBLE>".;
'Aggregate [unresolvedalias(aggregate_metrics(Min, Max, ComputeMax, ComputeMin, ComputeNNZ, x#13, 1.0, 0, 0), Some(org.apache.spark.sql.Column$$Lambda$1525/491152434@71eb0e3d))]
+- LocalRelation [x#13]


## Summarizer (Vector)

In [12]:
# On the vector column the summarizer works and returns a struct holding the
# per-dimension minimum and maximum vectors ({[1.0,0.0], [4.0,3.0]} below).
# truncate=80 widens the column display limit passed to show().
summary_col = summarizer.summary(df_vec.x)
df_vec.select(summary_col).show(truncate=80)

+-------------------------+
|aggregate_metrics(x, 1.0)|
+-------------------------+
|   {[1.0,0.0], [4.0,3.0]}|
+-------------------------+



## Spark SQL

In [13]:
# Register the vector DataFrame under the name `tmpview` so it can be
# queried from Spark SQL.
df_vec.createOrReplaceTempView(name="tmpview")

In [14]:
# The same max aggregate over the vector column, this time issued as SQL
# against the `tmpview` temp view.
max_via_sql = spark.sql("SELECT max(x) from tmpview")
max_via_sql.show()

+---------+
|   max(x)|
+---------+
|[4.0,1.0]|
+---------+



## Scalar Columns

In [15]:
from pyspark.sql.functions import min, max

In [16]:
# Per-column extremes on the scalar layout. `min` and `max` here are the
# pyspark.sql.functions aggregates, not the Python builtins. The values
# (min 1.0/0.0, max 4.0/3.0) match the Summarizer output for the vector column.
df.select(
    min("a"),
    min("b"),
    max("a"),
    max("b"),
).show()

+------+------+------+------+
|min(a)|min(b)|max(a)|max(b)|
+------+------+------+------+
|   1.0|   0.0|   4.0|   3.0|
+------+------+------+------+

