## DataFrame

In [1]:
import json

In [2]:
from pyspark.sql import SparkSession
# Create the Spark session (or pick up an already running one) for the DataFrame cells below.
spark = (
    SparkSession.builder
    .appName('SparkByExamples.com')
    .getOrCreate()
)

22/12/21 00:18:51 WARN Utils: Your hostname, jump-windows resolves to a loopback address: 127.0.1.1; using 192.168.0.5 instead (on interface wlp3s0)
22/12/21 00:18:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/21 00:18:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/21 00:18:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Three sample rows: a string sequence number, a lower-case full name and a small integer array.
columns = ["Seqno", "Name", "Data"]
data = [
    ("1", "john jones", [1, 2, 3]),
    ("2", "tracey smith", [4, 5, 6]),
    ("3", "amy sanders", [7, 8, 9]),
]

df = spark.createDataFrame(data, schema=columns)
df.show(truncate=False)

                                                                                

+-----+------------+---------+
|Seqno|Name        |Data     |
+-----+------------+---------+
|1    |john jones  |[1, 2, 3]|
|2    |tracey smith|[4, 5, 6]|
|3    |amy sanders |[7, 8, 9]|
+-----+------------+---------+



In [4]:
def convertCase(s: str):
    """Return ``s`` converted to title case.

    Args:
        s: The text to convert. ``None`` is accepted so that null column
            values do not make the UDF built from this function fail.

    Returns:
        ``s.title()`` for a string, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        # Propagate nulls instead of raising AttributeError on None.
        return None
    return s.title()

In [5]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

In [6]:
# Expose convertCase as a Spark UDF whose declared return type is a string.
convertUDF = udf(lambda name: convertCase(name), returnType=StringType())

In [7]:
# Keep Seqno as-is and replace Name with its title-cased version.
(
    df.select(col("Seqno"), convertUDF(col("Name")).alias("Name"))
    .show(truncate=False)
)

[Stage 2:>                                                          (0 + 1) / 1]                                                                                

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |John Jones  |
|2    |Tracey Smith|
|3    |Amy Sanders |
+-----+------------+



In [8]:
def upperCase(s: str):
    """Return ``s`` converted to upper case.

    Args:
        s: The text to convert. ``None`` is accepted so that null column
            values do not make the UDF built from this function fail.

    Returns:
        ``s.upper()`` for a string, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        # Propagate nulls instead of raising AttributeError on None.
        return None
    return s.upper()

In [9]:
# Expose upperCase as a Spark UDF whose declared return type is a string.
upperCaseUDF = udf(lambda name: upperCase(name), returnType=StringType())

In [10]:
# Append the upper-cased name as an extra column next to the original data.
(
    df.withColumn("Cureated Name", upperCaseUDF(col("Name")))
    .show(truncate=False)
)

+-----+------------+---------+-------------+
|Seqno|Name        |Data     |Cureated Name|
+-----+------------+---------+-------------+
|1    |john jones  |[1, 2, 3]|JOHN JONES   |
|2    |tracey smith|[4, 5, 6]|TRACEY SMITH |
|3    |amy sanders |[7, 8, 9]|AMY SANDERS  |
+-----+------------+---------+-------------+



In [11]:
@udf(returnType=StringType())
def splitCase(s: str):
    """Split ``s`` on whitespace (first, deliberately un-guarded version).

    The function returns a list although the UDF is declared with
    ``StringType``; in this notebook's output the result shows up as text
    such as ``[john, jones]``.

    Raises:
        AttributeError: when ``s`` is ``None`` (nulls are not handled here).
    """
    words = s.split()
    return words

In [12]:
# Apply the decorator-defined UDF to the Name column.
(
    df.withColumn("Cureated Name", splitCase(col("Name")))
    .show(truncate=False)
)

+-----+------------+---------+---------------+
|Seqno|Name        |Data     |Cureated Name  |
+-----+------------+---------+---------------+
|1    |john jones  |[1, 2, 3]|[john, jones]  |
|2    |tracey smith|[4, 5, 6]|[tracey, smith]|
|3    |amy sanders |[7, 8, 9]|[amy, sanders] |
+-----+------------+---------+---------------+



In [13]:
# Same layout as the first sample, but the third row has no Name (null) to exercise null handling.
columns = ["Seqno", "Name", "Data"]
data = [
    ("1", "john jones", [1, 2, 3]),
    ("2", "tracey smith", [4, 5, 6]),
    ("3", None, [7, 8, 9]),
]

df = spark.createDataFrame(data, schema=columns)
df.show(truncate=False)

+-----+------------+---------+
|Seqno|Name        |Data     |
+-----+------------+---------+
|1    |john jones  |[1, 2, 3]|
|2    |tracey smith|[4, 5, 6]|
|3    |null        |[7, 8, 9]|
+-----+------------+---------+



In [14]:
# The third row has a null Name, so the un-guarded splitCase UDF raises inside the
# Python worker. The failure is the point of this cell: catch it and print it so the
# notebook still completes under "Restart Kernel -> Run All".
from pyspark.sql.utils import PythonException

try:
    (
        df.withColumn("Cureated Name", splitCase(col("Name")))
        .show(truncate=False)
    )
except PythonException as exc:
    print(f"splitCase failed on the null Name row:\n{exc}")

22/12/21 00:19:09 ERROR Executor: Exception in task 2.0 in stage 11.0 (TID 23)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/tmp/ipykernel_7999/241544004.py", line 3, in splitCase
AttributeError: 'NoneType' object has no attribute 'split'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:86)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$2.read(PythonUDFRunner.scala:68)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_7999/241544004.py", line 3, in splitCase
AttributeError: 'NoneType' object has no attribute 'split'


In [15]:
@udf(returnType=StringType())
def splitCase(s: str):
    """Split ``s`` on whitespace, tolerating null or empty input.

    Returns:
        ``''`` when ``s`` is ``None`` or empty, otherwise the list produced
        by ``s.split()``. The UDF is declared with ``StringType``; in this
        notebook's output the list shows up as text such as ``[john, jones]``.
    """
    if not s:
        return ''
    return s.split()

In [16]:
# Re-run the same transformation with the null-tolerant splitCase.
(
    df.withColumn("Cureated Name", splitCase(col("Name")))
    .show(truncate=False)
)

+-----+------------+---------+---------------+
|Seqno|Name        |Data     |Cureated Name  |
+-----+------------+---------+---------------+
|1    |john jones  |[1, 2, 3]|[john, jones]  |
|2    |tracey smith|[4, 5, 6]|[tracey, smith]|
|3    |null        |[7, 8, 9]|               |
+-----+------------+---------+---------------+



In [17]:
from pyspark.sql.types import StringType, ArrayType, FloatType

@udf(returnType=ArrayType(FloatType()))
def splitArray(s: list):
    """Scale a numeric array so that its largest element becomes 1.0.

    Args:
        s: The array column value; may be ``None`` or empty.

    Returns:
        A list of floats with every element divided by the array maximum, or
        ``None`` (a null cell) when ``s`` is ``None`` or empty.

    Note:
        A maximum of zero is not guarded against; the division is performed
        by numpy as-is.
    """
    # numpy is imported locally because this cell does not import it at top level.
    import numpy as np

    if not s:
        # The declared return type is an array, so "no data" must be null,
        # not an empty string.
        return None
    values = np.array(s)
    return (values / np.max(values)).tolist()

In [18]:
# Add the max-scaled version of the Data array as a new column.
(
    df.withColumn("Cureated Data", splitArray(col("Data")))
    .show(truncate=False)
)

+-----+------------+---------+----------------------------+
|Seqno|Name        |Data     |Cureated Data               |
+-----+------------+---------+----------------------------+
|1    |john jones  |[1, 2, 3]|[0.33333334, 0.6666667, 1.0]|
|2    |tracey smith|[4, 5, 6]|[0.6666667, 0.8333333, 1.0] |
|3    |null        |[7, 8, 9]|[0.7777778, 0.8888889, 1.0] |
+-----+------------+---------+----------------------------+



In [19]:
import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import ArrayType, FloatType, IntegerType

@udf(returnType=ArrayType(FloatType()))
def normalize(x: list):
    """Scale a numeric array so that its largest element becomes 1.0.

    Args:
        x: The array column value; may be ``None`` or empty.

    Returns:
        A list of floats with every element divided by the array maximum, or
        ``None`` (a null cell) when ``x`` is ``None`` or empty.

    Note:
        A maximum of zero is not guarded against; the division is performed
        by numpy as-is.
    """
    if not x:
        # np.max raises on an empty array and the division fails on None,
        # so map both to a null result.
        return None
    values = np.array(x)
    return (values / np.max(values)).tolist()


@udf(returnType=ArrayType(IntegerType()))
def idx_to_onehot(y: int):
    """Convert a class index into a 10-element one-hot list.

    Args:
        y: Class index, expected in ``0..9``; may be ``None``.

    Returns:
        A list of ten ints with a single ``1`` at position ``y``, or ``None``
        (a null cell) when ``y`` is ``None`` or outside ``0..9``.
    """
    num_classes = 10
    # Compare against None explicitly: a plain truthiness test would also
    # reject the valid class index 0.
    if y is None or not 0 <= y < num_classes:
        # The declared return type is an array, so an invalid index yields
        # null instead of the scalar -1.
        return None
    y_true = np.zeros(num_classes, dtype=np.int32)
    y_true[y] = 1
    return y_true.tolist()

In [20]:
# Sample rows pairing an integer array (x) with an integer class index (y).
columns = ["x", "y"]
data = [
    ([1, 2, 3], 3),
    ([4, 5, 6], 4),
    ([7, 8, 9], 1),
]

df = spark.createDataFrame(data, schema=columns)
df.show(truncate=False)

+---------+---+
|x        |y  |
+---------+---+
|[1, 2, 3]|3  |
|[4, 5, 6]|4  |
|[7, 8, 9]|1  |
+---------+---+



In [21]:
# Derive the scaled feature array from x and the one-hot label from y.
df = (
    df
    .withColumn("feature", normalize(f.col("x")))
    .withColumn("label", idx_to_onehot(f.col("y")))
)
df.show(truncate=False)

+---------+---+----------------------------+------------------------------+
|x        |y  |feature                     |label                         |
+---------+---+----------------------------+------------------------------+
|[1, 2, 3]|3  |[0.33333334, 0.6666667, 1.0]|[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]|
|[4, 5, 6]|4  |[0.6666667, 0.8333333, 1.0] |[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]|
|[7, 8, 9]|1  |[0.7777778, 0.8888889, 1.0] |[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]|
+---------+---+----------------------------+------------------------------+



## Note:
- A PySpark UDF (User Defined Function) wraps a Python function so that it can be reused in Spark column expressions.
- Once a UDF is created, it can be re-used on multiple DataFrames and, after registering it, in SQL.
- The default return type of `udf()` is `StringType`.
- You need to handle nulls explicitly; otherwise the UDF raises at runtime (as the `splitCase` example above shows) or produces unexpected results.

## Schema

In [22]:
from pyspark.sql.types import StructType, StructField, IntegerType

In [23]:
# Show the JSON form of each field definition as a tuple of dicts.
tuple(
    field.jsonValue()
    for field in (
        StructField("x", ArrayType(FloatType())),
        StructField("y", IntegerType()),
        StructField("timestamp", StringType()),
    )
)

({'name': 'x',
  'type': {'type': 'array', 'elementType': 'float', 'containsNull': True},
  'nullable': True,
  'metadata': {}},
 {'name': 'y', 'type': 'integer', 'nullable': True, 'metadata': {}},
 {'name': 'timestamp', 'type': 'string', 'nullable': True, 'metadata': {}})

In [24]:
# Example registry entry for the "mnist" entity. Its "properties" value is a
# struct description with one entry per field (x, y, timestamp).
response = {
    "subject": "entity-mnist",
    "type": "entity",
    "name": "mnist",
    "key": "",
    "properties": {
        "type": "struct",
        "fields": [
            {
                "name": "x",
                "type": {"type": "array", "elementType": "float", "containsNull": True},
                "nullable": True,
                "metadata": {},
            },
            {"name": "y", "type": "integer", "nullable": True, "metadata": {}},
            {"name": "timestamp", "type": "string", "nullable": True, "metadata": {}},
        ],
    },
    "id": "63a183bff06069fd9982a5b6",
}

In [25]:
# Rebuild a Spark StructType from the "properties" part of the response.
schema_json = response["properties"]
StructType.fromJson(schema_json)

StructType([StructField('x', ArrayType(FloatType(), True), True), StructField('y', IntegerType(), True), StructField('timestamp', StringType(), True)])

In [26]:
import os

import pymongo

# Credentials must not be stored in the notebook. Export the full connection
# string before starting the kernel, e.g.
#   export REGISTRY_DB_URI="mongodb://<user>:<password>@localhost:27017/registry"
db_uri = os.environ.get("REGISTRY_DB_URI")
if not db_uri:
    raise RuntimeError(
        "Set the REGISTRY_DB_URI environment variable to the MongoDB connection string."
    )

client = pymongo.MongoClient(db_uri)
db = client["registry"]
collection = db["featuregroup-mnist"]

In [30]:
# Select documents whose timestamp string lies between the two bounds (both inclusive).
query = {"timestamp": {"$gte": "2022-12-20T17:17:31", "$lte": "2022-12-20T17:17:41"}}

In [31]:
import pandas as pd
# Fetch the matching documents without their Mongo "_id" and load them into pandas.
records = list(collection.find(query, {"_id": 0}))
df = pd.DataFrame(records)

In [32]:
# Display the pandas DataFrame built from the MongoDB query results.
df

Unnamed: 0,x,y,timestamp,feature,label
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",5,2022-12-20T17:17:31.467910,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,2022-12-20T17:17:31.469386,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,2022-12-20T17:17:31.469734,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,2022-12-20T17:17:31.469965,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9,2022-12-20T17:17:31.470246,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
...,...,...,...,...,...
6047,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6,2022-12-20T17:17:40.988628,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]"
6048,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3,2022-12-20T17:17:40.988956,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
6049,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,2022-12-20T17:17:40.989274,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]"
6050,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7,2022-12-20T17:17:40.989665,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"


In [33]:
# Close the MongoDB client now that the query results have been loaded into pandas.
client.close()