In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from afinn import Afinn

In [2]:
# Obtain the SparkSession explicitly instead of relying on a variable that
# only exists when the kernel/shell pre-creates it. getOrCreate() hands back
# the already-running session if there is one, so this is safe in both cases.
spark = SparkSession.builder.getOrCreate()
spark

In [3]:
# Take the SparkContext from the session rather than depending on a
# kernel-injected `sc` variable, then display it.
sc = spark.sparkContext
sc

In [4]:
# (1) Define a normal Python function and match arguments to your UDF
model = Afinn()  # single scorer instance shared by every call of the function below


def score_message_py(msg):
    """Score one message with the module-level Afinn model.

    Args:
        msg: Text of the message, or None when the source row has no message.

    Returns:
        The value of ``model.score(msg)`` converted to ``float``, or None when
        ``msg`` is None so that a null row yields a null score instead of
        failing the whole Spark task.
    """
    # Null cells reach a Python UDF as None; pass the null through unscored.
    if msg is None:
        return None
    # `model` is only read here, so no `global` declaration is required.
    # float() keeps the result type stable for a DoubleType UDF declaration.
    return float(model.score(msg))

In [5]:
# (2) Register UDF function
# The scorer returns a number, so the UDF is declared as DoubleType. Declaring
# it as a string column forces Spark to cast the value back to DOUBLE before
# it can be averaged.
score_message = udf(score_message_py, DoubleType())  # for DataFrame transformation API
spark.udf.register("score_message", score_message_py, DoubleType())  # for SQL

<function __main__.score_message_py(msg)>

In [6]:
# Get average user sentiment scores: start by loading the messages CSV.
# The first line of the file is the header; column types are inferred.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("hdfs://devenv/user/spark/spark_sql_101/messages/data")
)

In [7]:
# (i) Using DF transformations
# Step 1: keep the user and attach the UDF result for each message as "score".
scored = df.select(
    col("user"),
    score_message(col("message")).alias("score"),
)
# Step 2: average the per-message scores for every user.
result = scored.groupBy(scored["user"]).agg(avg(scored["score"]))
result.show(10000)

+---------------+-------------------+
|           user|         avg(score)|
+---------------+-------------------+
|     megan_rice|               -4.0|
|        Daniiej|               -2.0|
|         MeghTW|               -1.0|
|   candicebunny|                0.0|
|stranger_danger|                0.0|
|  divingkid2001|                0.0|
|    Lilli_Allen|                0.0|
|        caaaami|               -2.0|
|       J_Moneyy|               -1.0|
|        SoEdith|               -2.5|
|     convoy3571|                0.0|
|       kyrabeth|               -2.0|
|      kateblogs|               -2.0|
|    lovelylivxo|               -2.0|
|       irlbinky|                1.0|
|        Ste1987|-0.3333333333333333|
|       squintoo|               -3.0|
|     PhantomV48|               -1.0|
|        sophizz|                7.0|
|      tink68113|               -2.0|
|   melliejellie|               -0.5|
|        Merlene|                0.0|
|  marybethbeech|                1.0|
|     Svalen

In [8]:
# (ii) Using SQL
# Expose the DataFrame to SQL under the name "messages", then run the same
# per-user average through the UDF registered for SQL use.
df.createOrReplaceTempView("messages")
avg_score_query = "select user, avg(score_message(message)) from messages group by user"
result = spark.sql(avg_score_query)
result.show(10000)

+---------------+-------------------------------------------+
|           user|avg(CAST(score_message(message) AS DOUBLE))|
+---------------+-------------------------------------------+
|     megan_rice|                                       -4.0|
|        Daniiej|                                       -2.0|
|         MeghTW|                                       -1.0|
|   candicebunny|                                        0.0|
|stranger_danger|                                        0.0|
|  divingkid2001|                                        0.0|
|    Lilli_Allen|                                        0.0|
|        caaaami|                                       -2.0|
|       J_Moneyy|                                       -1.0|
|        SoEdith|                                       -2.5|
|     convoy3571|                                        0.0|
|       kyrabeth|                                       -2.0|
|      kateblogs|                                       -2.0|
|    lov