In [3]:
# SQL문 안에서 사용할 수 있는 함수

from pyspark.sql import SparkSession

# Start (or reuse) a local Spark session with 4 worker threads for the UDF demo.
spark = (
    SparkSession.builder
    .master('local[4]')
    .appName('udf')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/09 19:12:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/11/09 19:12:02 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
from pyspark.sql.types import StringType, DateType, IntegerType, StructType, StructField

# Sample orders as (menu name, order timestamp, price, currency) tuples.
transactions = [
    ('찹쌀탕수육+짜장2', '2021-11-07 13:20:00', 22000, 'KRW'),
    ('등심탕수육+크립새우+짜장면', '2021-10-24 11:19:00', 21500, 'KRW'),
    ('월남 쌈 2인 세트', '2021-07-25 11:12:40', 42000, 'KRW'),
    ('콩국수+열무비빔국수', '2021-07-10 08:20:00', 21250, 'KRW'),
    ('장어소금+고추장구이', '2021-07-01 05:36:00', 68700, 'KRW'),
    ('족발', '2020-08-19 19:04:00', 32000, 'KRW'),
]

# Column layout for the tuples above. Every field is declared nullable, and
# the timestamp is kept as a plain string rather than a date/timestamp type.
transactions_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ('name', StringType()),
        ('datetime', StringType()),
        ('price', IntegerType()),
        ('currency', StringType()),
    )
])

# Build the DataFrame from the in-memory rows and preview it.
transaction_df = spark.createDataFrame(data=transactions, schema=transactions_schema)
transaction_df.show()

                                                                                

+--------------------------+-------------------+-----+--------+
|                      name|           datetime|price|currency|
+--------------------------+-------------------+-----+--------+
|          찹쌀탕수육+짜장2|2021-11-07 13:20:00|22000|     KRW|
|등심탕수육+크립새우+짜장면|2021-10-24 11:19:00|21500|     KRW|
|          월남 쌈 2인 세트|2021-07-25 11:12:40|42000|     KRW|
|       콩국수+열무비빔국수|2021-07-10 08:20:00|21250|     KRW|
|       장어소금+고추장구이|2021-07-01 05:36:00|68700|     KRW|
|                      족발|2020-08-19 19:04:00|32000|     KRW|
+--------------------------+-------------------+-----+--------+



In [9]:
# Register the DataFrame as a temporary view named 'transaction' so it can be
# queried with spark.sql(); an existing view of the same name is replaced.
transaction_df.createOrReplaceTempView('transaction')

In [11]:
# Read every column and row back from the 'transaction' view using plain SQL.
spark.sql(
    """
    SELECT *
    FROM transaction
    """
).show()

+--------------------------+-------------------+-----+--------+
|                      name|           datetime|price|currency|
+--------------------------+-------------------+-----+--------+
|          찹쌀탕수육+짜장2|2021-11-07 13:20:00|22000|     KRW|
|등심탕수육+크립새우+짜장면|2021-10-24 11:19:00|21500|     KRW|
|          월남 쌈 2인 세트|2021-07-25 11:12:40|42000|     KRW|
|       콩국수+열무비빔국수|2021-07-10 08:20:00|21250|     KRW|
|       장어소금+고추장구이|2021-07-01 05:36:00|68700|     KRW|
|                      족발|2020-08-19 19:04:00|32000|     KRW|
+--------------------------+-------------------+-----+--------+



In [15]:
from pyspark.sql.types import LongType
# UDF (1)

def squared(n):
    """Return ``n`` raised to the power of two.

    Intended to be registered as a Spark SQL UDF. The ``price`` column it is
    applied to is declared nullable, and a SQL NULL reaches a Python UDF as
    ``None``, so ``None`` is passed through instead of raising ``TypeError``.

    Args:
        n: A number, or ``None`` for a NULL input.

    Returns:
        ``n ** 2``, or ``None`` when ``n`` is ``None``.
    """
    if n is None:
        return None
    return n ** 2

# A Python function must be registered before it can be called from SQL.
# - The third argument sets the UDF's return type (the default is String).
spark.udf.register('squared', squared, LongType())

# Call the registered UDF by its SQL name next to the raw price column.
spark.sql(
    """
    SELECT price, squared(price)
    FROM transaction
    """
).show()

23/11/09 19:21:09 WARN SimpleFunctionRegistry: The function squared replaced a previously registered function.


+-----+--------------+
|price|squared(price)|
+-----+--------------+
|22000|     484000000|
|21500|     462250000|
|42000|    1764000000|
|21250|     451562500|
|68700|    4719690000|
|32000|    1024000000|
+-----+--------------+



In [17]:
# Print the schema of the same query to check the type of the UDF column.
spark.sql(
    """
    SELECT price, squared(price)
    FROM transaction
    """
).printSchema()

root
 |-- price: integer (nullable = true)
 |-- squared(price): long (nullable = true)



In [40]:
# UDF (2): register a second plain Python function under a SQL name
from pyspark.sql.functions import udf

# NOTE: this stays a plain function rather than being wrapped with
# @udf(returnType=LongType()). It is registered through spark.udf.register
# with an explicit return type, and register() rejects a return type when it
# is handed an already-wrapped UDF.
def multiply_10(n):
    """Return ``n`` multiplied by ten.

    Intended to be registered as a Spark SQL UDF. A SQL NULL reaches a Python
    UDF as ``None``, so ``None`` is passed through instead of raising
    ``TypeError``.

    Args:
        n: A number, or ``None`` for a NULL input.

    Returns:
        ``n * 10``, or ``None`` when ``n`` is ``None``.
    """
    if n is None:
        return None
    return n * 10
# Expose multiply_10 to SQL under the same name, with a long return type.
spark.udf.register('multiply_10', multiply_10, LongType())

# Call the registered UDF by its SQL name next to the raw price column.
spark.sql(
    """
    SELECT price, multiply_10(price)
    FROM transaction
    """
).show()

+-----+------------------+
|price|multiply_10(price)|
+-----+------------------+
|22000|            220000|
|21500|            215000|
|42000|            420000|
|21250|            212500|
|68700|            687000|
|32000|            320000|
+-----+------------------+



In [None]:
def read_number(n):
    """Spell out a non-negative integer in Sino-Korean numerals.

    The number is split into groups of four digits. Inside a group each
    non-zero digit is followed by its place unit (십, 백, 천), and each
    non-empty group is followed by its myriad unit (만, 억, 조, 경, 해).
    A leading 일 is always written out, e.g. 21250 -> "이만일천이백오십".

    Args:
        n: A non-negative integer, or ``None`` (e.g. a NULL passed in by a
            Spark UDF call).

    Returns:
        The reading as a string; "영" for 0; ``None`` when ``n`` is ``None``.
        Negative values give an empty string.

    Raises:
        ValueError: If ``n`` is too large for the supported myriad units
            (10**24 or more).
    """
    if n is None:
        return None
    if n == 0:
        return "영"

    digit_units = ["", "십", "백", "천"]
    myriad_units = ["", "만", "억", "조", "경", "해"]
    nums = '일이삼사오육칠팔구'

    groups = []
    group_index = 0
    while n > 0:
        n, chunk = divmod(n, 10000)
        if chunk > 0:
            if group_index >= len(myriad_units):
                raise ValueError("number is too large to read")
            parts = []
            place = 0
            while chunk > 0:
                chunk, r = divmod(chunk, 10)
                if r > 0:
                    parts.append(nums[r - 1] + digit_units[place])
                place += 1
            # Digits were collected lowest place first, so flip them before
            # attaching the group's myriad unit.
            groups.append("".join(reversed(parts)) + myriad_units[group_index])
        group_index += 1
    return "".join(reversed(groups))

# Quick check using two of the sample prices from the transaction data.
print(read_number(21250))  # expected: 이만일천이백오십
print(read_number(68700))  # expected: 육만팔천칠백

In [None]:
# Expose read_number to SQL. No return type is passed, so the default
# (String) applies, which matches the text this function returns.
spark.udf.register("read_number", read_number)

In [None]:
# The temp view was registered as 'transaction' (singular); querying
# 'transactions' fails because no table or view of that name exists.
spark.sql("SELECT price, read_number(price) FROM transaction").show()

In [None]:
import cal