# UDF(User Defined Function)
- 사용자가 정의하는 `데이터 변환` 함수
- 데이터프레임에서 사용이 가능. SQL에서도 사용이 가능!

In [2]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session for the application named "udf".
spark = (
    SparkSession.builder
    .appName("udf")
    .getOrCreate()
)

# Sample order records; each tuple is (name, datetime, price, currency).
transactions = [
    ("찹쌀탕수육+짜장2", "2021-11-07 13:20:00", 22000, "KRW"),
    ("등심탕수육+크립새우+짜장면", "2021-10-24 11:19:00", 21500, "KRW"),
    ("월남 쌈 2인 세트", "2021-07-25 11:12:40", 42000, "KRW"),
    ("콩국수+열무비빔국수", "2021-07-10 08:20:00", 21250, "KRW"),
    ("장어소금+고추장구이", "2021-07-01 05:36:00", 68700, "KRW"),
    ("족발", "2020-08-19 19:04:00", 32000, "KRW"),
]

# Column names, listed in the same order as the tuple fields above.
schema = ["name", "datetime", "price", "currency"]

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/29 14:40:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Build a DataFrame from the sample rows and expose it to Spark SQL under the
# view name "transactions".
df = spark.createDataFrame(transactions, schema)
df.createOrReplaceTempView("transactions")

UDF를 만든다
- 분산 병렬 처리 환경에서 사용할 수 있는 함수 만들기(Worker에서 작동하는 함수)
- 리턴 타입을 따로 지정하지 않으면 기본적으로 String 리턴

In [4]:
# Preview the first five rows of the temp view through Spark SQL.
query = "select * from transactions;"
spark.sql(query).show(n=5)

                                                                                

+--------------------------+-------------------+-----+--------+
|                      name|           datetime|price|currency|
+--------------------------+-------------------+-----+--------+
|          찹쌀탕수육+짜장2|2021-11-07 13:20:00|22000|     KRW|
|등심탕수육+크립새우+짜장면|2021-10-24 11:19:00|21500|     KRW|
|          월남 쌈 2인 세트|2021-07-25 11:12:40|42000|     KRW|
|       콩국수+열무비빔국수|2021-07-10 08:20:00|21250|     KRW|
|       장어소금+고추장구이|2021-07-01 05:36:00|68700|     KRW|
+--------------------------+-------------------+-----+--------+
only showing top 5 rows



In [5]:
from pyspark.sql.types import LongType

def squared(n):
    """Return ``n`` multiplied by itself.

    Intended for use as a Spark UDF, where a SQL NULL reaches the Python
    function as ``None``. Multiplying ``None`` raises ``TypeError`` and fails
    the whole task, so NULL input is passed through as NULL instead.

    Args:
        n: A number, or ``None`` for a missing value.

    Returns:
        ``n * n``, or ``None`` when ``n`` is ``None``.
    """
    if n is None:
        return None
    return n * n

In [7]:
# register(<name used from SQL on the workers>,
#          <function defined on the driver>,
#          <return type>)
spark.udf.register(name="squared", f=squared, returnType=LongType())

<function __main__.squared(n)>

In [8]:
# Call the registered "squared" UDF from SQL, next to the raw price column.
spark.sql("""
SELECT price, squared(price)
FROM transactions;
""").show(n=5)

                                                                                

+-----+--------------+
|price|squared(price)|
+-----+--------------+
|22000|     484000000|
|21500|     462250000|
|42000|    1764000000|
|21250|     451562500|
|68700|    4719690000|
+-----+--------------+
only showing top 5 rows



In [9]:
def read_number(n):
    """Spell out a non-negative integer in Korean (Hangul) numerals.

    Every non-zero digit is written explicitly, including a leading "일"
    (e.g. 21500 -> "이만일천오백"). Digits are read in groups of four, each
    group followed by its large unit (만, 억, 조, 경, 해), which is how Korean
    numerals are grouped. A single flat unit table would run out of entries
    at 100000 and fail with ``IndexError``.

    Args:
        n: An integer, or ``None`` for a missing value (SQL NULL in a UDF).

    Returns:
        The Hangul reading as a string; "영" for 0; ``None`` when ``n`` is
        ``None``. Negative values are not supported and yield an empty string.

    Raises:
        ValueError: If ``n`` is 10**24 or larger (no unit defined).
    """
    if n is None:
        return None
    if n == 0:
        return "영"

    digits = "일이삼사오육칠팔구"
    small_units = ["", "십", "백", "천"]          # positions inside a 4-digit group
    big_units = ["", "만", "억", "조", "경", "해"]  # one per 4-digit group

    groups = []
    group_index = 0
    while n > 0:
        n, group = divmod(n, 10000)
        if group > 0:
            if group_index >= len(big_units):
                raise ValueError("read_number supports values below 10**24")
            parts = []
            position = 0
            while group > 0:
                group, digit = divmod(group, 10)
                if digit > 0:
                    parts.append(digits[digit - 1] + small_units[position])
                position += 1
            # Digits were collected least-significant first.
            groups.append("".join(reversed(parts)) + big_units[group_index])
        group_index += 1
    return "".join(reversed(groups))

In [10]:
# No return type is passed, so the UDF is registered with the default
# (string) return type.
spark.udf.register(name="read_number", f=read_number)

<function __main__.read_number(n)>

In [20]:
# Show each price next to its Hangul reading produced by the UDF.
spark.sql("""
SELECT price, read_number(price)
FROM transactions;
""").show(n=5)

+-----+------------------+
|price|read_number(price)|
+-----+------------------+
|22000|          이만이천|
|21500|      이만일천오백|
|42000|          사만이천|
|21250|  이만일천이백오십|
|68700|      육만팔천칠백|
+-----+------------------+
only showing top 5 rows



In [21]:
def get_weekday(date):
    """Return the weekday name (e.g. ``"Sunday"``) for ``date``.

    Intended for use as a Spark UDF. ``TO_DATE`` produces NULL for values it
    cannot parse, and NULL reaches Python as ``None``, so missing dates are
    passed through as ``None`` instead of raising ``AttributeError``.

    Args:
        date: An object with a ``weekday()`` method (``datetime.date`` or
            ``datetime.datetime``), or ``None``.

    Returns:
        The entry of ``calendar.day_name`` for that weekday, or ``None`` when
        ``date`` is ``None``.
    """
    # Imported inside the function so it is self-contained when it is shipped
    # to and executed by a Spark Python worker.
    import calendar

    if date is None:
        return None
    return calendar.day_name[date.weekday()]

# Register the weekday UDF (default string return type), then apply it in SQL
# to the date part of each transaction timestamp.
spark.udf.register(name="get_weekday", f=get_weekday)

query = """
SELECT datetime, get_weekday(TO_DATE(datetime)) as day_of_week
FROM transactions;
"""
spark.sql(query).show(n=5)

+-------------------+-----------+
|           datetime|day_of_week|
+-------------------+-----------+
|2021-11-07 13:20:00|     Sunday|
|2021-10-24 11:19:00|     Sunday|
|2021-07-25 11:12:40|     Sunday|
|2021-07-10 08:20:00|   Saturday|
|2021-07-01 05:36:00|   Thursday|
+-------------------+-----------+
only showing top 5 rows



In [22]:
# Load the Titanic training CSV: the first line is treated as the header and
# column types are inferred from the data.
filepath = "/home/ubuntu/working/spark-examples/data/titanic_train.csv"
titanic_sdf = spark.read.csv(filepath, header=True, inferSchema=True)

titanic_sdf.show(n=5)

                                                                                

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+

In [23]:
def get_category(age):
    """Map an age in years to a coarse age-group label.

    Each bound is inclusive: an age of exactly 5 is 'Baby', exactly 12 is
    'Child', and so on. Anything above 60 is 'Elderly'.

    Missing ages return ``None`` rather than a label. ``None`` would raise
    ``TypeError`` on comparison, and NaN fails every ``<=`` test, so it would
    otherwise be mislabelled 'Elderly'.

    Args:
        age: Age as an int or float, or ``None`` / NaN when unknown.

    Returns:
        One of 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult',
        'Adult', 'Elderly', or ``None`` when the age is missing.
    """
    # NaN is the only value that is not equal to itself.
    if age is None or age != age:
        return None

    # (inclusive upper bound, label), in ascending order.
    upper_bounds = (
        (5, 'Baby'),
        (12, 'Child'),
        (18, 'Teenager'),
        (25, 'Student'),
        (35, 'Young Adult'),
        (60, 'Adult'),
    )
    for limit, label in upper_bounds:
        if age <= limit:
            return label
    return 'Elderly'

In [24]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# To use a Python function on DataFrame columns it has to be wrapped with
# udf(). The function object is passed directly; wrapping it in a lambda
# that only forwards its argument adds nothing.
udf_get_category = udf(get_category, StringType())
udf_get_category

<function __main__.<lambda>(x)>

In [31]:
# Replace missing Age values with the mean of the Age column.
import pyspark.sql.functions as F

avg_age = titanic_sdf.select(F.avg("Age"))
# The aggregate result is a single row holding a single value.
(avg_age_row_val,) = avg_age.first()
titanic_sdf = titanic_sdf.fillna(avg_age_row_val, subset=["Age"])

In [32]:
# Derive an AgeCategory column by applying the UDF to Age, then display it.
(
    titanic_sdf
    .withColumn("AgeCategory", udf_get_category(col("Age")))
    .show()
)

+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+-----------+
|PassengerId|Survived|Pclass|                Name|   Sex|              Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|AgeCategory|
+-----------+--------+------+--------------------+------+-----------------+-----+-----+----------------+-------+-----+--------+-----------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|             22.0|    1|    0|       A/5 21171|   7.25| null|       S|    Student|
|          2|       1|     1|Cumings, Mrs. Joh...|female|             38.0|    1|    0|        PC 17599|71.2833|  C85|       C|      Adult|
|          3|       1|     3|Heikkinen, Miss. ...|female|             26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|Young Adult|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|             35.0|    1|    0|          113803|   53.1| C123|       S|Young Adult|
|          5|       

In [33]:
# Stop the Spark session.
spark.stop()