In [12]:
# A UDF (user-defined function) is a custom function that can be called from inside SQL statements.
## Package imports
from pyspark.sql import SparkSession

# Stop a session left over from an earlier run of this notebook, if there is one.
# On a fresh kernel the name `spark` does not exist yet, so an unconditional
# `spark.stop()` would raise NameError and break "Restart Kernel -> Run All".
try:
    spark.stop()
except NameError:
    pass

# Build (or reuse) the session used by every cell below.
spark = SparkSession.builder \
    .appName('udf') \
    .config("spark.ui.reverseProxyUrl", "http://localhost:4050") \
    .getOrCreate()

23/06/05 11:36:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# 1. 가상의 데이터 만들기

In [13]:
# Column names, in the same order as the fields of each row tuple.
schema = ['name', 'datetime', 'price', 'currency']

# Sample orders. Every order is priced in the same currency, so the currency
# code is appended while the rows are built. The first order appears twice.
transactions = [
    (menu, ordered_at, amount, 'KRW')
    for menu, ordered_at, amount in (
        ('찹쌀탕수육+짜장', '2021-11-07 13:20:00', 22000),
        ('찹쌀탕수육+짜장', '2021-11-07 13:20:00', 22000),
        ('월남 쌈 2인 세트', '2021-07-25 11:12:40', 42000),
        ('콩국수_열무비빔국수', '2021-07-10 08:20:00', 21250),
        ('장어소금+고추장구이', '2021-07-01 05:36:00', 68700),
        ('족발', '2020-08-19 19:04:00', 32000),
    )
]

In [14]:
# Build the DataFrame from the row tuples, naming the columns from `schema`.
df = spark.createDataFrame(data=transactions, schema=schema)

In [15]:
# Expose the DataFrame to Spark SQL as a temporary view named 'transactions'.
df.createOrReplaceTempView('transactions')

In [16]:
# Query the temporary view through SQL and print its rows.
spark.sql('SELECT * FROM transactions').show()

                                                                                

+-------------------+-------------------+-----+--------+
|               name|           datetime|price|currency|
+-------------------+-------------------+-----+--------+
|    찹쌀탕수육+짜장|2021-11-07 13:20:00|22000|     KRW|
|    찹쌀탕수육+짜장|2021-11-07 13:20:00|22000|     KRW|
|   월남 쌈 2인 세트|2021-07-25 11:12:40|42000|     KRW|
|콩국수_열무비빔국수|2021-07-10 08:20:00|21250|     KRW|
|장어소금+고추장구이|2021-07-01 05:36:00|68700|     KRW|
|               족발|2020-08-19 19:04:00|32000|     KRW|
+-------------------+-------------------+-----+--------+



# 2. UDF 만들기

In [18]:
# UDF
def squared(n):
    """Return n multiplied by itself."""
    return n * n

# Register the function under the SQL name "squared" (no return type is passed here)
spark.udf.register("squared", squared)

<function __main__.squared(n)>

In [19]:
# Inspect the result schema to see which type the UDF column is given.
spark.sql("SELECT price, squared(price) FROM transactions").printSchema()

root
 |-- price: long (nullable = true)
 |-- squared(price): string (nullable = true)



LONG(`price`)을 받아서 STRING을 리턴하네? 
- `spark.udf.register`에 리턴 타입을 별도로 지정하지 않으면 default로 string을 리턴함

### 1) 리턴 데이터 타입 바꿔주기

In [22]:
# Specify the return data type explicitly
from pyspark.sql.types import LongType

def squared(n):
    """Return n multiplied by itself."""
    return n * n

# Register again under the same SQL name, this time passing LongType() as the return type
spark.udf.register("squared", squared, LongType())

23/06/05 11:53:55 WARN SimpleFunctionRegistry: The function squared replaced a previously registered function.


<function __main__.squared(n)>

In [23]:
# Inspect the schema again now that the UDF was registered with LongType().
spark.sql("SELECT price, squared(price) FROM transactions").printSchema()

root
 |-- price: long (nullable = true)
 |-- squared(price): long (nullable = true)



#### (1) 파이썬 데코레이터(`@udf`)로 정의하기

In [24]:
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

# Wrap the function with the `udf` decorator, declaring 'long' as its return type.
# NOTE(review): this cell only rebinds the Python name `squared`; it does not call
# spark.udf.register, so SQL `squared(...)` presumably still resolves to the
# function registered in an earlier cell — confirm.
@udf('long')
def squared(n):
    """Return n multiplied by itself."""
    return n * n


In [25]:
# Call the decorator-defined UDF through the DataFrame API. The `@udf` decorator
# does not add anything to the SQL function registry, so a SQL query here would
# only exercise the `squared` registered earlier via spark.udf.register and
# would not test the decorated function at all.
df.select("price", squared("price")).printSchema()

root
 |-- price: long (nullable = true)
 |-- squared(price): long (nullable = true)



### 2) 가격을 한국어로 읽어주는 함수를 만들어보자

In [29]:
def read_number(n):
    """Spell an integer out in Sino-Korean numerals.

    Digits are read in groups of four (만, 억, 조, 경), and a leading 1 is
    always written out (e.g. 1000 -> "일천", 10000 -> "일만").

    Args:
        n: Integer to read, or None (a SQL NULL arrives in a Python UDF as None).

    Returns:
        The Korean reading as a string; "영" for 0; the reading prefixed with
        "마이너스 " for negative values; None when n is None.

    Raises:
        ValueError: If abs(n) is 10**20 or larger (no unit beyond 경 is defined).
    """
    small_units = ("", "십", "백", "천")          # place names inside one 4-digit group
    large_units = ("", "만", "억", "조", "경")    # name of each successive 4-digit group
    digit_names = '일이삼사오육칠팔구'

    # Propagate NULL instead of failing on the comparison below.
    if n is None:
        return None
    if n == 0:
        return "영"

    sign = "마이너스 " if n < 0 else ""
    remaining = abs(n)

    groups = []
    group_index = 0
    while remaining > 0:
        remaining, group = divmod(remaining, 10000)
        if group > 0:
            if group_index >= len(large_units):
                raise ValueError("number is too large to read: %r" % (n,))
            parts = []
            place = 0
            while group > 0:
                group, digit = divmod(group, 10)
                if digit > 0:
                    parts.append(digit_names[digit - 1] + small_units[place])
                place += 1
            # Digits were collected least-significant first, so reverse them.
            groups.append("".join(reversed(parts)) + large_units[group_index])
        group_index += 1
    return sign + "".join(reversed(groups))


print(read_number(21250))
print(read_number(68700))

이만일천이백오십
육만팔천칠백


In [30]:
# Register read_number as a SQL UDF (no return type is passed)
spark.udf.register('read_number', read_number)

<function __main__.read_number(n)>

In [31]:
# Show each price next to its Korean reading produced by the UDF.
spark.sql("SELECT price, read_number(price) FROM transactions").show()

[Stage 3:>                                                          (0 + 1) / 1]                                                                                

+-----+------------------+
|price|read_number(price)|
+-----+------------------+
|22000|          이만이천|
|22000|          이만이천|
|42000|          사만이천|
|21250|  이만일천이백오십|
|68700|      육만팔천칠백|
|32000|          삼만이천|
+-----+------------------+



### 3) 날짜를 보고 요일을 찾아보자

In [34]:
def get_weekday(date):
    """Return the weekday name for a date, taken from ``calendar.day_name``.

    Args:
        date: Object exposing ``weekday()`` (e.g. ``datetime.date`` or
            ``datetime.datetime``), or None.

    Returns:
        The matching ``calendar.day_name`` entry (e.g. "Sunday" under the
        default locale), or None when ``date`` is None.
    """
    import calendar

    # A SQL NULL (for example from a date string that could not be parsed)
    # arrives here as None; pass it through instead of raising AttributeError.
    if date is None:
        return None
    return calendar.day_name[date.weekday()]

# Register get_weekday under the same name so SQL queries can call it.
spark.udf.register('get_weekday', get_weekday)

23/06/05 15:31:00 WARN SimpleFunctionRegistry: The function get_weekday replaced a previously registered function.


<function __main__.get_weekday(date)>

In [36]:
# Convert the datetime string to a DATE in SQL, then pass it to the
# get_weekday UDF and expose the result as `day_of_week`.
query = """
SELECT
    datetime
    , get_weekday(TO_DATE(datetime)) as day_of_week
FROM 
    transactions
"""

spark.sql(query).show()


+-------------------+-----------+
|           datetime|day_of_week|
+-------------------+-----------+
|2021-11-07 13:20:00|     Sunday|
|2021-11-07 13:20:00|     Sunday|
|2021-07-25 11:12:40|     Sunday|
|2021-07-10 08:20:00|   Saturday|
|2021-07-01 05:36:00|   Thursday|
|2020-08-19 19:04:00|  Wednesday|
+-------------------+-----------+

