In [1]:
# SparkSession setup: create (or reuse) a session on the 'local' master
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master('local')
    .appName('test-app')
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

In [3]:
# Schema for the CSV loaded in the next cell: a string `name` and an integer `age`.
# NOTE(review): the original comment labelled this "Trip Data" and the variable
# is still called tripSchema, but the fields describe people, not trips.
# NOTE(review): `name` is declared non-nullable here, yet the printSchema()
# output further down reports it as nullable = true, so the CSV read does not
# appear to enforce this flag -- confirm before relying on it.
tripSchema = StructType(
    fields=[
        StructField(name='name', dataType=StringType(), nullable=False),
        StructField(name='age', dataType=IntegerType(), nullable=True),
    ]
)

In [4]:
# Location of the input file, relative to the notebook's working directory
path = "./data/"
name_data = "human1.csv"

# Read the CSV with the explicit schema; the first line of the file is a header
df_name = (
    spark.read
    .schema(tripSchema)
    .option("header", True)
    .csv(f"{path}{name_data}")
)

In [5]:
# Print the loaded DataFrame as a text table to eyeball the parsed rows
df_name.show()

+---------------+---+
|           name|age|
+---------------+---+
|     Bill Gates| 67|
|      Elon Musk| 51|
|Mark Zuckerberg| 40|
|   Park Ji-Sung| 42|
|  Son Heung-min| 30|
|     Kim Minjae| 26|
|    Lee Jaeyong| 54|
| Warren Buffett| 94|
+---------------+---+



In [6]:
# Print the column names, types and nullability Spark resolved for df_name
df_name.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [7]:
# Expose df_name to Spark SQL as a temporary view named 'age'.
# NOTE(review): the view shares its name with the `age` column, so queries read
# as `SELECT ... age ... FROM age`; a distinct view name may be clearer.
df_name.createOrReplaceTempView('age')

In [8]:
# UDF 1: age bucket
def age_category(age):
    """Map an age to a coarse age-group label.

    Args:
        age: Age as a number, or None for a missing value.

    Returns:
        'young' when age < 35, 'adult' when 35 <= age < 60, 'senior'
        otherwise. Returns None when age is None.
    """
    # The `age` column is nullable, and comparing None with an int raises
    # TypeError in Python 3; propagate the missing value instead of crashing.
    if age is None:
        return None
    if age < 35:
        return 'young'
    elif age < 60:
        return 'adult'
    else:
        return 'senior'

In [9]:
# Register the UDF so it can be called by name from Spark SQL.
# The return type is spelled out; StringType is what register() uses when
# none is given for a plain Python function.
spark.udf.register('age_category', age_category, returnType=StringType())

<function __main__.age_category(age)>

In [10]:
# Call the registered age_category UDF on each row of the 'age' view via SQL
spark.sql("SELECT name,age_category(age) FROM age").show()

+---------------+-----------------+
|           name|age_category(age)|
+---------------+-----------------+
|     Bill Gates|           senior|
|      Elon Musk|            adult|
|Mark Zuckerberg|            adult|
|   Park Ji-Sung|            adult|
|  Son Heung-min|            young|
|     Kim Minjae|            young|
|    Lee Jaeyong|            adult|
| Warren Buffett|           senior|
+---------------+-----------------+

