In [1]:
# Configure the SparkSession: local master, fixed application name.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .master('local')
    .appName('age-type-3-with-sql')
    .getOrCreate()
)

In [2]:
# Sample data: one (name, age, job) tuple per person.
personal_info = [
    ('Taylor Swift', 33, 'Singer-songwriter'),
    ('Gordon Ramsay', 56, 'Chef'),
    ('박찬암', 33, '보안전문가'),
    ('Miranda Cosgrove', 30, 'Actress'),
    ('유재석', 50, '연예인'),
    ('이혜정', 66, '요리연구가'),
    ('Daniel Fenton', 14, 'Student_character')
]


In [3]:
# Import the classes needed to define the schema, then declare it:
# name and age are required columns, job may be null.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
schema = StructType([
    StructField(name='person_name', dataType=StringType(), nullable=False),
    StructField(name='age', dataType=IntegerType(), nullable=False),
    StructField(name='job', dataType=StringType(), nullable=True),
])

In [4]:
# Build the DataFrame from the sample rows using the explicit schema.
df = spark.createDataFrame(personal_info, schema)

In [5]:
# Print the DataFrame contents.
df.show()

+----------------+---+-----------------+
|     person_name|age|              job|
+----------------+---+-----------------+
|    Taylor Swift| 33|Singer-songwriter|
|   Gordon Ramsay| 56|             Chef|
|          박찬암| 33|       보안전문가|
|Miranda Cosgrove| 30|          Actress|
|          유재석| 50|           연예인|
|          이혜정| 66|       요리연구가|
|   Daniel Fenton| 14|Student_character|
+----------------+---+-----------------+



In [6]:
# Print the DataFrame schema.
df.printSchema()

root
 |-- person_name: string (nullable = false)
 |-- age: integer (nullable = false)
 |-- job: string (nullable = true)



In [7]:
# Register the DataFrame as the temporary view 'people' so it can be
# referenced by name in spark.sql() queries.
df.createOrReplaceTempView('people')

In [8]:
def age_category(age):
    """Classify an age in years as 'young', 'adult' or 'senior'.

    Intended for use as a UDF. Bands (both ends inclusive):

        0-34    -> 'young'
        35-59   -> 'adult'
        60-115  -> 'senior'

    The upper limit of 115 is taken from the age of the world's oldest
    person as of January 2023, so 115 itself is treated as a valid age.

    Args:
        age: Age in years, or None (e.g. a NULL value passed to the UDF).

    Returns:
        The category label as a string, or None when ``age`` is None or
        lies outside the 0-115 range.
    """
    # Reject missing and out-of-range values first. Without this guard a
    # negative age would fail the 'young' test but still satisfy `age < 60`
    # and be mislabelled 'adult', and None would raise TypeError on comparison.
    if age is None or age < 0 or age > 115:
        return None
    if age < 35:
        return 'young'
    if age < 60:
        return 'adult'
    return 'senior'


In [9]:
# Register age_category under the SQL name 'age_category' so it can be
# called from spark.sql() queries.
from pyspark.sql.functions import udf
spark.udf.register(name='age_category', f=age_category)

<function __main__.age_category(age)>

In [10]:
# Show the source DataFrame again for reference.
df.show()

+----------------+---+-----------------+
|     person_name|age|              job|
+----------------+---+-----------------+
|    Taylor Swift| 33|Singer-songwriter|
|   Gordon Ramsay| 56|             Chef|
|          박찬암| 33|       보안전문가|
|Miranda Cosgrove| 30|          Actress|
|          유재석| 50|           연예인|
|          이혜정| 66|       요리연구가|
|   Daniel Fenton| 14|Student_character|
+----------------+---+-----------------+



In [11]:
# Assignment: label every person in the 'people' view with age_category(age).
# The trailing backslashes are inside the string literal, so the query is one
# line of SQL text that also contains the next source line's indentation.
spark.sql(
    "SELECT person_name, age_category(age) AS age_typed \
    FROM people").show()

+----------------+---------+
|     person_name|age_typed|
+----------------+---------+
|    Taylor Swift|    young|
|   Gordon Ramsay|    adult|
|          박찬암|    young|
|Miranda Cosgrove|    young|
|          유재석|    adult|
|          이혜정|   senior|
|   Daniel Fenton|    young|
+----------------+---------+



In [12]:
# Same query, additionally sorted by age in ascending order.
# NOTE(review): ORDER BY has no tie-breaker, so the relative order of rows
# with equal age (two people are 33) is presumably not guaranteed — confirm.
spark.sql(
    "SELECT person_name, age_category(age) AS age_typed \
    FROM people \
    ORDER BY age ASC").show()

+----------------+---------+
|     person_name|age_typed|
+----------------+---------+
|   Daniel Fenton|    young|
|Miranda Cosgrove|    young|
|    Taylor Swift|    young|
|          박찬암|    young|
|          유재석|    adult|
|   Gordon Ramsay|    adult|
|          이혜정|   senior|
+----------------+---------+

