In [49]:
pip install pyspark



In [50]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Spark 세션 초기화
spark = SparkSession.builder.appName("JsonFileRead").getOrCreate()

In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
data = spark.read.option("multiline", "true").json("/content/drive/MyDrive/KHUDA_DE/KHUDA_DE_project/2405161941_new_fastcampus.json")
data.printSchema()

root
 |-- new_courses: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- accordion: array (nullable = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |-- badge: string (nullable = true)
 |    |    |-- course_img: string (nullable = true)
 |    |    |-- course_url: string (nullable = true)
 |    |    |-- intro: string (nullable = true)
 |    |    |-- parts: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- regular_price: string (nullable = true)
 |    |    |-- sale_price: string (nullable = true)
 |    |    |-- summary: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- tags: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- title: string (nullable = true)



**1. course 내부 필드 접근**

In [53]:
from pyspark.sql.functions import explode, col

In [54]:
# 배열 필드를 explode하여 개별 행으로 확장
df = data.select(explode("new_courses").alias("new_course"))

# 필요한 필드만 선택 -> 전체 필드 선택
df = df.select(
    col("new_course.title"),
    col("new_course.intro"),
    col("new_course.badge"),
    col("new_course.tags"),
    col("new_course.course_img"),
    col("new_course.course_url"),
    col("new_course.regular_price"),
    col("new_course.sale_price"),
    col("new_course.summary"),
    col("new_course.parts"),
    col("new_course.accordion")
)

# 결과 출력 (출력 크기 제한)
df.show(10)

+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+-------------------------------+---------------------------+--------------------------------+
|                            title|                             intro|   badge|                            tags|          course_img|          course_url|regular_price|sale_price|                        summary|                      parts|                       accordion|
+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+-------------------------------+---------------------------+--------------------------------+
|    2024 시그니처 프론트엔드 :...|   현재 FE 개발자에게 요구하는 ...|사전예약| [프론트엔드, 시그니처, 입문,...|https://storage.g...|https://fastcampu...|    653,000원| 190,000원|   [01. 입문-3년차 채용공고 ...|       [Part 1.

**2. 숫자: string -> int**

In [55]:
from pyspark.sql.functions import regexp_replace

In [56]:
df = df.withColumn(
    "regular_price",
    regexp_replace(col("regular_price"), "[^0-9]", "").cast("int")
)

In [57]:
df = df.withColumn(
    "sale_price",
    regexp_replace(col("sale_price"), "[^0-9]", "").cast("int")
)

In [58]:
df.show()

+----------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+
|                             title|                             intro|   badge|                            tags|          course_img|          course_url|regular_price|sale_price|                         summary|                      parts|                       accordion|
+----------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+
|     2024 시그니처 프론트엔드 :...|   현재 FE 개발자에게 요구하는 ...|사전예약| [프론트엔드, 시그니처, 입문,...|https://storage.g...|https://fastcampu...|       653000|    190000|    [01. 입문-3년차 채용공고 ...|     

**3. 숫자 필드 NULL 제거**

In [59]:
price_df = df.na.drop(subset=["regular_price","sale_price"])

In [60]:
price_df.show()

+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+
|                            title|                             intro|   badge|                            tags|          course_img|          course_url|regular_price|sale_price|                         summary|                      parts|                       accordion|
+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+
|    2024 시그니처 프론트엔드 :...|   현재 FE 개발자에게 요구하는 ...|사전예약| [프론트엔드, 시그니처, 입문,...|https://storage.g...|https://fastcampu...|       653000|    190000|    [01. 입문-3년차 채용공고 ...|       [P

price_df: 숫자 필드가 NULL인 행 제거한 df

**3-1. 할인율 col 추가**

In [61]:
from pyspark.sql.functions import expr

In [62]:
# 할인율 계산 및 데이터프레임에 추가
price_df = price_df.withColumn("discount_rate", expr("((regular_price - sale_price) / regular_price) * 100"))

In [63]:
price_df.show()

+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+------------------+
|                            title|                             intro|   badge|                            tags|          course_img|          course_url|regular_price|sale_price|                         summary|                      parts|                       accordion|     discount_rate|
+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+------------------+
|    2024 시그니처 프론트엔드 :...|   현재 FE 개발자에게 요구하는 ...|사전예약| [프론트엔드, 시그니처, 입문,...|https://storage.g...|https://fastcampu...|  

**3-2. 할인율 높은 순으로 정렬**

In [64]:
# 내림차순 정렬 -> 할인율 높은 순서대로 정렬
sorted_discount_rate_price_df = price_df.orderBy(col("discount_rate").desc())

In [65]:
sorted_discount_rate_price_df.show()

+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+------------------+
|                            title|                             intro|   badge|                            tags|          course_img|          course_url|regular_price|sale_price|                         summary|                      parts|                       accordion|     discount_rate|
+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+------------------+
|       처음 시작하는 AWS : AWS...|근본인 클라우드 컴퓨팅 이론부터...|전체오픈|   [AWS, 클라우드, 클라우드컴...|https://storage.g...|https://fastcampu...|

**3-3. 가격 낮은 순으로 정렬**

In [66]:
# 오름차순 정렬 -> 가격 낮은 순서대로 정렬
sorted_price_df = price_df.orderBy(col("sale_price"))

In [67]:
sorted_price_df.show()

+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+------------------+
|                            title|                             intro|   badge|                            tags|          course_img|          course_url|regular_price|sale_price|                         summary|                      parts|                       accordion|     discount_rate|
+---------------------------------+----------------------------------+--------+--------------------------------+--------------------+--------------------+-------------+----------+--------------------------------+---------------------------+--------------------------------+------------------+
|       처음 시작하는 AWS : AWS...|근본인 클라우드 컴퓨팅 이론부터...|전체오픈|   [AWS, 클라우드, 클라우드컴...|https://storage.g...|https://fastcampu...|

**4. 단어 검색**

In [68]:
# 필터링할 키워드 정의
keyword = "프론트엔드"

In [69]:
from pyspark.sql.functions import array_contains

In [70]:
# accordion은 2차원 배열 -> accordion 배열 평탄화
df_flattened = df.withColumn("accordion", explode("accordion"))

In [72]:
df_flattened.show()

+-----------------------------+-------------------------------+--------+-------------------------------+--------------------+--------------------+-------------+----------+----------------------------+--------------------+-----------------------------+
|                        title|                          intro|   badge|                           tags|          course_img|          course_url|regular_price|sale_price|                     summary|               parts|                    accordion|
+-----------------------------+-------------------------------+--------+-------------------------------+--------------------+--------------------+-------------+----------+----------------------------+--------------------+-----------------------------+
|2024 시그니처 프론트엔드 :...|현재 FE 개발자에게 요구하는 ...|사전예약|[프론트엔드, 시그니처, 입문,...|https://storage.g...|https://fastcampu...|       653000|    190000|[01. 입문-3년차 채용공고 ...|[Part 1.HTML/CSS/...|[기본 정보, | 수강대상\n입...|
|2024 시그니처 프론트엔드 :...|현재 FE 개발자에게 요구하는 ...|사

df_flatteded: 단어 검색할 수 있게 평탄화된 df

In [75]:
# title 또는 tags에 키워드가 포함된 항목 필터링
df_filtered = df_flattened.filter(
    col("title").contains(keyword) |
    col("intro").contains(keyword) |
    array_contains(col("tags"), keyword) |
    array_contains(col("summary"), keyword) |
    array_contains(col("parts"), keyword) |
    array_contains(col("accordion"), keyword)
)

In [76]:
df_filtered.show(truncate=False) # (내용 생략 X)

+----------------------------------------------------------------------+----------------------------------------------------------------------------------------+--------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------+-----------------------------------------------+-------------+----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------