In [0]:
%sh

# Download the car sales CSV over HTTPS; -O sets the output path to /dbfs/car_sales_data.csv.
wget -O /dbfs/car_sales_data.csv https://s3-geospatial.s3.us-west-2.amazonaws.com/car_sales_data.csv

--2025-03-22 04:32:16--  https://s3-geospatial.s3.us-west-2.amazonaws.com/car_sales_data.csv
Resolving s3-geospatial.s3.us-west-2.amazonaws.com (s3-geospatial.s3.us-west-2.amazonaws.com)... 3.5.81.212, 52.92.237.250, 52.92.188.226, ...
Connecting to s3-geospatial.s3.us-west-2.amazonaws.com (s3-geospatial.s3.us-west-2.amazonaws.com)|3.5.81.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 45379 (44K) [text/csv]
Saving to: ‘/dbfs/car_sales_data.csv’

     0K .......... .......... .......... .......... ....      100%  169M=0s

2025-03-22 04:32:16 (169 MB/s) - ‘/dbfs/car_sales_data.csv’ saved [45379/45379]



In [0]:
%sh 

# Long-format listing of /dbfs/, sorted by modification time (newest first), to confirm the download.
ls -tl /dbfs/

total 48
-rw-r--r-- 1 root root 45379 Mar 14 21:47 car_sales_data.csv


In [0]:
%fs ls /

path,name,size,modificationTime
dbfs:/car_sales_data.csv,car_sales_data.csv,45379,1742335789000
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/large_data.csv,large_data.csv,20182437859,1741819826000
dbfs:/large_data_100M.csv,large_data_100M.csv,2018258729,1742076822000
dbfs:/large_data_10M.csv,large_data_10M.csv,201822123,1742101769000
dbfs:/monthly_sales_by_brand/,monthly_sales_by_brand/,0,0
dbfs:/user/,user/,0,0


In [0]:
# Copy the downloaded file from the local filesystem (file:/ scheme) into DBFS (dbfs:/ scheme).
dbutils.fs.cp("file:/dbfs/car_sales_data.csv", "dbfs:/car_sales_data.csv")

True

In [0]:
# A Databricks notebook already exposes a SparkSession as `spark`, so none is created here.
# Read the sales CSV into a Spark DataFrame, treating the first line as the column names.
df = spark.read.csv("dbfs:/car_sales_data.csv", header=True)

In [0]:
# Without schema inference, every column is typed as string by default.
df.printSchema()

root
 |-- cars_name: string (nullable = true)
 |-- car_price: string (nullable = true)
 |-- purchase_date: string (nullable = true)
 |-- brand_name: string (nullable = true)



In [0]:
# Reload the same file, this time with inferSchema enabled so Spark derives
# the column types from the data instead of defaulting to string.
df = spark.read.csv("dbfs:/car_sales_data.csv", header=True, inferSchema=True)

In [0]:
# Print the schema again to see the column types produced by inferSchema.
df.printSchema()

root
 |-- cars_name: string (nullable = true)
 |-- car_price: integer (nullable = true)
 |-- purchase_date: date (nullable = true)
 |-- brand_name: string (nullable = true)



In [0]:
# Unlike a pandas DataFrame, a Spark DataFrame has no index.
# show() prints the leading rows as a plain-text table.
df.show()

+-------------------+---------+-------------+----------+
|          cars_name|car_price|purchase_date|brand_name|
+-------------------+---------+-------------+----------+
|Chevrolet Silverado|    35000|   2024-11-15| Chevrolet|
|            Audi A4|    35000|   2024-05-08|     Tesla|
|            Audi A4|    24000|   2024-01-31|       BMW|
|    Hyundai Elantra|    21000|   2024-05-24|      Audi|
|   Mercedes C-Class|    45000|   2024-02-28|    Nissan|
|   Mercedes C-Class|    45000|   2024-12-01|   Hyundai|
|            Audi A4|    47000|   2024-05-23| Chevrolet|
|   Mercedes C-Class|    45000|   2024-10-25|       BMW|
|    Hyundai Elantra|    25000|   2024-10-13|      Audi|
|            Audi A4|    47000|   2024-05-17|      Audi|
|        Honda Civic|    42000|   2024-09-15|    Nissan|
|   Mercedes C-Class|    45000|   2025-01-10| Chevrolet|
|      Tesla Model 3|    35000|   2024-08-25|    Nissan|
|        Honda Civic|    42000|   2024-04-29|     Tesla|
|       Toyota Camry|    21000|

In [0]:
# Total number of rows in the DataFrame.
df.count()

1200

In [0]:
# Number of partitions of the RDD underlying this DataFrame.
df.rdd.getNumPartitions()

1

In [0]:
# Cast the purchase_date field to the date type.
# df.purchase_date or col("purchase_date") can be used instead of df["purchase_date"].
# NOTE(review): the schema printed after the inferSchema load already reports
# purchase_date as date, so this cast looks like a no-op at this point; it only
# matters if df was loaded without inferSchema - confirm whether it is still needed.
df = df.withColumn("purchase_date", df["purchase_date"].cast("date"))

In [0]:
from pyspark.sql.functions import month, count, year, desc, col

# Keep only the rows whose purchase_date falls in 2024.
# The five statements apply the same condition and differ only in how the
# purchase_date column is referenced.
# 1) column name passed as a plain string
df_2024 = df.filter(year("purchase_date") == 2024)
# 2) col("...") column reference
df_2024_v2 = df.filter(year(col("purchase_date")) == 2024)
# 3) bracket access on the DataFrame
df_2024_v3 = df.filter(year(df["purchase_date"]) == 2024)
# 4) attribute access on the DataFrame
df_2024_v4 = df.filter(year(df.purchase_date) == 2024)
# 5) SQL expression string
df_2024_v5 = df.filter("year(purchase_date) = 2024")

In [0]:
def dataframes_equal(df1, df2):
    """Tell whether two DataFrames have the same schema and the same rows.

    Row order is ignored; duplicate rows are significant because the
    comparison relies on ``exceptAll``.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        True when the schemas are equal, the row counts are equal, and no row
        of ``df1`` is left over after removing the rows of ``df2``; otherwise
        False.
    """
    # Compare the schema first, then the row counts; the short-circuit keeps
    # the row-level comparison from running when either of them differs.
    if df1.schema == df2.schema and df1.count() == df2.count():
        # With equal counts, an empty df1-minus-df2 means the rows match.
        leftover = df1.exceptAll(df2)
        return leftover.count() == 0
    return False

In [0]:
# String column name vs. col("...") reference.
dataframes_equal(df_2024, df_2024_v2)

True

In [0]:
# Bracket access vs. attribute access on the DataFrame.
dataframes_equal(df_2024_v3, df_2024_v4)

True

In [0]:
# String column name vs. bracket access on the DataFrame.
dataframes_equal(df_2024, df_2024_v3)

True

In [0]:
# String column name vs. SQL expression string.
dataframes_equal(df_2024, df_2024_v5)

True

In [0]:
# Problem 1: compute the monthly number of cars sold per brand for 2024.
# Add a new field named month that holds the month extracted from purchase_date.
df_2024 = df_2024.withColumn("month", month("purchase_date"))

In [0]:
# Inspect the 2024 rows together with the new month column.
df_2024.show()

+-------------------+---------+-------------+----------+-----+
|          cars_name|car_price|purchase_date|brand_name|month|
+-------------------+---------+-------------+----------+-----+
|Chevrolet Silverado|    35000|   2024-11-15| Chevrolet|   11|
|            Audi A4|    35000|   2024-05-08|     Tesla|    5|
|            Audi A4|    24000|   2024-01-31|       BMW|    1|
|    Hyundai Elantra|    21000|   2024-05-24|      Audi|    5|
|   Mercedes C-Class|    45000|   2024-02-28|    Nissan|    2|
|   Mercedes C-Class|    45000|   2024-12-01|   Hyundai|   12|
|            Audi A4|    47000|   2024-05-23| Chevrolet|    5|
|   Mercedes C-Class|    45000|   2024-10-25|       BMW|   10|
|    Hyundai Elantra|    25000|   2024-10-13|      Audi|   10|
|            Audi A4|    47000|   2024-05-17|      Audi|    5|
|        Honda Civic|    42000|   2024-09-15|    Nissan|    9|
|      Tesla Model 3|    35000|   2024-08-25|    Nissan|    8|
|        Honda Civic|    42000|   2024-04-29|     Tesla

In [0]:
# Quick look: row count per (brand_name, month); the result column is named "count".
df_2024.groupBy("brand_name", "month").count().show()

+----------+-----+-----+
|brand_name|month|count|
+----------+-----+-----+
|   Hyundai|    5|    8|
|    Nissan|    8|    5|
|  Mercedes|    4|   13|
|     Tesla|    4|   10|
|      Audi|   12|    9|
|   Hyundai|    2|   10|
| Chevrolet|   12|   14|
|      Ford|    3|   11|
|      Audi|    8|    8|
|      Audi|    7|    9|
| Chevrolet|    7|    7|
|       BMW|    8|    3|
|      Ford|    9|    5|
|       BMW|    3|    8|
|   Hyundai|   11|   11|
|       BMW|    9|   10|
|       BMW|    2|   14|
|  Mercedes|    5|    4|
| Chevrolet|   10|    8|
|    Nissan|    5|    9|
+----------+-----+-----+
only showing top 20 rows



In [0]:
# Group by brand_name and month, count the rows in each group, and expose the
# count as a field named sales_count.
# Unlike pandas, there is no Series type; the result is again a DataFrame.
monthly_sales_by_brand = (
    df_2024
    .groupBy("brand_name", "month")
    .agg(count("*").alias("sales_count"))
)

In [0]:
# The operations defined above are only executed now - this is called lazy execution.
# pandas, by contrast, uses eager execution (each step is processed immediately).
monthly_sales_by_brand.show()

+----------+-----+-----------+
|brand_name|month|sales_count|
+----------+-----+-----------+
|   Hyundai|    5|          8|
|    Nissan|    8|          5|
|  Mercedes|    4|         13|
|     Tesla|    4|         10|
|      Audi|   12|          9|
|   Hyundai|    2|         10|
| Chevrolet|   12|         14|
|      Ford|    3|         11|
|      Audi|    8|          8|
|      Audi|    7|          9|
| Chevrolet|    7|          7|
|       BMW|    8|          3|
|      Ford|    9|          5|
|       BMW|    3|          8|
|   Hyundai|   11|         11|
|       BMW|    9|         10|
|       BMW|    2|         14|
|  Mercedes|    5|          4|
| Chevrolet|   10|          8|
|    Nissan|    5|          9|
+----------+-----+-----------+
only showing top 20 rows



In [0]:
# Save the aggregate as CSV with a header row under dbfs:/monthly_sales_by_brand,
# overwriting any previous output. The target is a directory that Spark fills
# with part files and commit markers, not a single CSV file.
monthly_sales_by_brand.write.mode("overwrite").csv("dbfs:/monthly_sales_by_brand", header=True)

In [0]:
%fs ls monthly_sales_by_brand/

path,name,size,modificationTime
dbfs:/monthly_sales_by_brand/_SUCCESS,_SUCCESS,0,1742618596000
dbfs:/monthly_sales_by_brand/_committed_671407255045041086,_committed_671407255045041086,208,1742618595000
dbfs:/monthly_sales_by_brand/_committed_7799594938157334490,_committed_7799594938157334490,111,1742603516000
dbfs:/monthly_sales_by_brand/_committed_vacuum5339320046666790796,_committed_vacuum5339320046666790796,96,1742618596000
dbfs:/monthly_sales_by_brand/_started_671407255045041086,_started_671407255045041086,0,1742618595000
dbfs:/monthly_sales_by_brand/part-00000-tid-671407255045041086-bb4d949a-b6e9-412c-bb90-68bb4985da1e-45-1-c000.csv,part-00000-tid-671407255045041086-bb4d949a-b6e9-412c-bb90-68bb4985da1e-45-1-c000.csv,1326,1742618595000


In [0]:
# Spark's sum has to be imported explicitly; without it Python's built-in sum
# is used instead and the aggregation fails with a confusing error.
# NOTE(review): this import shadows the built-in sum for every cell that runs
# afterwards; a namespaced import (functions as F, F.sum) would avoid that.
from pyspark.sql.functions import sum

# Make sure car_price is an integer column before totalling it.
df_2024 = df_2024.withColumn("car_price", df_2024["car_price"].cast("integer"))

# Per brand and month: number of sales and the summed sale price.
(
    df_2024
    .groupBy("brand_name", "month")
    .agg(
        count("*").alias("sales_count"),
        sum("car_price").alias("sales_total"),
    )
    .show()
)

+----------+-----+-----------+-----------+
|brand_name|month|sales_count|sales_total|
+----------+-----+-----------+-----------+
|   Hyundai|    5|          8|     241000|
|    Nissan|    8|          5|     170000|
|  Mercedes|    4|         13|     473000|
|     Tesla|    4|         10|     370000|
|      Audi|   12|          9|     288000|
|   Hyundai|    2|         10|     378000|
| Chevrolet|   12|         14|     510000|
|      Ford|    3|         11|     388000|
|      Audi|    8|          8|     259000|
|      Audi|    7|          9|     264000|
| Chevrolet|    7|          7|     205000|
|       BMW|    8|          3|     130000|
|      Ford|    9|          5|     189000|
|       BMW|    3|          8|     285000|
|   Hyundai|   11|         11|     426000|
|       BMW|    9|         10|     367000|
|       BMW|    2|         14|     549000|
|  Mercedes|    5|          4|     146000|
| Chevrolet|   10|          8|     287000|
|    Nissan|    5|          9|     306000|
+----------

In [0]:
# 문제 2: 2024년 가장 많이 팔린 차 5대를 가장 많이 팔린 차부터 찾기
top_5_cars = df_2024.groupBy("cars_name") \
    .agg(count("*").alias("sales_count")) \
    .orderBy(desc("sales_count")) \
    .limit(5)

In [0]:
# Display the five best-selling models together with their sales counts.
top_5_cars.show()

+-------------------+-----------+
|          cars_name|sales_count|
+-------------------+-----------+
|   Mercedes C-Class|        140|
|    Hyundai Elantra|        126|
|Chevrolet Silverado|        115|
|       BMW 3 Series|        114|
|       Toyota Camry|        112|
+-------------------+-----------+

