In [0]:
import pyspark.sql.functions as f
from pyspark.sql.types import *

schema = StructType([ StructField("text", StringType(), True)])
csv_s3_url = "s3a://s3-geospatial/transfer_cost.txt"   # s3a는 S3를 분산파일시스템으로 접근할 때 사용하는 프로토콜
df = spark.read.schema(schema).text(csv_s3_url)

In [0]:
df.printSchema()

root
 |-- text: string (nullable = true)



In [0]:
df.count()

Out[3]: 3834

In [0]:
df.show(truncate=False)

+---------------------------------------------------------------------------+
|text                                                                       |
+---------------------------------------------------------------------------+
|On 2021-01-04 the cost per ton from 85001 to 85002 is $28.32 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85004 is $25.68 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85007 is 19.86 at ABC Hauling |
|On 2021-01-04 the cost per ton from 85001 to 85007 is 20.52 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85010 is 20.72 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85012 is $18.98 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85001 to 85013 is 26.64 at Haul Today  |
|On 2021-01-04 the cost per ton from 85001 to 85020 is 26.34 at ABC Hauling |
|On 2021-01-04 the cost per ton from 85001 to 85021 is $20.15 at ABC Hauling|
|On 2021-01-04 the cost per ton from 85002 to 85001 is 21.57 at 

In [0]:
regex_str = r'On (\S+) the cost per ton from (\d+) to (\d+) is (\S+) at (.*)'

df_with_new_columns = df\
    .withColumn('week', f.regexp_extract('text', regex_str, 1))\
    .withColumn('departure_zipcode', f.regexp_extract(f.column('text'), regex_str, 2))\
    .withColumn('arrival_zipcode', f.regexp_extract(df.text, regex_str, 3))\
    .withColumn('cost', f.regexp_extract(f.col('text'), regex_str, 4))\
    .withColumn('vendor', f.regexp_extract(f.col('text'), regex_str, 5))

In [0]:
df_with_new_columns.printSchema()

root
 |-- text: string (nullable = true)
 |-- week: string (nullable = true)
 |-- departure_zipcode: string (nullable = true)
 |-- arrival_zipcode: string (nullable = true)
 |-- cost: string (nullable = true)
 |-- vendor: string (nullable = true)



In [0]:
final_df = df_with_new_columns.drop("text")

In [0]:
final_df.write.csv("extracted.csv", mode="overwrite")

In [0]:
%fs ls dbfs:/extracted.csv/

path,name,size,modificationTime
dbfs:/extracted.csv/_SUCCESS,_SUCCESS,0,1743495235000
dbfs:/extracted.csv/_committed_2870063434376686262,_committed_2870063434376686262,112,1743192105000
dbfs:/extracted.csv/_committed_4647331972062101456,_committed_4647331972062101456,209,1743495234000
dbfs:/extracted.csv/_committed_vacuum1421264936678791883,_committed_vacuum1421264936678791883,96,1743495235000
dbfs:/extracted.csv/_started_4647331972062101456,_started_4647331972062101456,0,1743495233000
dbfs:/extracted.csv/part-00000-tid-4647331972062101456-2e31b7ae-1cb8-40cc-8cf3-22819218b8a7-3-1-c000.csv,part-00000-tid-4647331972062101456-2e31b7ae-1cb8-40cc-8cf3-22819218b8a7-3-1-c000.csv,156423,1743495234000


In [0]:
final_df.write.format("json").save("extracted.json", mode="overwrite")

In [0]:
%fs ls dbfs:/extracted.json/

path,name,size,modificationTime
dbfs:/extracted.json/_SUCCESS,_SUCCESS,0,1743495329000
dbfs:/extracted.json/_committed_2194929421918048030,_committed_2194929421918048030,113,1743192181000
dbfs:/extracted.json/_committed_8397155499577508729,_committed_8397155499577508729,211,1743495329000
dbfs:/extracted.json/_committed_vacuum7933004407605867108,_committed_vacuum7933004407605867108,96,1743495330000
dbfs:/extracted.json/_started_8397155499577508729,_started_8397155499577508729,0,1743495328000
dbfs:/extracted.json/part-00000-tid-8397155499577508729-110b9591-4740-4073-81a7-b681cf20d96c-4-1-c000.json,part-00000-tid-8397155499577508729-110b9591-4740-4073-81a7-b681cf20d96c-4-1-c000.json,436305,1743495328000


In [0]:
df_with_new_columns.createOrReplaceTempView("transfer_cost")

In [0]:
df_most_popular_route_10 = spark.sql("""
    SELECT departure_zipcode, arrival_zipcode, COUNT(1) as count
    FROM transfer_cost
    GROUP BY 1, 2
    ORDER BY 3 DESC
    LIMIT 10
""")

In [0]:
df_most_popular_route_10.show()

+-----------------+---------------+-----+
|departure_zipcode|arrival_zipcode|count|
+-----------------+---------------+-----+
|            85004|          85001|   56|
|            85013|          85001|   56|
|            85020|          85021|   56|
|            85012|          85021|   55|
|            85001|          85007|   55|
|            85007|          85021|   55|
|            85012|          85013|   55|
|            85012|          85002|   55|
|            85004|          85021|   55|
|            85010|          85012|   55|
+-----------------+---------------+-----+

