In [1]:
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local").setAppName("spark_sql_basic2")
sc   = SparkContext(conf=conf)

In [None]:
# RDD만을 이용한 데이터 추출

In [2]:

movies_rdd = sc.parallelize([
    (1, ("어벤져스", "마블")),
    (2, ("슈퍼맨", "DC")),
    (3, ("배트맨", "DC")),
    (4, ("겨울왕국", "디즈니")),
    (5, ("아이언맨", "마블"))
])


attendances_rdd = sc.parallelize([
    (1, (13934592, "KR")),
    (2, (2182227,"KR")),
    (3, (4226242, "KR")),
    (4, (10303058, "KR")),
    (5, (4300365, "KR"))
])

In [None]:
# 마블 영화 중 관객 수가 500만 이상인 영화를 가져오기

In [3]:
# CASE1. join 먼저, filter 나중에
movie_att = movies_rdd.join(attendances_rdd)
movie_att.filter(
    lambda x : x[1][0][1] == "마블" and x[1][1][0] > 5000000
).collect()

[(1, (('어벤져스', '마블'), (13934592, 'KR')))]

In [4]:
# CASE 2. filter 먼저, join 나중에
filtered_movies = movies_rdd.filter(lambda x : x[1][1] == '마블')
filtered_att = attendances_rdd.filter(lambda x : x[1][0] > 5000000)

filtered_movies.join(filtered_att).collect()

[(1, (('어벤져스', '마블'), (13934592, 'KR')))]

In [None]:
# Spark SQL 사용해 보기

In [5]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").appName("spark-sql").getOrCreate()

In [11]:
spark

In [12]:
# 컬럼 추가
movies = [
    (1, "어벤져스", "마블", 2012, 4, 26),
    (2, "슈퍼맨", "DC", 2013, 6, 13),
    (3, "배트맨", "DC", 2008, 8, 6),
    (4, "겨울왕국", "디즈니", 2014, 1, 16),
    (5, "아이언맨", "마블", 2008, 4, 30)
]

In [13]:
#스키마를 알아야 한다.
movie_schema = ["id", "name", "company", "year", "month", "day"]

In [None]:
# 2. 데이터 프레임 만들기

In [14]:
df = spark.createDataFrame(data=movies, schema=movie_schema)

In [15]:
df

DataFrame[id: bigint, name: string, company: string, year: bigint, month: bigint, day: bigint]

In [16]:
df.columns

['id', 'name', 'company', 'year', 'month', 'day']

In [18]:
df.dtypes

[('id', 'bigint'),
 ('name', 'string'),
 ('company', 'string'),
 ('year', 'bigint'),
 ('month', 'bigint'),
 ('day', 'bigint')]

In [19]:
df.select("name").show() #projection

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



In [None]:
df.filter(df.year >= 2010).show() #where

In [20]:
df

DataFrame[id: bigint, name: string, company: string, year: bigint, month: bigint, day: bigint]

In [21]:
df.createOrReplaceTempView("movies") #Table(view)의 이름

In [22]:
# 영화 이름만 가져오기

query = """

SELECT name
  FROM movies
  
"""
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|겨울왕국|
|아이언맨|
+--------+



In [23]:
# 2010년 이후에 개봉한 영화를 조회

query = """

SELECT name
  FROM movies
  WHERE year > 2010
  
"""
spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|겨울왕국|
+--------+



In [24]:
# 2012년도 이전에 개봉한 영화의 이름과 회사를 출력
query = """

SELECT name, company
  FROM movies
  WHERE year < 2012

"""
spark.sql(query).show()

+--------+-------+
|    name|company|
+--------+-------+
|  배트맨|     DC|
|아이언맨|   마블|
+--------+-------+



In [27]:
# like 문자열 데이터에서 특정 단어나 문장을 포함한 데이터를 찾을 때
# % 기호를 사용해서 문장이 매칭되는지 확인 가능!
# 제목이 ~~맨으로 끝나는 데이터의 모든 정보를 조회
query = """

SELECT name
  FROM movies
  WHERE name LIKE '%맨'

"""
spark.sql(query).show()

+--------+
|    name|
+--------+
|  슈퍼맨|
|  배트맨|
|아이언맨|
+--------+



In [30]:

# BETWEEN 특정 데이터와 데이터 사이를 조회
# 개봉 월이 4 ~ 8월 사이. 4 <= 개봉월 <= 8

query = """

SELECT name
  FROM movies
  WHERE month BETWEEN 4 AND 8
  
"""

spark.sql(query).show()

+--------+
|    name|
+--------+
|어벤져스|
|  슈퍼맨|
|  배트맨|
|아이언맨|
+--------+



In [31]:
df.show()

+---+--------+-------+----+-----+---+
| id|    name|company|year|month|day|
+---+--------+-------+----+-----+---+
|  1|어벤져스|   마블|2012|    4| 26|
|  2|  슈퍼맨|     DC|2013|    6| 13|
|  3|  배트맨|     DC|2008|    8|  6|
|  4|겨울왕국| 디즈니|2014|    1| 16|
|  5|아이언맨|   마블|2008|    4| 30|
+---+--------+-------+----+-----+---+



In [None]:
# Join 구현하기

In [32]:

attendances = [
    (1, 13934592., "KR"),
    (2, 2182227.,"KR"),
    (3, 4226242., "KR"),
    (4, 10303058., "KR"),
    (5, 4300365., "KR")
]

In [33]:
# 직접 스키마 지정해 보기
from pyspark.sql.types import StringType, FloatType\
    , IntegerType\
    , StructType, StructField

In [34]:
att_schema = StructType([ # 모든 컬럼의 타입을 통칭 - 컬럼 데이터의 집합
    StructField("id", IntegerType(), True), # StructField : 컬럼
    StructField("att", FloatType(), True),
    StructField("theater_country", StringType(), True)
])

In [35]:

att_df = spark.createDataFrame(
    data=attendances,
    schema=att_schema
)

att_df.dtypes

[('id', 'int'), ('att', 'float'), ('theater_country', 'string')]

In [36]:
att_df.createOrReplaceTempView("att")

In [37]:
att_df.select('*').show()

+---+-----------+---------------+
| id|        att|theater_country|
+---+-----------+---------------+
|  1|1.3934592E7|             KR|
|  2|  2182227.0|             KR|
|  3|  4226242.0|             KR|
|  4|1.0303058E7|             KR|
|  5|  4300365.0|             KR|
+---+-----------+---------------+



In [41]:
#df와 join
query = """

SELECT movies.id, movies.name, movies.company, att.att 
FROM movies
JOIN att ON movies.id = att.id

"""

spark.sql(query).show()

+---+--------+-------+-----------+
| id|    name|company|        att|
+---+--------+-------+-----------+
|  1|어벤져스|   마블|1.3934592E7|
|  2|  슈퍼맨|     DC|  2182227.0|
|  3|  배트맨|     DC|  4226242.0|
|  4|겨울왕국| 디즈니|1.0303058E7|
|  5|아이언맨|   마블|  4300365.0|
+---+--------+-------+-----------+



In [None]:
# 데이터 프레임 API

In [None]:
# select
df.select("*").collect()

In [None]:
df.select("name", "company").collect()

In [None]:
df.select(df.name, (df.year-2000).alias("year")).show()

In [None]:
# agg : Aggreagte의 약자로써, 그룹핑 후 데이터를 하나로 합쳐주는 역할
df.agg({"id": "count"}).collect()

In [None]:
from pyspark.sql import functions as F
df.agg(F.min(df.year)).collect()

In [None]:
df.groupBy().avg().collect()

In [None]:
# 회사별 개봉월의 평균
df.groupBy('company').agg({"month": "mean"}).collect()

In [None]:
# 회사 별 월 별 영화 개수 정보


In [None]:
# join : 다른 데이터 프레임과 사용자가 지정한 컬럼을 기준으로 합치는 작업
df.join(att_df, 'id').select(df.name, att_df.att).show()

In [None]:
# select, where, orderBy 절 사용
marvel_df = df.select("name", "company", "year").where("company=='마블'").orderBy("id")
marvel_df.collect()

In [42]:
spark.stop()
sc.stop()

In [None]:
# SQL 최적화

In [47]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("trip_count_sql").getOrCreate()

In [48]:
trip_file = "learning_spark_data/fhvhv_tripdata_2020-03.csv"

In [51]:
# inferSchema : 자동으로 스키마 예측하게 하기
data = spark.read.csv(trip_file, inferSchema=True, header=True)

In [52]:
data.createOrReplaceTempView("mobility_data")

In [53]:
query = """
select *
from mobility_data
limit 5
"""
spark.sql(query).show()

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   NULL|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   NULL|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
|           HV0003|              B02764|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|          80|   NULL|
|           HV0003|              B02764|2020-03-01 00:44:24|2020-03-01 00:58:44|         256|         226|   NULL|
+-----------------+--------------------+-------------------+-------------------+

In [None]:
# 스파크 SQL을 사용하는 이유

In [54]:
#첫번째 쿼리

query = """

select split(pickup_datetime, ' ')[0] as pickup_date, count(*) as trips
from mobility_data

group by pickup_date
"""

spark.sql(query).show()

+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-03|697880|
| 2020-03-02|648986|
| 2020-03-01|784246|
| 2020-03-06|872012|
| 2020-03-05|731165|
| 2020-03-04|707879|
| 2020-03-09|628940|
| 2020-03-08|731222|
| 2020-03-07|886071|
| 2020-03-10|626474|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-16|391518|
| 2020-03-13|660914|
| 2020-03-15|448125|
| 2020-03-14|569397|
| 2020-03-26|141607|
| 2020-03-25|141088|
| 2020-03-20|261900|
| 2020-03-24|141686|
+-----------+------+
only showing top 20 rows



In [55]:
# 실행 계획 살펴보기
spark.sql(query).explain(True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['split('pickup_datetime,  )[0] AS pickup_date#288, 'count(1) AS trips#289]
+- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [split(cast(pickup_datetime#215 as string),  , -1)[0]], [split(cast(pickup_datetime#215 as string),  , -1)[0] AS pickup_date#288, count(1) AS trips#289L]
+- SubqueryAlias mobility_data
   +- View (`mobility_data`, [hvfhs_license_num#213,dispatching_base_num#214,pickup_datetime#215,dropoff_datetime#216,PULocationID#217,DOLocationID#218,SR_Flag#219])
      +- Relation [hvfhs_license_num#213,dispatching_base_num#214,pickup_datetime#215,dropoff_datetime#216,PULocationID#217,DOLocationID#218,SR_Flag#219] csv

== Optimized Logical Plan ==
Aggregate [_groupingexpression#293], [_groupingexpression#293 AS pickup_date#288, count(1) AS trips#289L]
+- Project [split(cast(pickup_datetime#215 as string),  , -1)[0] AS _groupingexpression#293]
   +- Rel

In [56]:
# 두번째 쿼리
spark.sql("""select 
                pickup_date, 
                count(*) as trips
             from ( select
                          split(pickup_datetime, ' ')[0] as pickup_date
                          from mobility_data )
             group by pickup_date""").explain(True)

== Parsed Logical Plan ==
'Aggregate ['pickup_date], ['pickup_date, 'count(1) AS trips#297]
+- 'SubqueryAlias __auto_generated_subquery_name
   +- 'Project ['split('pickup_datetime,  )[0] AS pickup_date#296]
      +- 'UnresolvedRelation [mobility_data], [], false

== Analyzed Logical Plan ==
pickup_date: string, trips: bigint
Aggregate [pickup_date#296], [pickup_date#296, count(1) AS trips#297L]
+- SubqueryAlias __auto_generated_subquery_name
   +- Project [split(cast(pickup_datetime#215 as string),  , -1)[0] AS pickup_date#296]
      +- SubqueryAlias mobility_data
         +- View (`mobility_data`, [hvfhs_license_num#213,dispatching_base_num#214,pickup_datetime#215,dropoff_datetime#216,PULocationID#217,DOLocationID#218,SR_Flag#219])
            +- Relation [hvfhs_license_num#213,dispatching_base_num#214,pickup_datetime#215,dropoff_datetime#216,PULocationID#217,DOLocationID#218,SR_Flag#219] csv

== Optimized Logical Plan ==
Aggregate [pickup_date#296], [pickup_date#296, count(1) AS tri

In [58]:
spark.sql("""select 
                pickup_date, 
                count(*) as trips
             from ( select
                          split(pickup_datetime, ' ')[0] as pickup_date
                          from mobility_data )
             group by pickup_date""").show()

+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-03|697880|
| 2020-03-02|648986|
| 2020-03-01|784246|
| 2020-03-06|872012|
| 2020-03-05|731165|
| 2020-03-04|707879|
| 2020-03-09|628940|
| 2020-03-08|731222|
| 2020-03-07|886071|
| 2020-03-10|626474|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-16|391518|
| 2020-03-13|660914|
| 2020-03-15|448125|
| 2020-03-14|569397|
| 2020-03-26|141607|
| 2020-03-25|141088|
| 2020-03-20|261900|
| 2020-03-24|141686|
+-----------+------+
only showing top 20 rows



In [58]:
spark.sql("""select 
                pickup_date, 
                count(*) as trips
             from ( select
                          split(pickup_datetime, ' ')[0] as pickup_date
                          from mobility_data )
             group by pickup_date""").show()

+-----------+------+
|pickup_date| trips|
+-----------+------+
| 2020-03-03|697880|
| 2020-03-02|648986|
| 2020-03-01|784246|
| 2020-03-06|872012|
| 2020-03-05|731165|
| 2020-03-04|707879|
| 2020-03-09|628940|
| 2020-03-08|731222|
| 2020-03-07|886071|
| 2020-03-10|626474|
| 2020-03-12|643257|
| 2020-03-11|628601|
| 2020-03-16|391518|
| 2020-03-13|660914|
| 2020-03-15|448125|
| 2020-03-14|569397|
| 2020-03-26|141607|
| 2020-03-25|141088|
| 2020-03-20|261900|
| 2020-03-24|141686|
+-----------+------+
only showing top 20 rows



In [59]:
spark.stop()

In [60]:
trip_file = "fhvhv_tripdata_2020-03.csv"
zone_file = "taxi+_zone_lookup.csv"

In [61]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("trip_count_sql").getOrCreate()

In [62]:
#운행 데이터 프레임 생성, Zone 데이터프레임 생성
trip_data = spark.read.format("csv")\
    .option("header", 'true')\
    .option('inferSchema', 'true')\
    .load('learning_spark_data/fhvhv_tripdata_2020-03.csv')
zone_data = spark.read.format("csv")\
    .option("header", 'true')\
    .option('inferSchema', 'true')\
    .load('learning_spark_data/taxi+_zone_lookup.csv')

In [63]:
trip_data.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: integer (nullable = true)



In [64]:
zone_data.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [None]:
# 승차 Location(PULocationID)별 개수 세기
# 하차 Location(DOLocationID)별 개수 세기
#HV0003 운송사업자의 승차 지역별 트립 건수를 집계하고, 
#가장 많은 운송사업자순으로 정렬하는 분석 쿼리  hvfhs_license_num
#운송사별 운행 건수 비교
#승차 위치 Borough별 운행 건수
#서비스 존별 승차/하차 건수
# 질문 만들어보기 3개씩

In [67]:
spark.catalog.listTables()

[]

In [69]:
trip_data.createOrReplaceTempView("trip_data")

In [None]:
# 승차 Location(PULocationID)별 개수 세기
# 하차 Location(DOLocationID)별 개수 세기

In [70]:
spark.sql("""

SELECT PULocationID, DOLocationID, COUNT(*) as trip_count
FROM trip_data
GROUP BY PULocationID, DOLocationID

""").show()

+------------+------------+----------+
|PULocationID|DOLocationID|trip_count|
+------------+------------+----------+
|          95|         180|       323|
|         259|         259|      5106|
|         162|          80|       380|
|         163|           7|       837|
|         151|         116|       886|
|          25|          61|      3965|
|         181|         198|       491|
|         215|          39|       222|
|          80|         121|        58|
|         148|         229|       923|
|         198|          90|        81|
|         229|         239|      1008|
|         159|         147|      2783|
|         226|         133|        39|
|          49|          49|      3438|
|         100|         140|       775|
|         114|         223|       118|
|          33|          37|       756|
|         155|          14|       342|
|         163|         263|      1176|
+------------+------------+----------+
only showing top 20 rows



In [74]:
trip_data.show(3)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|         159|   NULL|
|           HV0005|              B02510|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|         119|   NULL|
|           HV0003|              B02764|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|         209|      1|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
only showing top 3 rows



In [77]:
#HV0003 운송사업자의 승차 지역별 트립 건수를 집계

spark.sql("""

SELECT hvfhs_license_num, dispatching_base_num, COUNT(*) as base_count
FROM trip_data
WHERE hvfhs_license_num == 'HV0003'
GROUP BY hvfhs_license_num, dispatching_base_num

""").show()

+-----------------+--------------------+----------+
|hvfhs_license_num|dispatching_base_num|base_count|
+-----------------+--------------------+----------+
|           HV0003|              B02880|    143604|
|           HV0003|              B02395|    142242|
|           HV0003|              B02888|    225468|
|           HV0003|              B02883|    304361|
|           HV0003|              B02872|    873212|
|           HV0003|              B02764|   1152287|
|           HV0003|              B02876|    271082|
|           HV0003|              B02765|    619986|
|           HV0003|              B02875|    821245|
|           HV0003|              B02866|    301181|
|           HV0003|              B02884|    303853|
|           HV0003|              B02835|    244198|
|           HV0003|              B02869|    484063|
|           HV0003|              B02871|    360259|
|           HV0003|              B02879|    282083|
|           HV0003|              B02617|    324423|
|           

In [79]:
#가장 많은 운송사업자순으로 정렬하는 분석 쿼리  hvfhs_license_num
#운송사별 운행 건수 비교

spark.sql("""

SELECT hvfhs_license_num, COUNT(*) as hvfhs_license_sum
FROM trip_data
GROUP BY hvfhs_license_num

""").show()

+-----------------+-----------------+
|hvfhs_license_num|hvfhs_license_sum|
+-----------------+-----------------+
|           HV0004|           336606|
|           HV0005|          3219535|
|           HV0003|          9836763|
+-----------------+-----------------+



In [80]:
zone_data.show()

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
|         6|Staten Island|Arrochar/Fort Wad...|   Boro Zone|
|         7|       Queens|             Astoria|   Boro Zone|
|         8|       Queens|        Astoria Park|   Boro Zone|
|         9|       Queens|          Auburndale|   Boro Zone|
|        10|       Queens|        Baisley Park|   Boro Zone|
|        11|     Brooklyn|          Bath Beach|   Boro Zone|
|        12|    Manhattan|        Battery Park| Yellow Zone|
|        13|    Manhattan|   Battery Park City| Yellow Zone|
|        14|     Brookly

In [84]:
zone_data.createOrReplaceTempView("zone_data")

In [85]:
#승차 위치 Borough별 운행 건수
spark.sql("""

SELECT Borough, COUNT(*) as count_Borough
FROM zone_data
GROUP BY Borough

""").show()

+-------------+-------------+
|      Borough|count_Borough|
+-------------+-------------+
|       Queens|           69|
|          EWR|            1|
|      Unknown|            2|
|     Brooklyn|           61|
|Staten Island|           20|
|    Manhattan|           69|
|        Bronx|           43|
+-------------+-------------+



In [94]:
#trip_data 와 zone_data join

trip_data.columns

['hvfhs_license_num',
 'dispatching_base_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'SR_Flag']

In [95]:
zone_data.columns

['LocationID', 'Borough', 'Zone', 'service_zone']

In [97]:
# 출발지 zone_data에 alias "pu_zone"
# 도착지 zone_data에 alias "do_zone"
pu = zone_data.alias("pu_zone")
do = zone_data.alias("do_zone")

full_df = trip_data \
    .join(pu, trip_data.PULocationID == pu.LocationID, "left") \
    .join(do, trip_data.DOLocationID == do.LocationID, "left") \
    .select(
        "pickup_datetime", "dropoff_datetime",
        "PULocationID", "pu_zone.Borough", "pu_zone.Zone",
        "DOLocationID", "do_zone.Borough", "do_zone.Zone"
    ).show()


+-------------------+-------------------+------------+---------+--------------------+------------+---------+--------------------+
|    pickup_datetime|   dropoff_datetime|PULocationID|  Borough|                Zone|DOLocationID|  Borough|                Zone|
+-------------------+-------------------+------------+---------+--------------------+------------+---------+--------------------+
|2020-03-01 00:03:40|2020-03-01 00:23:39|          81|    Bronx|         Eastchester|         159|    Bronx|       Melrose South|
|2020-03-01 00:28:05|2020-03-01 00:38:57|         168|    Bronx|Mott Haven/Port M...|         119|    Bronx|          Highbridge|
|2020-03-01 00:03:07|2020-03-01 00:15:04|         137|Manhattan|            Kips Bay|         209|Manhattan|             Seaport|
|2020-03-01 00:18:42|2020-03-01 00:38:42|         209|Manhattan|             Seaport|          80| Brooklyn|   East Williamsburg|
|2020-03-01 00:44:24|2020-03-01 00:58:44|         256| Brooklyn|Williamsburg (Sou...|     

In [98]:
#서비스 존별 승차/하차 건수

zone_pu = zone_data.alias("zone_pu")
zone_do = zone_data.alias("zone_do")

trip_data \
    .join(zone_pu, trip_data.PULocationID == zone_pu.LocationID, "left") \
    .join(zone_do, trip_data.DOLocationID == zone_do.LocationID, "left") \
    .groupBy("zone_pu.service_zone", "zone_do.service_zone") \
    .count() \
    .withColumnRenamed("count", "trip_count") \
    .orderBy("trip_count", ascending=False) \
    .show()


+------------+------------+----------+
|service_zone|service_zone|trip_count|
+------------+------------+----------+
|   Boro Zone|   Boro Zone|   7808285|
| Yellow Zone| Yellow Zone|   2700816|
| Yellow Zone|   Boro Zone|    932399|
|   Boro Zone| Yellow Zone|    825156|
|   Boro Zone|    Airports|    227235|
| Yellow Zone|    Airports|    172029|
|   Boro Zone|         N/A|    171991|
| Yellow Zone|         N/A|    169644|
|    Airports|   Boro Zone|    143958|
|    Airports| Yellow Zone|    117548|
| Yellow Zone|         EWR|     50302|
|    Airports|         N/A|     45702|
|   Boro Zone|         EWR|     14230|
|    Airports|    Airports|     11882|
|    Airports|         EWR|       520|
|         N/A|         N/A|       421|
|         N/A|   Boro Zone|       391|
|         EWR| Yellow Zone|       249|
|         EWR|   Boro Zone|       103|
|         N/A| Yellow Zone|        18|
+------------+------------+----------+
only showing top 20 rows



In [99]:
# 서비스존별 승차건수

zone_pu = zone_data.alias("zone_pu")

trip_data.join(
    zone_pu,
    trip_data.PULocationID == zone_pu.LocationID,
    "inner"
).groupBy("zone_pu.service_zone") \
 .count() \
 .withColumnRenamed("count", "pickup_count") \
 .orderBy("pickup_count", ascending=False) \
 .show()


+------------+------------+
|service_zone|pickup_count|
+------------+------------+
|   Boro Zone|     9046897|
| Yellow Zone|     4025190|
|    Airports|      319610|
|         N/A|         845|
|         EWR|         362|
+------------+------------+



In [100]:
#서비스존별 하차건수

zone_do = zone_data.alias("zone_do")

trip_data.join(
    zone_do,
    trip_data.DOLocationID == zone_do.LocationID,
    "inner"
).groupBy("zone_do.service_zone") \
 .count() \
 .withColumnRenamed("count", "dropoff_count") \
 .orderBy("dropoff_count", ascending=False) \
 .show()


+------------+-------------+
|service_zone|dropoff_count|
+------------+-------------+
|   Boro Zone|      8885136|
| Yellow Zone|      3643787|
|    Airports|       411156|
|         N/A|       387759|
|         EWR|        65066|
+------------+-------------+



In [None]:
# 질문 만들어보기 3개씩
#1. 시간 리스트에 요일 추가하기
#2. 가장 혼잡한 시간대 : 요일, 시간별 트립 count로 상위만 추출
#3. 주말 vs 평일 비교