## Google Drive 연동

In [97]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Spark 설치

In [98]:
!apt-get install openjdk-8-jdk-headless
!wget -q https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar -zxf spark-3.5.1-bin-hadoop3.tgz

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-8-jdk-headless is already the newest version (8u402-ga-2ubuntu1~22.04).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [99]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

In [100]:
!pip install findspark -q

In [101]:
import findspark
findspark.init()

In [102]:
import pyspark
spark_version = pyspark.__version__
print("Apache Spark 버전 확인: " + spark_version)

Apache Spark 버전 확인: 3.5.1


## Spark
- RDD : 다수의 서버에 분산 방식으로 저장함

In [103]:
from pyspark.sql import SparkSession

# Spark 세션 활성화
my_spark = SparkSession.builder.getOrCreate()
my_spark

- 데이터베이스 확인

In [104]:
my_spark.catalog.listDatabases()

[Database(name='default', catalog='spark_catalog', description='default database', locationUri='file:/content/spark-warehouse')]

- Spark SQL 쿼리 실행 (Database 보여주기)

In [105]:
my_spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [106]:
my_spark.catalog.currentDatabase()

'default'

In [107]:
# 기존 Spark 세션 종료
my_spark.stop()

In [108]:
# 새로운 Spark 세션 시작
my_spark = SparkSession.builder.master("local[1]").appName("SampleTutorial").getOrCreate()
rdd_sample = my_spark.sparkContext.parallelize([1, 2, 3, 4, 5])
print(type(rdd_sample))

<class 'pyspark.rdd.RDD'>


In [109]:
rdd_sample.take(num=2)

[1, 2]

## CSV 파일 불러오기

In [110]:
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/data/flight_small.csv'
flights = my_spark.read.option('header', 'true').csv(DATA_PATH)
flights.show(2)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
only showing top 2 rows



- flights 데이터프레임을 default 데이터베이스에 추가

In [111]:
flights.createOrReplaceTempView('flights')

- default 데이터베이스에 데이터가 추가가 되었는지 확인

In [112]:
my_spark.catalog.listTables('default')

[Table(name='flights', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

- 데이터 조회할 때, SQL을 통해서 조회가 가능하다.


In [113]:
my_spark.sql('SHOW TABLES FROM default').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |  flights|       true|
+---------+---------+-----------+



In [114]:
query = 'SELECT * FROM flights LIMIT 10'
# my_spark.sql(query).show()

flights10 = my_spark.sql(query)
flights10.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
|2014|    1| 15|    1037|        7|    1

- origin, dest GroupBy 연산 데이터갯수 확인
- pandas 데이터프레임으로 변환 (메서드 찾아보기)
- (~10:25)

In [115]:
query = """
  SELECT origin, dest, COUNT(*) AS CNT
  FROM flights
  GROUP BY origin, dest
"""

result_spark = my_spark.sql(query)
# result.show()

# pandas 데이터프레임으로 변환
result_pd = result_spark.toPandas()
result_pd.head()

Unnamed: 0,origin,dest,CNT
0,SEA,RNO,8
1,SEA,DTW,98
2,SEA,CLE,2
3,SEA,LAX,450
4,PDX,SEA,144


In [116]:
# Pandas 데이터프레임에서 Spark로 변환
import pandas as pd
import numpy as np

# Generate a pandas DataFrame
pdf = pd.DataFrame(np.random.rand(100, 3))

# Create a Spark DataFrame from a pandas DataFrame using Arrow
df = my_spark.createDataFrame(pdf)
df.show()

+-------------------+--------------------+--------------------+
|                  0|                   1|                   2|
+-------------------+--------------------+--------------------+
|  0.966983433828194| 0.17517644552964262|  0.2257683510254349|
| 0.9684235605829024|  0.1756006548205209| 0.40752105338438405|
| 0.2951572979042393| 0.36839408968493137|  0.7405579846651007|
| 0.5332600740766239| 0.41079297821442784|  0.8946642530350235|
|0.29568051745617707|  0.6476984072763093|  0.5005127774528387|
| 0.4167927968754841| 0.11769242203512709| 0.26812321320060195|
|0.13974871671105116|  0.7443393497702879|  0.9683186028479476|
| 0.8429900252914837|  0.8720714761315832|  0.1508015466008893|
|0.07848293753836022|  0.3892279823956193|   0.520207607507914|
| 0.5474799402006345|0.005143561012568...|  0.8802590951962146|
|  0.418654622713422|  0.5266606225437571|  0.3842009837471515|
|0.13040291713115204| 0.31779945283456945|  0.4844490376296867|
| 0.5171190223993922|  0.862971756680942

In [117]:
# my_spark.stop()

## 정리할 시간
- 첫번째 : Spark 세션 생성
- 두번째 : 임의의 pandas 데이터프레임 생성
- 세번째 : Spark 데이터프레임으로 변환
- 네번째 : 변환된 데이터프레임을 데이터베이스에 추가
- 마지막 : listTables() 확인

## Chapter 3장

In [118]:
# Databricks notebook source
# MAGIC
# MAGIC %md
# MAGIC # Example 3.7

# from pyspark.sql.types import *
from pyspark.sql.types import *
from pyspark.sql.functions import col, expr, when, concat, lit
from pyspark.sql import SparkSession

# define schema for our data
schema = (StructType([
   StructField("Id", IntegerType(), False),
   StructField("First", StringType(), False),
   StructField("Last", StringType(), False),
   StructField("Url", StringType(), False),
   StructField("Published", StringType(), False),
   StructField("Hits", IntegerType(), False),
   StructField("Campaigns", ArrayType(StringType()), False)]))

ddl_schema = "`Id` INT,`First` STRING,`Last` STRING,`Url` STRING,`Published` STRING,`Hits` INT,`Campaigns` ARRAY<STRING>"

# create our data
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
       [2, "Brooke","Wenig","https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
       [3, "Denny", "Lee", "https://tinyurl.3","6/7/2019",7659, ["web", "twitter", "FB", "LinkedIn"]],
       [4, "Tathagata", "Das","https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
       [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
       [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]]
      ]

# create a DataFrame using the schema defined above
spark = SparkSession.builder.master("local[1]").appName("SampleTutorial").getOrCreate()

blogs_df = spark.createDataFrame(data, ddl_schema)
blogs_df.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [119]:
blogs_df2 = spark.createDataFrame(data, schema)
blogs_df2.show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+



In [120]:
blogs_df2.printSchema()

root
 |-- Id: integer (nullable = false)
 |-- First: string (nullable = false)
 |-- Last: string (nullable = false)
 |-- Url: string (nullable = false)
 |-- Published: string (nullable = false)
 |-- Hits: integer (nullable = false)
 |-- Campaigns: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [121]:
# Create pd_temp
pd_temp = pd.DataFrame(np.random.random(10))

# Create spark_temp from pd_temp
spark_temp = my_spark.createDataFrame(pd_temp)

# Examine the tables in the catalog
print(my_spark.catalog.listTables())

# Add spark_temp to the catalog
spark_temp.createOrReplaceTempView("temp")

# Examine the tables in the catalog again
print(my_spark.catalog.listTables())

[Table(name='flights', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]
[Table(name='flights', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True), Table(name='temp', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]


In [122]:
# default DB에 테이블이 2개 존재
# 그 중에서 내가 원하는 테이블 취사 선택하는 예제
flights_2 = my_spark.table('flights')
flights_2.show()

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
|2014|    1| 15|    1037|        7|    1

## Spark 문법 활용 데이터 가공

In [123]:
# 데이터 컬럼 추가
# result = flights_2.
# flights_2['새로운'] = flights_2['air_time'] / 60
flights_2 = flights_2.withColumn("duration_hrs", flights_2.air_time/60)
flights_2.show(1)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|duration_hrs|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|         2.2|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
only showing top 1 row



- 데이터를 필터링하는 코드 작성

In [124]:
result = flights_2.filter("distance >= 1000")
result.show(1)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|duration_hrs|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|         6.0|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
only showing top 1 row



In [125]:
result2 = flights_2.filter(flights_2.distance > 1000)
result2.show(1)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|duration_hrs|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|         6.0|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+------------+
only showing top 1 row



In [126]:
# 변수 선택, tailnum, origin, dest
result3 = flights_2.select("tailnum", "origin", "dest")
result3.show(1)

+-------+------+----+
|tailnum|origin|dest|
+-------+------+----+
| N846VA|   SEA| LAX|
+-------+------+----+
only showing top 1 row



In [127]:
result4 = flights_2.select(flights_2.tailnum, flights_2.origin, flights_2.dest)
result4.show(1)

+-------+------+----+
|tailnum|origin|dest|
+-------+------+----+
| N846VA|   SEA| LAX|
+-------+------+----+
only showing top 1 row



In [128]:
# 다중 필터 조건 걸어보기
filterA = flights_2.origin == "SEA"
filterB = flights_2.dest == "PDX"

tempA = result4.origin == "SEA"

print(type(filterA))
print(type(tempA))

selected = result4.filter(filterA).filter(filterB)
selected.show()

<class 'pyspark.sql.column.Column'>
<class 'pyspark.sql.column.Column'>
+-------+------+----+
|tailnum|origin|dest|
+-------+------+----+
| N810SK|   SEA| PDX|
| N822SK|   SEA| PDX|
| N586SW|   SEA| PDX|
| N223SW|   SEA| PDX|
| N580SW|   SEA| PDX|
| N520AS|   SEA| PDX|
| N809SK|   SEA| PDX|
| N295SW|   SEA| PDX|
| N221SW|   SEA| PDX|
| N294SW|   SEA| PDX|
| N581SW|   SEA| PDX|
| N563SW|   SEA| PDX|
| N297SW|   SEA| PDX|
| N564SW|   SEA| PDX|
| N468AS|   SEA| PDX|
| N229SW|   SEA| PDX|
| N565SW|   SEA| PDX|
| N580SW|   SEA| PDX|
| N817SK|   SEA| PDX|
| N564SW|   SEA| PDX|
+-------+------+----+
only showing top 20 rows



In [129]:
# avg_speed
avg_speed = (flights_2.distance/(flights_2.air_time/60)).alias("avg_speed")
speed_df = flights.select("origin", "dest", "tailnum", avg_speed)
speed_df.show()

+------+----+-------+------------------+
|origin|dest|tailnum|         avg_speed|
+------+----+-------+------------------+
|   SEA| LAX| N846VA| 433.6363636363636|
|   SEA| HNL| N559AS| 446.1666666666667|
|   SEA| SFO| N847VA|367.02702702702703|
|   PDX| SJC| N360SW| 411.3253012048193|
|   SEA| BUR| N612AS| 442.6771653543307|
|   PDX| DEN| N646SW|491.40495867768595|
|   PDX| OAK| N422WN|             362.0|
|   SEA| SFO| N361VA| 415.7142857142857|
|   SEA| SAN| N309AS| 466.6666666666667|
|   SEA| ORD| N564AS| 521.5151515151515|
|   SEA| LAX| N323AS| 440.3076923076923|
|   SEA| PHX| N305AS|431.29870129870125|
|   SEA| LAS| N433AS| 409.6062992125984|
|   SEA| ANC| N765AS|474.75409836065575|
|   SEA| SFO| N713AS| 315.8139534883721|
|   PDX| SFO| N27205| 366.6666666666667|
|   SEA| SMF| N626AS|477.63157894736844|
|   SEA| MDW| N8634A|481.38888888888886|
|   SEA| BOS| N597AS| 516.4137931034483|
|   PDX| BUR| N215AG| 441.6216216216216|
+------+----+-------+------------------+
only showing top

In [130]:
speed_df2 = flights.selectExpr("origin", "dest", "tailnum", "distance/(air_time/60) AS avg_speed")
speed_df2.show()

+------+----+-------+------------------+
|origin|dest|tailnum|         avg_speed|
+------+----+-------+------------------+
|   SEA| LAX| N846VA| 433.6363636363636|
|   SEA| HNL| N559AS| 446.1666666666667|
|   SEA| SFO| N847VA|367.02702702702703|
|   PDX| SJC| N360SW| 411.3253012048193|
|   SEA| BUR| N612AS| 442.6771653543307|
|   PDX| DEN| N646SW|491.40495867768595|
|   PDX| OAK| N422WN|             362.0|
|   SEA| SFO| N361VA| 415.7142857142857|
|   SEA| SAN| N309AS| 466.6666666666667|
|   SEA| ORD| N564AS| 521.5151515151515|
|   SEA| LAX| N323AS| 440.3076923076923|
|   SEA| PHX| N305AS|431.29870129870125|
|   SEA| LAS| N433AS| 409.6062992125984|
|   SEA| ANC| N765AS|474.75409836065575|
|   SEA| SFO| N713AS| 315.8139534883721|
|   PDX| SFO| N27205| 366.6666666666667|
|   SEA| SMF| N626AS|477.63157894736844|
|   SEA| MDW| N8634A|481.38888888888886|
|   SEA| BOS| N597AS| 516.4137931034483|
|   PDX| BUR| N215AG| 441.6216216216216|
+------+----+-------+------------------+
only showing top

## 집계함수
- groupby() + 집계함수

In [131]:
flights_2.printSchema()

root
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- dep_time: string (nullable = true)
 |-- dep_delay: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- arr_delay: string (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- duration_hrs: double (nullable = true)



In [132]:
# distance String --> int
flights_2 = flights_2.withColumn('distance', flights_2.distance.cast('int'))
flights_2.printSchema()

root
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- dep_time: string (nullable = true)
 |-- dep_delay: string (nullable = true)
 |-- arr_time: string (nullable = true)
 |-- arr_delay: string (nullable = true)
 |-- carrier: string (nullable = true)
 |-- tailnum: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- origin: string (nullable = true)
 |-- dest: string (nullable = true)
 |-- air_time: string (nullable = true)
 |-- distance: integer (nullable = true)
 |-- hour: string (nullable = true)
 |-- minute: string (nullable = true)
 |-- duration_hrs: double (nullable = true)



In [133]:
# origin, PDX 추출, groupby, distance 최소
flights_2.filter(flights_2.origin == "PDX").groupby("tailnum").sum("distance").show()

+-------+-------------+
|tailnum|sum(distance)|
+-------+-------------+
| N442AS|         8167|
| N36472|         5311|
| N567AA|         1616|
| N38451|         1739|
| N516UA|          991|
| N927DN|         1426|
| N954WN|         2103|
| N73283|          550|
| N102UW|         2282|
| N607AS|        11183|
| N622SW|         1199|
| N584AS|        13063|
| N914WN|          991|
| N445WN|          543|
| N3LDAA|         1739|
| N389HA|         5206|
| N578SW|          351|
| N430WN|         2518|
| N651SW|         1009|
| N611SW|         1772|
+-------+-------------+
only showing top 20 rows



In [134]:
# carrier
result = (flights_2
          .filter(flights_2.origin == "PDX")
          .filter(flights_2.carrier == "DL")
          .groupby("dest")
          .avg("distance"))
result.show()

+----+-------------+
|dest|avg(distance)|
+----+-------------+
| ATL|       2172.0|
| DTW|       1953.0|
| MSP|       1426.0|
| JFK|       2454.0|
| SLC|        630.0|
+----+-------------+



In [135]:
# carrier
result = (flights_2
          .filter(flights_2.origin == "PDX")
          .where(col("carrier") == "DL") # filter(flights_2.carrier == "DL")
          .groupby("dest")
          .avg("distance"))
result.show()

+----+-------------+
|dest|avg(distance)|
+----+-------------+
| ATL|       2172.0|
| DTW|       1953.0|
| MSP|       1426.0|
| JFK|       2454.0|
| SLC|        630.0|
+----+-------------+



In [136]:
import pyspark.sql.functions as F

flights_2= flights_2.withColumn('dep_delay', flights_2.distance.cast('int'))
by_month_dest = flights_2.groupBy("month", "dest")
by_month_dest.avg("dep_delay").show()

+-----+----+------------------+
|month|dest|    avg(dep_delay)|
+-----+----+------------------+
|   11| TUS|1183.6666666666667|
|   11| ANC|1453.5294117647059|
|    1| BUR|             877.0|
|    1| PDX|             129.0|
|    6| SBA|             877.0|
|    5| LAX| 919.7142857142857|
|   10| DTW|            1927.0|
|    6| SIT|             861.0|
|   10| DFW|1647.0588235294117|
|    3| FAI|            1533.0|
|   10| SEA|             129.0|
|    2| TUS|            1119.0|
|   12| OGG| 2625.818181818182|
|    9| DFW|1645.3333333333333|
|    5| EWR|2404.6666666666665|
|    3| RDM|             116.0|
|    8| DCA|            2335.3|
|    7| ATL|2178.2162162162163|
|    4| JFK|2431.1428571428573|
|   10| SNA| 962.1333333333333|
+-----+----+------------------+
only showing top 20 rows



In [137]:
by_month_dest.agg(F.stddev("dep_delay")).show()

+-----+----+------------------+
|month|dest| stddev(dep_delay)|
+-----+----+------------------+
|   11| TUS|  56.0029761113937|
|   11| ANC|22.450261919397587|
|    1| BUR| 61.55870112510924|
|    1| PDX|               0.0|
|    6| SBA| 62.00000000000001|
|    5| LAX| 54.56595650240686|
|   10| DTW|               0.0|
|    6| SIT|              NULL|
|   10| DFW|20.349880762766734|
|    3| FAI|               0.0|
|   10| SEA|               0.0|
|    2| TUS|               0.0|
|   12| OGG|31.552553563279734|
|    9| DFW|21.096385265356872|
|    5| EWR| 9.237604307033997|
|    3| RDM|               0.0|
|    8| DCA|10.143963722332614|
|    7| ATL| 4.916723926983699|
|    4| JFK|15.001831390031716|
|   10| SNA| 41.87202725766531|
+-----+----+------------------+
only showing top 20 rows



## 테이블 조인

In [138]:
my_spark.stop()

In [139]:
from pyspark.sql import SparkSession
my_spark = SparkSession.builder.master("local[1]").appName("SampleTutorial").getOrCreate()

DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/data/'

airports = my_spark.read.csv(DATA_PATH + "airports.csv", header=True)
airports.show(1)

flights = my_spark.read.csv(DATA_PATH + "flight_small.csv", header=True)
flights.show(1)

planes = my_spark.read.csv(DATA_PATH + "planes.csv", header=True)
planes.show(1)

+---+-----------------+----------+-----------+----+---+---+
|faa|             name|       lat|        lon| alt| tz|dst|
+---+-----------------+----------+-----------+----+---+---+
|04G|Lansdowne Airport|41.1304722|-80.6195833|1044| -5|  A|
+---+-----------------+----------+-----------+----+---+---+
only showing top 1 row

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
only showing top 1 row

+-------+----+--------------------+----------------+

In [140]:
airports.show(10)

+---+--------------------+----------+------------+----+---+---+
|faa|                name|       lat|         lon| alt| tz|dst|
+---+--------------------+----------+------------+----+---+---+
|04G|   Lansdowne Airport|41.1304722| -80.6195833|1044| -5|  A|
|06A|Moton Field Munic...|32.4605722| -85.6800278| 264| -5|  A|
|06C| Schaumburg Regional|41.9893408| -88.1012428| 801| -6|  A|
|06N|     Randall Airport| 41.431912| -74.3915611| 523| -5|  A|
|09J|Jekyll Island Air...|31.0744722| -81.4277778|  11| -4|  A|
|0A9|Elizabethton Muni...|36.3712222| -82.1734167|1593| -4|  A|
|0G6|Williams County A...|41.4673056| -84.5067778| 730| -5|  A|
|0G7|Finger Lakes Regi...|42.8835647| -76.7812318| 492| -5|  A|
|0P2|Shoestring Aviati...|39.7948244| -76.6471914|1000| -5|  U|
|0S9|Jefferson County ...|48.0538086|-122.8106436| 108| -8|  A|
+---+--------------------+----------+------------+----+---+---+
only showing top 10 rows



In [141]:
flights.show(10)

+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|dest|air_time|distance|hour|minute|
+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+----+--------+--------+----+------+
|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA| LAX|     132|     954|   6|    58|
|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA| HNL|     360|    2677|  10|    40|
|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA| SFO|     111|     679|  14|    43|
|2014|    4|  9|    1705|       45|    1839|       34|     WN| N360SW|   344|   PDX| SJC|      83|     569|  17|     5|
|2014|    3|  9|     754|       -1|    1015|        1|     AS| N612AS|   522|   SEA| BUR|     127|     937|   7|    54|
|2014|    1| 15|    1037|        7|    1

In [142]:
# 컬럼명 변경
airports = airports.withColumnRenamed("faa", "dest")
airports.show(1)

+----+-----------------+----------+-----------+----+---+---+
|dest|             name|       lat|        lon| alt| tz|dst|
+----+-----------------+----------+-----------+----+---+---+
| 04G|Lansdowne Airport|41.1304722|-80.6195833|1044| -5|  A|
+----+-----------------+----------+-----------+----+---+---+
only showing top 1 row



In [143]:
result = flights.join(airports, on = 'dest', how="leftouter")
result.show()

+----+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+--------+--------+----+------+--------------------+---------+-----------+----+---+---+
|dest|year|month|day|dep_time|dep_delay|arr_time|arr_delay|carrier|tailnum|flight|origin|air_time|distance|hour|minute|                name|      lat|        lon| alt| tz|dst|
+----+----+-----+---+--------+---------+--------+---------+-------+-------+------+------+--------+--------+----+------+--------------------+---------+-----------+----+---+---+
| LAX|2014|   12|  8|     658|       -7|     935|       -5|     VX| N846VA|  1780|   SEA|     132|     954|   6|    58|    Los Angeles Intl|33.942536|-118.408075| 126| -8|  A|
| HNL|2014|    1| 22|    1040|        5|    1505|        5|     AS| N559AS|   851|   SEA|     360|    2677|  10|    40|       Honolulu Intl|21.318681|-157.922428|  13|-10|  N|
| SFO|2014|    3|  9|    1443|       -2|    1652|        2|     VX| N847VA|   755|   SEA|     111|     679|  14|    43| 

In [144]:
my_spark.stop()

## zip 파일 데이터 불러오기

In [145]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName("Sample").getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x7aea5a36db10>


In [146]:
import pyspark.sql.functions as F

# CSV 파일 불러오기
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/data/'
aa_dfw_2014 = spark.read.format("csv").options(Header=True).load(DATA_PATH + "AA_DFW_2014_Departures_Short.csv.gz")
aa_dfw_2014.show(1)

+-----------------+-------------+-------------------+-----------------------------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|
+-----------------+-------------+-------------------+-----------------------------+
|       01/01/2014|         0005|                HNL|                          519|
+-----------------+-------------+-------------------+-----------------------------+
only showing top 1 row



In [147]:
aa_dfw_2014 = aa_dfw_2014.withColumn('airport', F.lower(aa_dfw_2014['Destination Airport']))
aa_dfw_2014.show(1)

+-----------------+-------------+-------------------+-----------------------------+-------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|airport|
+-----------------+-------------+-------------------+-----------------------------+-------+
|       01/01/2014|         0005|                HNL|                          519|    hnl|
+-----------------+-------------+-------------------+-----------------------------+-------+
only showing top 1 row



In [148]:
# 특정 컬럼 삭제
aa_dfw_2014 = aa_dfw_2014.drop(aa_dfw_2014['Destination Airport'])
aa_dfw_2014.show()

+-----------------+-------------+-----------------------------+-------+
|Date (MM/DD/YYYY)|Flight Number|Actual elapsed time (Minutes)|airport|
+-----------------+-------------+-----------------------------+-------+
|       01/01/2014|         0005|                          519|    hnl|
|       01/01/2014|         0007|                          505|    ogg|
|       01/01/2014|         0035|                          174|    slc|
|       01/01/2014|         0043|                          153|    dtw|
|       01/01/2014|         0052|                          137|    pit|
|       01/01/2014|         0058|                          174|    san|
|       01/01/2014|         0060|                          155|    mia|
|       01/01/2014|         0064|                          185|    jfk|
|       01/01/2014|         0090|                          126|    ord|
|       01/01/2014|         0096|                           91|    stl|
|       01/01/2014|         0099|                          182| 

In [149]:
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
import pyarrow as pa

df = pd.DataFrame({'one': [-1, 100, 2.5],
                   'two': ['foo', 'bar', 'baz'],
                   'three': [True, False, True]}, index=list('abc'))

table = pa.Table.from_pandas(df)
pq.write_table(table, DATA_PATH + 'example.parquet')

- 데이터 불러오기

In [150]:
example_df = spark.read.parquet(DATA_PATH + 'example.parquet')
example_df.show(1)

+----+---+-----+-----------------+
| one|two|three|__index_level_0__|
+----+---+-----+-----------------+
|-1.0|foo| true|                a|
+----+---+-----+-----------------+
only showing top 1 row



In [151]:
voter_df = spark.read.format('csv').options(Header=True).load(DATA_PATH + 'DallasCouncilVoters.csv.gz')
voter_df.show()

+----------+-------------+-------------------+
|      DATE|        TITLE|         VOTER_NAME|
+----------+-------------+-------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|
|02/08/2017|Councilmember| Philip T. Kingston|
|02/08/2017|        Mayor|Michael S. Rawlings|
|02/08/2017|Councilmember|       Adam Medrano|
|02/08/2017|Councilmember|       Casey Thomas|
|02/08/2017|Councilmember|Carolyn King Arnold|
|02/08/2017|Councilmember|       Scott Griggs|
|02/08/2017|Councilmember|   B. Adam  McGough|
|02/08/2017|Councilmember|       Lee Kleinman|
|02/08/2017|Councilmember|      Sandy Greyson|
|02/08/2017|Councilmember|  Jennifer S. Gates|
|02/08/2017|Councilmember| Philip T. Kingston|
|02/08/2017|        Mayor|Michael S. Rawlings|
|02/08/2017|Councilmember|       Adam Medrano|
|02/08/2017|Councilmember|       Casey Thomas|
|02/08/2017|Councilmember|Carolyn King Arnold|
|02/08/2017|Councilmember| Rickey D. Callahan|
|01/11/2017|Councilmember|  Jennifer S. Gates|
|04/25/2018|C

In [152]:
# 중복값 제거
voter_df.select(voter_df['VOTER_NAME']).distinct().show(10)

+--------------------+
|          VOTER_NAME|
+--------------------+
|      Tennell Atkins|
|  the  final   20...|
|        Scott Griggs|
|       Scott  Griggs|
|       Sandy Greyson|
| Michael S. Rawlings|
| the final 2018 A...|
|        Kevin Felder|
|        Adam Medrano|
|       Casey  Thomas|
+--------------------+
only showing top 10 rows



### 문제
- 객체명 : result
- 메서드 filter() 사용
- length() 활용해서  0보다 크고, 20보다 작은 VOTER_NAME을 가져온다.
- 결과

In [153]:
voter_df = voter_df.filter('length(VOTER_NAME) > 0 and length(VOTER_NAME) < 20')
voter_df.select('VOTER_NAME').distinct().show(10, truncate=False)

+-------------------+
|VOTER_NAME         |
+-------------------+
|Tennell Atkins     |
|Scott Griggs       |
|Scott  Griggs      |
|Sandy Greyson      |
|Michael S. Rawlings|
|Kevin Felder       |
|Adam Medrano       |
|Casey  Thomas      |
|011018__42         |
|Mark  Clayton      |
+-------------------+
only showing top 10 rows



- 밑줄이 그어진 행 `011018__42`은 제거를 하자.

In [154]:
voter_df = voter_df.filter(~F.col('VOTER_NAME').contains('_'))
voter_df.select('VOTER_NAME').distinct().show(10, truncate=False)

+-------------------+
|VOTER_NAME         |
+-------------------+
|Tennell Atkins     |
|Scott Griggs       |
|Scott  Griggs      |
|Sandy Greyson      |
|Michael S. Rawlings|
|Kevin Felder       |
|Adam Medrano       |
|Casey  Thomas      |
|Mark  Clayton      |
|Casey Thomas       |
+-------------------+
only showing top 10 rows



## Spark 데이터프레임 수정
- 문자열 다루는 코드

In [155]:
# 정규표현식('\s+') : 공백을 기준으로 문자열을 분리 하겠다.
voter_df = voter_df.withColumn('splits', F.split(voter_df.VOTER_NAME, '\s+'))
voter_df = voter_df.withColumn('first_name', voter_df.splits.getItem(0))
voter_df = voter_df.withColumn('last_name', voter_df.splits.getItem(F.size('splits') - 1))
voter_df = voter_df.drop('splits')
voter_df.show(1)

+----------+-------------+-----------------+----------+---------+
|      DATE|        TITLE|       VOTER_NAME|first_name|last_name|
+----------+-------------+-----------------+----------+---------+
|02/08/2017|Councilmember|Jennifer S. Gates|  Jennifer|    Gates|
+----------+-------------+-----------------+----------+---------+
only showing top 1 row





## when() 조건문 활용법

In [156]:
import pyspark.sql.functions as F

# Councilmember 일 경우에는무작위 숫자를 입력
voter_df1 = voter_df.withColumn('random_val', F.when(voter_df.TITLE == 'Councilmember', F.rand()).otherwise('not yet'))
voter_df1.show()

+----------+-------------+-------------------+----------+---------+--------------------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|          random_val|
+----------+-------------+-------------------+----------+---------+--------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates| 0.17946043145892765|
|02/08/2017|Councilmember| Philip T. Kingston|    Philip| Kingston| 0.33549387996047897|
|02/08/2017|        Mayor|Michael S. Rawlings|   Michael| Rawlings|             not yet|
|02/08/2017|Councilmember|       Adam Medrano|      Adam|  Medrano|  0.6072084589512748|
|02/08/2017|Councilmember|       Casey Thomas|     Casey|   Thomas|  0.4640569506079616|
|02/08/2017|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold| 0.38392875927146886|
|02/08/2017|Councilmember|       Scott Griggs|     Scott|   Griggs| 0.26665542846998636|
|02/08/2017|Councilmember|   B. Adam  McGough|        B.|  McGough|  0.7936125936162445|
|02/08/2017|Councilme

### 다중조건문
- Councilmember일 경우에는 무작위 숫자
- Mayer일 경우에는 숫자 2를 입력
- 그 외에 나머지는 0으로 입력해라 (otherwise)

F.when(조건문, 결과값).when(조건문, 결과값).otherwise()

In [157]:
voter_df2 = voter_df.withColumn('random_val', F.when(voter_df.TITLE == 'Councilmember', F.rand()).when(voter_df.TITLE == 'Mayor', 2).otherwise(0))
voter_df2.show()

+----------+-------------+-------------------+----------+---------+--------------------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|          random_val|
+----------+-------------+-------------------+----------+---------+--------------------+
|02/08/2017|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|0.002195911511971...|
|02/08/2017|Councilmember| Philip T. Kingston|    Philip| Kingston|  0.5668929687744322|
|02/08/2017|        Mayor|Michael S. Rawlings|   Michael| Rawlings|                 2.0|
|02/08/2017|Councilmember|       Adam Medrano|      Adam|  Medrano|  0.7003372430701884|
|02/08/2017|Councilmember|       Casey Thomas|     Casey|   Thomas|  0.7945008932685371|
|02/08/2017|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|   0.672792663744973|
|02/08/2017|Councilmember|       Scott Griggs|     Scott|   Griggs|  0.6388327425636032|
|02/08/2017|Councilmember|   B. Adam  McGough|        B.|  McGough|  0.7267561268900237|
|02/08/2017|Councilme

### 행 추출
- random_val 중에서 0인 경우에만
- filter()

In [159]:
voter_df3 = voter_df2.filter(voter_df2.random_val == 0)
voter_df3.show()

+----------+--------------------+-----------------+----------+---------+----------+
|      DATE|               TITLE|       VOTER_NAME|first_name|last_name|random_val|
+----------+--------------------+-----------------+----------+---------+----------+
|04/25/2018|Deputy Mayor Pro Tem|     Adam Medrano|      Adam|  Medrano|       0.0|
|04/25/2018|       Mayor Pro Tem|Dwaine R. Caraway|    Dwaine|  Caraway|       0.0|
|06/20/2018|Deputy Mayor Pro Tem|     Adam Medrano|      Adam|  Medrano|       0.0|
|06/20/2018|       Mayor Pro Tem|Dwaine R. Caraway|    Dwaine|  Caraway|       0.0|
|06/20/2018|Deputy Mayor Pro Tem|     Adam Medrano|      Adam|  Medrano|       0.0|
|06/20/2018|       Mayor Pro Tem|Dwaine R. Caraway|    Dwaine|  Caraway|       0.0|
|08/15/2018|Deputy Mayor Pro Tem|     Adam Medrano|      Adam|  Medrano|       0.0|
|08/15/2018|Deputy Mayor Pro Tem|     Adam Medrano|      Adam|  Medrano|       0.0|
|09/18/2018|Deputy Mayor Pro Tem|     Adam Medrano|      Adam|  Medrano|    

## 사용자 정의 함수

In [162]:
voter_df = voter_df.withColumn('splits', F.split(voter_df.VOTER_NAME, '\s+'))
voter_df.show(2)

+----------+-------------+------------------+----------+---------+--------------------+
|      DATE|        TITLE|        VOTER_NAME|first_name|last_name|              splits|
+----------+-------------+------------------+----------+---------+--------------------+
|02/08/2017|Councilmember| Jennifer S. Gates|  Jennifer|    Gates|[Jennifer, S., Ga...|
|02/08/2017|Councilmember|Philip T. Kingston|    Philip| Kingston|[Philip, T., King...|
+----------+-------------+------------------+----------+---------+--------------------+
only showing top 2 rows



In [169]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf

@udf(returnType = StringType())
def getFirstAndMiddle(names):
  return ' '.join(names[:-1])

voter_df = voter_df.withColumn('result', getFirstAndMiddle(voter_df.splits))
voter_df.show(2)

+----------+-------------+------------------+----------+---------+--------------------+-----------+
|      DATE|        TITLE|        VOTER_NAME|first_name|last_name|              splits|     result|
+----------+-------------+------------------+----------+---------+--------------------+-----------+
|02/08/2017|Councilmember| Jennifer S. Gates|  Jennifer|    Gates|[Jennifer, S., Ga...|Jennifer S.|
|02/08/2017|Councilmember|Philip T. Kingston|    Philip| Kingston|[Philip, T., King...|  Philip T.|
+----------+-------------+------------------+----------+---------+--------------------+-----------+
only showing top 2 rows



## ROW_ID 추가
- 기존 데이터프레임에서 ROW_ID 추가

In [179]:
voter_df = voter_df.withColumn('ROW_ID', F.monotonically_increasing_id())
voter_df.show()

+----------+-------------+-------------------+----------+---------+--------------------+------------+------+
|      DATE|        TITLE|         VOTER_NAME|first_name|last_name|              splits|      result|ROW_ID|
+----------+-------------+-------------------+----------+---------+--------------------+------------+------+
|02/08/2017|Councilmember|  Jennifer S. Gates|  Jennifer|    Gates|[Jennifer, S., Ga...| Jennifer S.|     0|
|02/08/2017|Councilmember| Philip T. Kingston|    Philip| Kingston|[Philip, T., King...|   Philip T.|     1|
|02/08/2017|        Mayor|Michael S. Rawlings|   Michael| Rawlings|[Michael, S., Raw...|  Michael S.|     2|
|02/08/2017|Councilmember|       Adam Medrano|      Adam|  Medrano|     [Adam, Medrano]|        Adam|     3|
|02/08/2017|Councilmember|       Casey Thomas|     Casey|   Thomas|     [Casey, Thomas]|       Casey|     4|
|02/08/2017|Councilmember|Carolyn King Arnold|   Carolyn|   Arnold|[Carolyn, King, A...|Carolyn King|     5|
|02/08/2017|Council

In [180]:
result_df = voter_df.select(voter_df['VOTER_NAME']).distinct()
result_df.count()

27

In [182]:
result_df = result_df.withColumn('ROW_ID', F.monotonically_increasing_id() + 1)
result_df.orderBy(result_df.ROW_ID.desc()).show(10)



+-------------------+------+
|         VOTER_NAME|ROW_ID|
+-------------------+------+
|       Lee Kleinman|    27|
|        Erik Wilson|    26|
|Carolyn King Arnold|    25|
|Rickey D.  Callahan|    24|
|   Monica R. Alonzo|    23|
|    Lee M. Kleinman|    22|
|  Jennifer S. Gates|    21|
|Philip T.  Kingston|    20|
|  Dwaine R. Caraway|    19|
| Rickey D. Callahan|    18|
+-------------------+------+
only showing top 10 rows



In [183]:
result_df.select('ROW_ID').rdd.max()[0]

27

## 속도 측정

In [186]:
import time
import pyspark.sql.functions as F
departures_df = spark.read.format('csv').options(Header=True).load('/content/drive/MyDrive/Colab Notebooks/data/AA_DFW_2014_Departures_Short.csv.gz')
departures_df.show(1)

+-----------------+-------------+-------------------+-----------------------------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|
+-----------------+-------------+-------------------+-----------------------------+
|       01/01/2014|         0005|                HNL|                          519|
+-----------------+-------------+-------------------+-----------------------------+
only showing top 1 row



In [188]:
start_time = time.time()

# cache() 메서드를 활용해서 속도가 실제로 향상이 되는지 확인한다.
# 처음 메서드 실행할 때와, 재 실행할 때랑 차이가 좀 나더라.

departures_df = departures_df.distinct().cache()
print(departures_df.count(), time.time() - start_time) # 9.744 seconds

157198 9.497545957565308


In [189]:
start_time = time.time()
print(departures_df.count(), time.time() - start_time) # 2.035 seconds

157198 2.035612106323242


In [190]:
# cache에 저장이 되었는지 확인하자.
departures_df.is_cached

True

In [191]:
# cache 메모리에 있는 것을 제거하자
departures_df.unpersist()

DataFrame[Date (MM/DD/YYYY): string, Flight Number: string, Destination Airport: string, Actual elapsed time (Minutes): string]

In [192]:
departures_df.is_cached

False

In [193]:
app_name = spark.conf.get('spark.app.name')
driver_tcp_port = spark.conf.get('spark.driver.port')
num_partitions = spark.conf.get('spark.sql.shuffle.partitions')
print("Number of partitions: %s" % num_partitions)

print("Name: %s" % app_name)
print("Driver TCP port: %s" % driver_tcp_port)
print("Number of partitions: %s" % num_partitions)

Number of partitions: 200
Name: Sample
Driver TCP port: 37951
Number of partitions: 200
