In [0]:
# Dataset
# DataFrame
# Sql table & view

# Spark : 트랜스포메이션의 처리과정을 정의하는 분산 프로그래밍 모델. 다수의 트랜스포메이션은 DAG로 표현되는 명령을 만들고, 액션은 하나의 잡을 클러스터에서 실행하기 위해 스테이지와 태스크로 나누며 DAG처리 프로세스를 실행함. 트랜스포메이션과 액션으로 다루는 논리적 구조가 DataFrame과 Dataset.

In [0]:
# DataFrame과 Dataset은 결과를 생성하기 위해 어떤 데이터에 어떤 연산을 적용해야 하는지 정의하는 지연 연산의 실행 계획이며 불변함.
# DataFrame에 액션을 호출하면 스파크는 트랜스포메이션을 실제로 실행하고 결과를 반환함.
# 테이블과 뷰는 DataFrame과 기본적으로 동일 다만, 코드 대신 SQL을 쓰는 차이만 존재
# 카탈리스트 : 실행 계획 수립과 처리에 사용하는 자체 데이터 타입의 정보를 가지고 있으며, 여러 언어 API와 직접 매핑됨


df = spark.range(500).toDF("number")
df.select(df["number"] + 10)

Out[2]: DataFrame[(number + 10): bigint]

In [0]:
# 비 타입형 : DataFrame - 데이터 타입의 일치 여부를 런타임에서 확인 - 파이썬, R
# 타입형 : Dataset - 데이터 타입의 일치 여부를 컴파일에서 확인 - 스칼라와 자바에서만 지원


# 실행과정 
1. Dataset/DataFrame/SQL을 이용해 코드 작성
2. 논리적 실행계획으로 변경
3. 물리적 실행계획으로 변경(이때 추가적인 최적화 실행[카탈리스트 옵티마이저])
4. 물리적 실행계획(RDD) 실행

In [0]:
# 구조적 API 기본 연산

df = spark.read.format("json").load("/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json")

In [0]:
# Schema 확인

df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [0]:
spark.read.format("json").load("/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json")

Out[3]: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [0]:
# DataFrame에 스키마를 만들고 적용하는 예제

from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello":"world"}),
])

df = spark.read.format("json").schema(myManualSchema)\
    .load("/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json")

In [0]:
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [0]:
df.show(3)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
+-----------------+-------------------+-----+
only showing top 3 rows



In [0]:
# Spark 표현식

from pyspark.sql.functions import col, column
col("someColumnName")

Out[9]: Column<'someColumnName'>

In [0]:
df.columns

Out[11]: ['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [0]:
df.first()

Out[12]: Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15)

In [0]:
# dataFrame 생성하기

from pyspark.sql import Row

myRow = Row("Hello", None, 1, False)

In [0]:
myRow[0]
myRow[2]

Out[15]: 1

In [0]:
# dataFrame 만들기

df.createOrReplaceTempView("dfTable")

from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("some", StringType(), True),
    StructField("col", StringType(), True),
    StructField("names", LongType(), False),
])

myRow = Row("Hello", None, 1)
myDf = spark.createDataFrame([myRow], myManualSchema)
myDf.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|null|    1|
+-----+----+-----+



In [0]:
df.select("DEST_COUNTRY_NAME").show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [0]:
df.select("DEST_COUNTRY_NAME", "ORIGIN_COUNTRY_NAME").show(2)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows



In [0]:
# col 객체와 문자열을 섞어쓰면 안됨 # 바뀐듯 #이제 됨
df.select(col("DEST_COUNTRY_NAME"), "ORIGIN_COUNTRY_NAME").show(2)

+-----------------+-------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|
+-----------------+-------------------+
|    United States|            Romania|
|    United States|            Croatia|
+-----------------+-------------------+
only showing top 2 rows



In [0]:
from pyspark.sql.functions import expr, col
df.select(expr("DEST_COUNTRY_NAME as country")).show(2)

+-------------+
|      country|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [0]:
df.select("*").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
df.select(expr("DEST_COUNTRY_NAME as country"), "ORIGIN_COUNTRY_NAME").alias("DEST_COUTRY_NAME").show(2)

+-------------+-------------------+
|      country|ORIGIN_COUNTRY_NAME|
+-------------+-------------------+
|United States|            Romania|
|United States|            Croatia|
+-------------+-------------------+
only showing top 2 rows



In [0]:
df.select("*").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
# selectExpr

df.selectExpr(
    "*",
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry"
).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [0]:
df.selectExpr(
    "avg(count)",
    "count(distinct(DEST_COUNTRY_NAME))"
).show(2)

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



In [0]:
# 리터럴 사용하기
# 어떤 변숫값이 리터럴보다 큰지 확인하는 식으로 사용할 수 있음

from pyspark.sql.functions import lit

df.select(expr("*"), lit(1).alias("ONE")).show(2)

+-----------------+-------------------+-----+---+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|ONE|
+-----------------+-------------------+-----+---+
|    United States|            Romania|   15|  1|
|    United States|            Croatia|    1|  1|
+-----------------+-------------------+-----+---+
only showing top 2 rows



In [0]:
# 칼럼 추가하기 withColumn
# DataFrame 자체가 변하는 것은 아님

df.withColumn("numberOne", lit(1)).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



In [0]:
df.show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
# 응용
df.withColumn("changeCol", expr("DEST_COUNTRY_NAME")).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|    changeCol|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|United States|
|    United States|            Croatia|    1|United States|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [0]:
# 칼럼명 변경하기(2) withColumnRenamed

df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").show(2)

+-------------+-------------------+-----+
|         dest|ORIGIN_COUNTRY_NAME|count|
+-------------+-------------------+-----+
|United States|            Romania|   15|
|United States|            Croatia|    1|
+-------------+-------------------+-----+
only showing top 2 rows



In [0]:
df.show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
df.selectExpr(
    "ORIGIN_COUNTRY_NAME as new col").show(2)

[0;31m---------------------------------------------------------------------------[0m
[0;31mParseException[0m                            Traceback (most recent call last)
[0;32m<command-2070180636502737>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m df.selectExpr(
[0m[1;32m      2[0m     "ORIGIN_COUNTRY_NAME as new col").show(2)

[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py[0m in [0;36mwrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m             [0mstart[0m [0;34m=[0m [0mtime[0m[0;34m.[0m[0mperf_counter[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     47[0m             [0;32mtry[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 48[0;31m                 [0mres[0m [0;34m=[0m [0mfunc[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     49[0m                 logger.log_success(
[1;32m     50[0m                     [0

In [0]:
# escaping character는 hive와 마찬가지로 ` 임    "-" 이나 " "는 예약어임

df.selectExpr(
    "`ORIGIN_COUNTRY_NAME` as `new col`").show(2)

+-------+
|new col|
+-------+
|Romania|
|Croatia|
+-------+
only showing top 2 rows



In [0]:
# spark는 기본적으로 hive와 마찬가지로 대소문자 구분하지 않음
# set spark.sql.caseSensitive True 로 구분할 수 있게 만들 수 있음

In [0]:
# 칼럼 제거

df.drop("ORIGIN_COUNTRY_NAME").columns

Out[60]: ['DEST_COUNTRY_NAME', 'count']

In [0]:
df.show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
# 형변환

df.withColumn("count2", col("count").cast("string"))

Out[65]: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, count2: string]

In [0]:
# filter, where

df.filter(col("count") < 2).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
# and 구문.. 순서와는 상관 없음
df.filter(col("count") < 2).filter(col("ORIGIN_COUNTRY_NAME") != "Croatia").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
# distinct

df.select("ORIGIN_COUNTRY_NAME").distinct().count()

Out[68]: 125

In [0]:
# 무작위 샘플 만들기

seed = 5
withReplacement = False    #복원 추출, 비복원 추출 선택
fraction = 0.5
df.sample(withReplacement, fraction, seed).count()

Out[69]: 138

In [0]:
df.sample(withReplacement, fraction, seed).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
# 임의 분할 하기 # train, test set을 비율대로 리스트에 저장

dataframes = df.randomSplit([0.25, 0.75])

In [0]:
dataframes[0].show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|            Aruba|      United States|  346|
|       Azerbaijan|      United States|   21|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
type(dataframes)

Out[76]: list

In [0]:
# union
from pyspark.sql import Row

schema = df.schema
newRow = [
    Row("new country", "other country", 5),
    Row("new country2", "other country2", 1)
]
pr = spark.sparkContext.parallelize(newRow)
newDF = spark.createDataFrame(pr, schema)


df.union(newDF).where(col("count") == 1).where(col("ORIGIN_COUNTRY_NAME") != "United States").show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
|     new country2|     other country2|    1|
+-----------------+-------------------+-----+



In [0]:
# 정렬

df.sort("count").show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [0]:
df.orderBy("count").show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows



In [0]:
df.orderBy(col("count").desc()).show(2)

+-----------------+-------------------+------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|
+-----------------+-------------------+------+
|    United States|      United States|370002|
|    United States|             Canada|  8483|
+-----------------+-------------------+------+
only showing top 2 rows



In [0]:
df.orderBy(col("count").desc()).orderBy(col("ORIGIN_COUNTRY_NAME").asc()).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|             Angola|   13|
|    United States|           Anguilla|   38|
+-----------------+-------------------+-----+
only showing top 2 rows



In [0]:
# 성능 최적화를 위한 파티션별 정렬

spark.read.format("json")\
    .load("/databricks-datasets/definitive-guide/data/flight-data/json/2015-summary.json")\
    .sortWithinPartitions("count")

Out[90]: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [0]:
# limit

df.orderBy(expr("count desc")).limit(6).show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
+--------------------+-------------------+-----+



In [0]:
# repartition / coalesce
# 향후에 사용할 파티션 수가 현재보다 많거나, 칼럼을 기준으로 파티션을 만드는 경우에 사용

df.rdd.getNumPartitions()

Out[92]: 1

In [0]:
df.repartition(5)

Out[94]: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [0]:
df.repartition(col("count"))

Out[96]: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [0]:
# coalesce : 데이터를 셔플하지 않고 파티션 병함
# ex. 목적지를 기준으로 셔플을 수행해 5개의 파티션으로 나누고, 전체 데이터를 셔플없이 병합

df.repartition(5, col("count")).coalesce(2)

Out[97]: DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]

In [0]:
# 드라이버로 로우 데이터 수집하기
# collect 등을 쓸 때는 주의! resource 부족 뜨기 쉬움

# collect : 전체 DataFrame의 모든 데이터 수집
# take : 상위 N개의 로우 반환
# show : 여러 로우를 보기 좋게 출력

df.show(5)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+
only showing top 5 rows



In [0]:
df.show(5, False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India              |62   |
+-----------------+-------------------+-----+
only showing top 5 rows

