# 스파크 기초

## 2장

## 라이브러리 설치, 기본 세팅

In [3]:
#pip install pyspark
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Plot text settings applied in one call:
#  - font.family: draw plot text with the 'Malgun Gothic' font
#    (presumably chosen so Korean labels render; the font must be installed locally).
#  - axes.unicode_minus: use the ASCII hyphen for minus signs on axes instead of the
#    Unicode minus glyph.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# Pyspark - SQL
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext, Row, SparkSession
from pyspark.sql.functions import mean, col, split, regexp_extract, when, lit

# Pyspark - ML
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, QuantileDiscretizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#sc = SparkContext.getOrCreate() 

## 스파크 세션 실행

In [4]:
# Obtain the SparkSession for this notebook (an existing session is reused if there is one).
# The chain is wrapped in parentheses so no backslash line continuations are needed.
spark = (
    SparkSession.builder
    .appName('Spark_Guide')
    .getOrCreate()
)

### 스파크 세션으로 간단한 데이터 프레임 생성

In [5]:
# Build a DataFrame from spark.range(1000) and name its column "number".
myRange = (
    spark
    .range(1000)
    .toDF("number")
)

### 데이터프레임 트랜스포메이션 (DF 변경)

In [6]:
# Transformation only: keep the rows whose "number" is even.
# filter() and where() are aliases of each other on a PySpark DataFrame.
divisBy2 = myRange.filter("number % 2 = 0")

### 액션 (연산 수행)
- 0~999의 숫자 1000개 중 "number % 2 = 0" 조건(2로 나눈 나머지가 0)을 만족하는 행의 개수를 계산 (액션이 호출되는 시점에 실제 연산이 실행됨)

In [7]:
# Action: count() runs the filter defined above and returns the number of matching rows.
divisBy2.count()

500

### 예제 활용
- 데이터 링크 : https://github.com/FVBros/Spark-The-Definitive-Guide/tree/master/data/flight-data/csv/2015-summary.csv
- 스파크 API는 세션 생성이나 파일 읽기 같은 작업을 "."으로 메서드를 연결하는 체이닝 방식으로 작성하며, 체인을 여러 줄로 나눌 때는 줄 끝에 파이썬의 줄 연결 문자 \를 붙이거나 전체를 괄호로 감쌈

In [8]:
# Load the 2015 flight summary CSV into a DataFrame.
#  - inferSchema: let Spark infer the column types from the data.
#  - header: treat the first line of the file as column names.
# The path is relative to the notebook's working directory.
# Parentheses are used here instead of trailing backslashes to span multiple lines.
flightData2015 = (
    spark.read
    .option('inferSchema', "true")
    .option("header", 'true')
    .csv("data/flight-data/csv/2015-summary.csv")
)

In [9]:
# Print the built-in documentation of DataFrame.createOrReplaceTempView before using it.
help(flightData2015.createOrReplaceTempView)

Help on method createOrReplaceTempView in module pyspark.sql.dataframe:

createOrReplaceTempView(name: str) -> None method of pyspark.sql.dataframe.DataFrame instance
    Creates or replaces a local temporary view with this :class:`DataFrame`.
    
    The lifetime of this temporary table is tied to the :class:`SparkSession`
    that was used to create this :class:`DataFrame`.
    
    .. versionadded:: 2.0.0
    
    Examples
    --------
    >>> df.createOrReplaceTempView("people")
    >>> df2 = df.filter(df.age > 3)
    >>> df2.createOrReplaceTempView("people")
    >>> df3 = spark.sql("select * from people")
    >>> sorted(df3.collect()) == sorted(df2.collect())
    True
    >>> spark.catalog.dropTempView("people")
    True



In [10]:
# Register the DataFrame as a temporary view named "flight_data_2015" so that it can be
# queried by name through spark.sql(); the view lives as long as this SparkSession.
flightData2015.createOrReplaceTempView("flight_data_2015")

### spark 쿼리문 비교
- SQL 방식과 DataFrame 방식 둘 다 동일한 실행 계획(physical plan)으로 컴파일됨

In [11]:
# SQL route: count the rows per destination country through the temporary view.
sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME

""")

# DataFrame route: the same aggregation written with the DataFrame API.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

# Print the physical plan of each query (SQL first, then DataFrame) for comparison.
for plan_source in (sqlWay, dataFrameWay):
    plan_source.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#27, 200), ENSURE_REQUIREMENTS, [id=#71]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#27] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/REDTABLE/OneDrive/KyungJun/git_kkj214/스파크_공부 (PySpark)/..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#27, 200), ENSURE_REQUIREMENTS, [id=#84]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#27] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFi

In [12]:
# Import Spark's column aggregate under an alias: a bare `import max` would replace
# Python's built-in max() for every cell executed afterwards in this kernel.
from pyspark.sql.functions import max as spark_max

# DataFrame API: compute the largest value of the "count" column.
# take(1) returns the result as a list holding a single Row; the output column is
# still named "max(count)".
flightData2015.select(spark_max("count")).take(1)

[Row(max(count)=370002)]

- spark.sql에서 쿼리문법을 사용한 결과와, flightData2015 자체에 groupby, sum 등 연산을 적용한 결과 역시 동일

In [13]:
from pyspark.sql.functions import desc

# SQL route: total "count" per destination country, top 7 in descending order.
maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 7
""")

maxSql.show()

# DataFrame route: the same query as a method chain.
# groupBy().sum("count") names its result column "sum(count)", so it is renamed to
# match the alias used in the SQL version before sorting.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(7)
    .show()
)

+------------------+-----------------+
| DEST_COUNTRY_NAME|destination_total|
+------------------+-----------------+
|     United States|           411352|
|            Canada|             8399|
|            Mexico|             7140|
|    United Kingdom|             2025|
|             Japan|             1548|
|           Germany|             1468|
|Dominican Republic|             1353|
+------------------+-----------------+

+------------------+-----------------+
| DEST_COUNTRY_NAME|destination_total|
+------------------+-----------------+
|     United States|           411352|
|            Canada|             8399|
|            Mexico|             7140|
|    United Kingdom|             2025|
|             Japan|             1548|
|           Germany|             1468|
|Dominican Republic|             1353|
+------------------+-----------------+



### 실행계획
- .explain()을 통해 스파크가 쿼리를 어떤 단계로 실행하는지 보여 주는 물리적 실행 계획(physical plan)을 확인

In [14]:
# Print the physical plan for the "top 5 destinations by total count" query.
# Nothing is computed here: explain() only describes how Spark would execute the chain.
# `desc` is expected to be in scope from an earlier cell.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#109L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#27,destination_total#109L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[sum(count#29)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#27, 200), ENSURE_REQUIREMENTS, [id=#252]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#27], functions=[partial_sum(count#29)])
            +- FileScan csv [DEST_COUNTRY_NAME#27,count#29] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/REDTABLE/OneDrive/KyungJun/git_kkj214/스파크_공부 (PySpark)/..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


