# **pyspark 패키지를 활용한 Spark 프로그래밍**
## SparkSession 객체 생성

In [10]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[2]") \
                    .appName('sparkedu') \
                    .getOrCreate()
spark

![spark1](images/spark1.png)

## 리스트객체로 RDD 객체 생성하기

### RDD(Resilient Distributed Dataset)
#### read-only 데이터셋으로서 다양한 머신에 데이터셋의 멀티셋(중복을 허용)을 분산해두고 특정한 머신에 문제가 생기더라도 문제없이 읽을수로 있도록 지원한다

- MapReduce 작업
- 분산하여 병렬적 처리
- 빠른 연산
- 불변(Immutable)
- Transformation 과 Action 으로 함수 종류가 나눠지며, Action 함수가 실행됐을 때 실제 연산
- Lineage 를 통해 Fault Tolerant(내고장성) 보장

![spark2](images/spark2.png)

In [12]:
dataList = [("Java", 20000), ("Python", 100000), ("Scala", 3000)]
rdd=spark.sparkContext.parallelize(dataList)
print(rdd)
print(type(rdd))
print(rdd.collect())

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:262
<class 'pyspark.rdd.RDD'>
[('Java', 20000), ('Python', 100000), ('Scala', 3000)]


In [13]:
import numpy as np
lst=np.random.randint(0,10,20)
rdd=spark.sparkContext.parallelize(lst)
print(type(rdd))
print(rdd.collect())
print(rdd.count())

<class 'pyspark.rdd.RDD'>
[2, 9, 7, 1, 9, 8, 7, 4, 3, 7, 9, 1, 0, 3, 9, 1, 2, 0, 2, 5]
20


## 텍스트 파일 내용 읽어서 RDD 객체 생성하기

In [14]:
rdd = spark.read.text("data/korean_stopwords.txt")
print(type(rdd))
print(rdd.collect())

<class 'pyspark.sql.dataframe.DataFrame'>
[Row(value='아'), Row(value='휴'), Row(value='아이구'), Row(value='아이쿠'), Row(value='아이고'), Row(value='어'), Row(value='나'), Row(value='우리'), Row(value='저희'), Row(value='따라'), Row(value='의해'), Row(value='을'), Row(value='를'), Row(value='에'), Row(value='의'), Row(value='가'), Row(value='으로'), Row(value='로'), Row(value='에게'), Row(value='뿐이다'), Row(value='의거하여'), Row(value='근거하여'), Row(value='입각하여'), Row(value='기준으로'), Row(value='예하면'), Row(value='예를 들면'), Row(value='예를 들자면'), Row(value='저'), Row(value='소인'), Row(value='소생'), Row(value='저희'), Row(value='지말고'), Row(value='하지마'), Row(value='하지마라'), Row(value='다른'), Row(value='물론'), Row(value='또한'), Row(value='그리고'), Row(value='비길수 없다'), Row(value='해서는 안된다'), Row(value='뿐만 아니라'), Row(value='만이 아니다'), Row(value='만은 아니다'), Row(value='막론하고'), Row(value='관계없이'), Row(value='그치지 않다'), Row(value='그러나'), Row(value='그런데'), Row(value='하지만'), Row(value='든간에'), Row(value='논하지 않다'), Row(value='따지지 않다'), Row(value='설사'), R

In [15]:
rdd = spark.sparkContext.textFile("data/korean_stopwords.txt")
print(type(rdd))
print(rdd.collect())

<class 'pyspark.rdd.RDD'>
['아', '휴', '아이구', '아이쿠', '아이고', '어', '나', '우리', '저희', '따라', '의해', '을', '를', '에', '의', '가', '으로', '로', '에게', '뿐이다', '의거하여', '근거하여', '입각하여', '기준으로', '예하면', '예를 들면', '예를 들자면', '저', '소인', '소생', '저희', '지말고', '하지마', '하지마라', '다른', '물론', '또한', '그리고', '비길수 없다', '해서는 안된다', '뿐만 아니라', '만이 아니다', '만은 아니다', '막론하고', '관계없이', '그치지 않다', '그러나', '그런데', '하지만', '든간에', '논하지 않다', '따지지 않다', '설사', '비록', '더라도', '아니면', '만 못하다', '하는 편이 낫다', '불문하고', '향하여', '향해서', '향하다', '쪽으로', '틈타', '이용하여', '타다', '오르다', '제외하고', '이 외에', '이 밖에', '하여야', '비로소', '한다면 몰라도', '외에도', '이곳', '여기', '부터', '기점으로', '따라서', '할 생각이다', '하려고하다', '이리하여', '그리하여', '그렇게 함으로써', '하지만', '일때', '할때', '앞에서', '중에서', '보는데서', '으로써', '로써', '까지', '해야한다', '일것이다', '반드시', '할줄알다', '할수있다', '할수있어', '임에 틀림없다', '한다면', '등', '등등', '제', '겨우', '단지', '다만', '할뿐', '딩동', '댕그', '대해서', '대하여', '대하면', '훨씬', '얼마나', '얼마만큼', '얼마큼', '남짓', '여', '얼마간', '약간', '다소', '좀', '조금', '다수', '몇', '얼마', '지만', '하물며', '또한', '그러나', '그렇지만', '하지만', '이외에도', '대해 말하자면', '뿐이다', '다음에', '반

## 생성한 RDD 객체 Spark DataFrame 으로 변환하기

### Spark DataFrame
- DataFrame은 명명 된 열로 구성된 데이터 세트 
- 개념적으로는 관계형 데이터베이스의 테이블 또는 R / Python의 데이터 프레임과 동일하지만 내부적으로 더욱  최적화가 있음
- RDB Table처럼 Schema를 가지고 있고 RDB의 Table 연산이 가능
- 구조화 된 데이터 파일, Hive의 테이블, 외부 데이터베이스 또는 기존 RDD와 같은 다양한 소스 에서 구성 할 수 있늠 
- DataFrame API는 Scala, Java, Python 및 R 에서 사용할 수 있음
- SparkSQL을 통해 사용 가능

In [16]:
dept = [("Finance",10), 
        ("Marketing",20), 
        ("Sales",30), 
        ("IT",40) 
      ]
rdd = spark.sparkContext.parallelize(dept)
print(rdd.collect())

[('Finance', 10), ('Marketing', 20), ('Sales', 30), ('IT', 40)]


In [17]:
df = rdd.toDF()
df.printSchema()
df.show()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)

+---------+---+
|       _1| _2|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+



In [18]:
deptColumns = ["dept_name","dept_id"]
df2 = rdd.toDF(deptColumns)
df2.printSchema()
df2.show(truncate=False)

root
 |-- dept_name: string (nullable = true)
 |-- dept_id: long (nullable = true)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [19]:
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"]
df = spark.createDataFrame(data=data, schema = columns)
print(type(df))
print(df)
df.printSchema()
df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
DataFrame[firstname: string, middlename: string, lastname: string, dob: string, gender: string, salary: bigint]
root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [20]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
data2 = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]

schema = StructType([ \
    StructField("firstname",StringType(),True), \
    StructField("middlename",StringType(),True), \
    StructField("lastname",StringType(),True), \
    StructField("id", StringType(), True), \
    StructField("gender", StringType(), True), \
    StructField("salary", IntegerType(), True) \
  ])
 
df = spark.createDataFrame(data=data2,schema=schema)
df.printSchema()
df.show(truncate=False)

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: integer (nullable = true)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



## CSV 파일 내용 읽어서 DataFrame 객체 생성하기

In [21]:
df = spark.read.csv("data/emp.csv")
df.printSchema()
df.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)

+-----+------+---------+----+----------+----+----+------+
|  _c0|   _c1|      _c2| _c3|       _c4| _c5| _c6|   _c7|
+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-03|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|1981-03-02|2975|null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-10-22|1250|1400|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|null|    30|
| 7782| CLARK|  MANAGER|7839|1981-09-06|2450|null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-08|3000|null|    20|
| 7839|  KING|PRES

In [22]:
emp = spark.read.csv("data/emp.csv", header=True)
emp.printSchema()
emp.show()

root
 |-- empno: string (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: string (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: string (nullable = true)
 |-- comm: string (nullable = true)
 |-- deptno: string (nullable = true)

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-03|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|1981-03-02|2975|null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-10-22|1250|1400|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|null|    30|
| 7782| CLARK|  MANAGER|7839|1981-09-06|2450|null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-08|3000|null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000|null|    10|
| 784

In [23]:
emp = spark.read.csv("data/emp.csv", header=True, inferSchema=True)
emp.printSchema()
emp.show()

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = true)
 |-- deptno: integer (nullable = true)

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-03|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|1981-03-02|2975|null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-10-22|1250|1400|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|null|    30|
| 7782| CLARK|  MANAGER|7839|1981-09-06|2450|null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-08|3000|null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|5000|null|    10|


In [24]:
df = spark.read.csv("data/mpgdata.csv")
df.printSchema()
df.show()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)

+---+---------+------------+----------+------+------------+----------+
|_c0|      _c1|         _c2|       _c3|   _c4|         _c5|       _c6|
+---+---------+------------+----------+------+------------+----------+
|mpg|cylinders|displacement|horsepower|weight|acceleration|model-year|
| 18|        8|         307|       130|  3504|          12|        70|
| 15|        8|         350|       165|  3693|        11.5|        70|
| 18|        8|         318|       150|  3436|          11|        70|
| 16|        8|         304|       150|  3433|          12|        70|
| 17|        8|         302|       140|  3449|        10.5|        70|
| 15|        8|         429|       198|  4341|          10|        70|
| 14|        8|         454|       220|

In [25]:
df = spark.read.load("data/iris.csv",
                     format="csv", sep=",", inferSchema=True, header=True)
df.printSchema()
df.show()

root
 |-- sepal.length: double (nullable = true)
 |-- sepal.width: double (nullable = true)
 |-- petal.length: double (nullable = true)
 |-- petal.width: double (nullable = true)
 |-- variety: string (nullable = true)

+------------+-----------+------------+-----------+-------+
|sepal.length|sepal.width|petal.length|petal.width|variety|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| Setosa|
|         4.9|        3.0|         1.4|        0.2| Setosa|
|         4.7|        3.2|         1.3|        0.2| Setosa|
|         4.6|        3.1|         1.5|        0.2| Setosa|
|         5.0|        3.6|         1.4|        0.2| Setosa|
|         5.4|        3.9|         1.7|        0.4| Setosa|
|         4.6|        3.4|         1.4|        0.3| Setosa|
|         5.0|        3.4|         1.5|        0.2| Setosa|
|         4.4|        2.9|         1.4|        0.2| Setosa|
|         4.9|        3.1|         1.5|        0.1| Setosa|
|

## JSON 파일 내용 읽어서 DataFrame 객체 생성하기

In [26]:
df = spark.read.json("data/seoul_geo.json")
df.show()

+--------------------+--------------------+-----------------------+-------+
|     _corrupt_record|            geometry|             properties|   type|
+--------------------+--------------------+-----------------------+-------+
|                   {|                null|                   null|   null|
|"type": "FeatureC...|                null|                   null|   null|
|       "features": [|                null|                   null|   null|
|                null|[[[[127.115195849...|[2013, 11250, 강동구...|Feature|
|                null|[[[[127.069069813...|[2013, 11240, 송파구...|Feature|
|                null|[[[[127.058673592...|[2013, 11230, 강남구...|Feature|
|                null|[[[[127.013971196...|[2013, 11220, 서초구...|Feature|
|                null|[[[[126.961089890...|[2013, 11210, 관악구...|Feature|
|                null|[[[[126.982238079...|[2013, 11200, 동작구...|Feature|
|                null|[[[[126.891846638...|[2013, 11190, 영등포...|Feature|
|                null|[[[[126.901

In [27]:
df = spark.read.load("data/seoul_geo.json", format="json")
df.show()

+--------------------+--------------------+-----------------------+-------+
|     _corrupt_record|            geometry|             properties|   type|
+--------------------+--------------------+-----------------------+-------+
|                   {|                null|                   null|   null|
|"type": "FeatureC...|                null|                   null|   null|
|       "features": [|                null|                   null|   null|
|                null|[[[[127.115195849...|[2013, 11250, 강동구...|Feature|
|                null|[[[[127.069069813...|[2013, 11240, 송파구...|Feature|
|                null|[[[[127.058673592...|[2013, 11230, 강남구...|Feature|
|                null|[[[[127.013971196...|[2013, 11220, 서초구...|Feature|
|                null|[[[[126.961089890...|[2013, 11210, 관악구...|Feature|
|                null|[[[[126.982238079...|[2013, 11200, 동작구...|Feature|
|                null|[[[[126.891846638...|[2013, 11190, 영등포...|Feature|
|                null|[[[[126.901

## 파케이 파일 내용 읽어서 DataFrame 객체 생성하기

In [29]:
df = spark.read.load("data/userdata1.parquet")
df = df.select("first_name", "last_name", "email")
df.show()

+----------+---------+--------------------+
|first_name|last_name|               email|
+----------+---------+--------------------+
|    Amanda|   Jordan|    ajordan0@com.com|
|    Albert|  Freeman|     afreeman1@is.gd|
|    Evelyn|   Morgan|emorgan2@altervis...|
|    Denise|    Riley|    driley3@gmpg.org|
|    Carlos|    Burns|cburns4@miitbeian...|
|   Kathryn|    White|  kwhite5@google.com|
|    Samuel|   Holmes|sholmes6@foxnews.com|
|     Harry|   Howell| hhowell7@eepurl.com|
|      Jose|   Foster|   jfoster8@yelp.com|
|     Emily|  Stewart|estewart9@opensou...|
|     Susan|  Perkins| sperkinsa@patch.com|
|     Alice|    Berry|aberryb@wikipedia...|
|    Justin|    Berry|jberryc@usatoday.com|
|     Kathy| Reynolds|kreynoldsd@redcro...|
|   Dorothy|   Hudson|dhudsone@blogger.com|
|     Bruce|   Willis|bwillisf@bluehost...|
|     Emily|  Andrews|eandrewsg@cornell...|
|   Stephen|  Wallace|swallaceh@netvibe...|
|  Clarence|   Lawson|clawsoni@vkontakt...|
|   Rebecca|     Bell| rbellj@ba

## 직접 만든 DataFrame 객체 생성하여 정보 출력하기

In [31]:
data = [("James","","Smith","36636","M",60000),
        ("Michael","Rose","","40288","M",70000),
        ("Robert","","Williams","42114","",400000),
        ("Maria","Anne","Jones","39192","F",500000),
        ("Jen","Mary","Brown","","F",0)]

columns = ["first_name","middle_name","last_name","dob","gender","salary"]
pysparkDF = spark.createDataFrame(data = data, schema = columns)
pysparkDF.printSchema()
pysparkDF.show(truncate=False)

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|dob  |gender|salary|
+----------+-----------+---------+-----+------+------+
|James     |           |Smith    |36636|M     |60000 |
|Michael   |Rose       |         |40288|M     |70000 |
|Robert    |           |Williams |42114|      |400000|
|Maria     |Anne       |Jones    |39192|F     |500000|
|Jen       |Mary       |Brown    |     |F     |0     |
+----------+-----------+---------+-----+------+------+



## Spark의 DataFrame 객체를 Pandas의 DataFrame 객체로 변환하기

In [32]:
pandasDF = pysparkDF.toPandas()
print(type(pandasDF))
print(pandasDF)

<class 'pandas.core.frame.DataFrame'>
  first_name middle_name last_name    dob gender  salary
0      James                 Smith  36636      M   60000
1    Michael        Rose            40288      M   70000
2     Robert              Williams  42114         400000
3      Maria        Anne     Jones  39192      F  500000
4        Jen        Mary     Brown             F       0


## select()

In [33]:
emp1 = emp.select("empno", "ename", "hiredate", "sal")
print(type(emp1))
emp1.show()

<class 'pyspark.sql.dataframe.DataFrame'>
+-----+------+----------+----+
|empno| ename|  hiredate| sal|
+-----+------+----------+----+
| 7369| SMITH|1980-12-17| 800|
| 7499| ALLEN|1981-02-20|1600|
| 7521|  WARD|1981-02-03|1250|
| 7566| JONES|1981-03-02|2975|
| 7654|MARTIN|1981-10-22|1250|
| 7698| BLAKE|1981-05-01|2850|
| 7782| CLARK|1981-09-06|2450|
| 7788| SCOTT|1982-12-08|3000|
| 7839|  KING|1981-11-17|5000|
| 7844|TURNER|1984-10-08|1500|
| 7876| ADAMS|1983-01-12|1100|
| 7900| JAMES|1981-12-03| 950|
| 7902|  FORD|1981-12-13|3000|
| 7934|MILLER|1982-01-25|1300|
+-----+------+----------+----+



In [34]:
emp.select(emp.empno,emp.ename,emp.hiredate, emp.sal).show()

+-----+------+----------+----+
|empno| ename|  hiredate| sal|
+-----+------+----------+----+
| 7369| SMITH|1980-12-17| 800|
| 7499| ALLEN|1981-02-20|1600|
| 7521|  WARD|1981-02-03|1250|
| 7566| JONES|1981-03-02|2975|
| 7654|MARTIN|1981-10-22|1250|
| 7698| BLAKE|1981-05-01|2850|
| 7782| CLARK|1981-09-06|2450|
| 7788| SCOTT|1982-12-08|3000|
| 7839|  KING|1981-11-17|5000|
| 7844|TURNER|1984-10-08|1500|
| 7876| ADAMS|1983-01-12|1100|
| 7900| JAMES|1981-12-03| 950|
| 7902|  FORD|1981-12-13|3000|
| 7934|MILLER|1982-01-25|1300|
+-----+------+----------+----+



In [85]:
# Using col function
from pyspark.sql.functions import col
df.select(col("empno"),col("ename"),col("hiredate"),col("sal")).show()

AnalysisException: cannot resolve '`empno`' given input columns: [email, first_name, last_name];;
'Project ['empno, 'ename, 'hiredate, 'sal]
+- Project [first_name#514, last_name#515, email#516]
   +- Relation[registration_dttm#512,id#513,first_name#514,last_name#515,email#516,gender#517,ip_address#518,cc#519,country#520,birthdate#521,salary#522,title#523,comments#524] parquet


## collect()

In [36]:
dataCollect = emp.collect()
print(type(dataCollect))
print("----------------------------")
print(dataCollect)
print("----------------------------")
display(dataCollect)

<class 'list'>
----------------------------
[Row(empno=7369, ename='SMITH', job='CLERK', mgr=7902, hiredate='1980-12-17', sal=800, comm=None, deptno=20), Row(empno=7499, ename='ALLEN', job='SALESMAN', mgr=7698, hiredate='1981-02-20', sal=1600, comm=300, deptno=30), Row(empno=7521, ename='WARD', job='SALESMAN', mgr=7698, hiredate='1981-02-03', sal=1250, comm=500, deptno=30), Row(empno=7566, ename='JONES', job='MANAGER', mgr=7839, hiredate='1981-03-02', sal=2975, comm=None, deptno=20), Row(empno=7654, ename='MARTIN', job='SALESMAN', mgr=7698, hiredate='1981-10-22', sal=1250, comm=1400, deptno=30), Row(empno=7698, ename='BLAKE', job='MANAGER', mgr=7839, hiredate='1981-05-01', sal=2850, comm=None, deptno=30), Row(empno=7782, ename='CLARK', job='MANAGER', mgr=7839, hiredate='1981-09-06', sal=2450, comm=None, deptno=10), Row(empno=7788, ename='SCOTT', job='ANALYST', mgr=7566, hiredate='1982-12-08', sal=3000, comm=None, deptno=20), Row(empno=7839, ename='KING', job='PRESIDENT', mgr=None, hire

[Row(empno=7369, ename='SMITH', job='CLERK', mgr=7902, hiredate='1980-12-17', sal=800, comm=None, deptno=20),
 Row(empno=7499, ename='ALLEN', job='SALESMAN', mgr=7698, hiredate='1981-02-20', sal=1600, comm=300, deptno=30),
 Row(empno=7521, ename='WARD', job='SALESMAN', mgr=7698, hiredate='1981-02-03', sal=1250, comm=500, deptno=30),
 Row(empno=7566, ename='JONES', job='MANAGER', mgr=7839, hiredate='1981-03-02', sal=2975, comm=None, deptno=20),
 Row(empno=7654, ename='MARTIN', job='SALESMAN', mgr=7698, hiredate='1981-10-22', sal=1250, comm=1400, deptno=30),
 Row(empno=7698, ename='BLAKE', job='MANAGER', mgr=7839, hiredate='1981-05-01', sal=2850, comm=None, deptno=30),
 Row(empno=7782, ename='CLARK', job='MANAGER', mgr=7839, hiredate='1981-09-06', sal=2450, comm=None, deptno=10),
 Row(empno=7788, ename='SCOTT', job='ANALYST', mgr=7566, hiredate='1982-12-08', sal=3000, comm=None, deptno=20),
 Row(empno=7839, ename='KING', job='PRESIDENT', mgr=None, hiredate='1981-11-17', sal=5000, comm=No

In [37]:
emp.printSchema()

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = true)
 |-- deptno: integer (nullable = true)



## withColumn()

In [39]:
newemp = emp.withColumn("deptno",col("deptno").cast("Integer"))
newemp.printSchema()

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = true)
 |-- deptno: integer (nullable = true)



In [40]:
newemp = newemp.withColumn("sal",col("sal")*100)
newemp.show()

+-----+------+---------+----+----------+------+----+------+
|empno| ename|      job| mgr|  hiredate|   sal|comm|deptno|
+-----+------+---------+----+----------+------+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 80000|null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|160000| 300|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-03|125000| 500|    30|
| 7566| JONES|  MANAGER|7839|1981-03-02|297500|null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-10-22|125000|1400|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|285000|null|    30|
| 7782| CLARK|  MANAGER|7839|1981-09-06|245000|null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-08|300000|null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|500000|null|    10|
| 7844|TURNER| SALESMAN|7698|1984-10-08|150000|null|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|110000|null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 95000|null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-13|300000|null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-25|

## withColumnRenamed()

In [42]:
newemp = newemp.withColumnRenamed("sal","salary")
newemp.show()

+-----+------+---------+----+----------+------+----+------+
|empno| ename|      job| mgr|  hiredate|salary|comm|deptno|
+-----+------+---------+----+----------+------+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 80000|null|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|160000| 300|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-03|125000| 500|    30|
| 7566| JONES|  MANAGER|7839|1981-03-02|297500|null|    20|
| 7654|MARTIN| SALESMAN|7698|1981-10-22|125000|1400|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|285000|null|    30|
| 7782| CLARK|  MANAGER|7839|1981-09-06|245000|null|    10|
| 7788| SCOTT|  ANALYST|7566|1982-12-08|300000|null|    20|
| 7839|  KING|PRESIDENT|null|1981-11-17|500000|null|    10|
| 7844|TURNER| SALESMAN|7698|1984-10-08|150000|null|    30|
| 7876| ADAMS|    CLERK|7788|1983-01-12|110000|null|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 95000|null|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-13|300000|null|    20|
| 7934|MILLER|    CLERK|7782|1982-01-25|

In [43]:
newemp = newemp.withColumnRenamed("mgr","manager") \
    .withColumnRenamed("ename","empname")
newemp.show()

+-----+-------+---------+-------+----------+------+----+------+
|empno|empname|      job|manager|  hiredate|salary|comm|deptno|
+-----+-------+---------+-------+----------+------+----+------+
| 7369|  SMITH|    CLERK|   7902|1980-12-17| 80000|null|    20|
| 7499|  ALLEN| SALESMAN|   7698|1981-02-20|160000| 300|    30|
| 7521|   WARD| SALESMAN|   7698|1981-02-03|125000| 500|    30|
| 7566|  JONES|  MANAGER|   7839|1981-03-02|297500|null|    20|
| 7654| MARTIN| SALESMAN|   7698|1981-10-22|125000|1400|    30|
| 7698|  BLAKE|  MANAGER|   7839|1981-05-01|285000|null|    30|
| 7782|  CLARK|  MANAGER|   7839|1981-09-06|245000|null|    10|
| 7788|  SCOTT|  ANALYST|   7566|1982-12-08|300000|null|    20|
| 7839|   KING|PRESIDENT|   null|1981-11-17|500000|null|    10|
| 7844| TURNER| SALESMAN|   7698|1984-10-08|150000|null|    30|
| 7876|  ADAMS|    CLERK|   7788|1983-01-12|110000|null|    20|
| 7900|  JAMES|    CLERK|   7698|1981-12-03| 95000|null|    30|
| 7902|   FORD|  ANALYST|   7566|1981-12

## filter() - where() 와 동일

In [46]:
emp.filter(emp.ename == "KING").show(truncate=False)

+-----+-----+---------+----+----------+----+----+------+
|empno|ename|job      |mgr |hiredate  |sal |comm|deptno|
+-----+-----+---------+----+----------+----+----+------+
|7839 |KING |PRESIDENT|null|1981-11-17|5000|null|10    |
+-----+-----+---------+----+----------+----+----+------+



In [47]:
emp.filter('ename == "KING"').show(truncate=False)

+-----+-----+---------+----+----------+----+----+------+
|empno|ename|job      |mgr |hiredate  |sal |comm|deptno|
+-----+-----+---------+----+----------+----+----+------+
|7839 |KING |PRESIDENT|null|1981-11-17|5000|null|10    |
+-----+-----+---------+----+----------+----+----+------+



In [48]:
emp.filter((emp.deptno == 30) & (emp.sal >= 1500)).show(truncate=False)

+-----+------+--------+----+----------+----+----+------+
|empno|ename |job     |mgr |hiredate  |sal |comm|deptno|
+-----+------+--------+----+----------+----+----+------+
|7499 |ALLEN |SALESMAN|7698|1981-02-20|1600|300 |30    |
|7698 |BLAKE |MANAGER |7839|1981-05-01|2850|null|30    |
|7844 |TURNER|SALESMAN|7698|1984-10-08|1500|null|30    |
+-----+------+--------+----+----------+----+----+------+



In [49]:
emp.where((emp.deptno == 30) & (emp.sal >= 1500)).show(truncate=False)

+-----+------+--------+----+----------+----+----+------+
|empno|ename |job     |mgr |hiredate  |sal |comm|deptno|
+-----+------+--------+----+----------+----+----+------+
|7499 |ALLEN |SALESMAN|7698|1981-02-20|1600|300 |30    |
|7698 |BLAKE |MANAGER |7839|1981-05-01|2850|null|30    |
|7844 |TURNER|SALESMAN|7698|1984-10-08|1500|null|30    |
+-----+------+--------+----+----------+----+----+------+



## drop (), distinct(), dropDuplicates()

In [50]:
empnew = emp.select("job", "deptno")
empnew.show()

+---------+------+
|      job|deptno|
+---------+------+
|    CLERK|    20|
| SALESMAN|    30|
| SALESMAN|    30|
|  MANAGER|    20|
| SALESMAN|    30|
|  MANAGER|    30|
|  MANAGER|    10|
|  ANALYST|    20|
|PRESIDENT|    10|
| SALESMAN|    30|
|    CLERK|    20|
|    CLERK|    30|
|  ANALYST|    20|
|    CLERK|    10|
+---------+------+



In [51]:
empnew.distinct().show()

+---------+------+
|      job|deptno|
+---------+------+
|  ANALYST|    20|
|  MANAGER|    10|
|  MANAGER|    30|
|PRESIDENT|    10|
|    CLERK|    20|
| SALESMAN|    30|
|    CLERK|    10|
|  MANAGER|    20|
|    CLERK|    30|
+---------+------+



In [52]:
empnew.dropDuplicates().show()

+---------+------+
|      job|deptno|
+---------+------+
|  ANALYST|    20|
|  MANAGER|    10|
|  MANAGER|    30|
|PRESIDENT|    10|
|    CLERK|    20|
| SALESMAN|    30|
|    CLERK|    10|
|  MANAGER|    20|
|    CLERK|    30|
+---------+------+



In [53]:
empnew.drop("deptno").show()

+---------+
|      job|
+---------+
|    CLERK|
| SALESMAN|
| SALESMAN|
|  MANAGER|
| SALESMAN|
|  MANAGER|
|  MANAGER|
|  ANALYST|
|PRESIDENT|
| SALESMAN|
|    CLERK|
|    CLERK|
|  ANALYST|
|    CLERK|
+---------+



## orderBy(), sort()

In [54]:
emp.sort("sal").show(truncate=False)

+-----+------+---------+----+----------+----+----+------+
|empno|ename |job      |mgr |hiredate  |sal |comm|deptno|
+-----+------+---------+----+----------+----+----+------+
|7369 |SMITH |CLERK    |7902|1980-12-17|800 |null|20    |
|7900 |JAMES |CLERK    |7698|1981-12-03|950 |null|30    |
|7876 |ADAMS |CLERK    |7788|1983-01-12|1100|null|20    |
|7654 |MARTIN|SALESMAN |7698|1981-10-22|1250|1400|30    |
|7521 |WARD  |SALESMAN |7698|1981-02-03|1250|500 |30    |
|7934 |MILLER|CLERK    |7782|1982-01-25|1300|null|10    |
|7844 |TURNER|SALESMAN |7698|1984-10-08|1500|null|30    |
|7499 |ALLEN |SALESMAN |7698|1981-02-20|1600|300 |30    |
|7782 |CLARK |MANAGER  |7839|1981-09-06|2450|null|10    |
|7698 |BLAKE |MANAGER  |7839|1981-05-01|2850|null|30    |
|7566 |JONES |MANAGER  |7839|1981-03-02|2975|null|20    |
|7788 |SCOTT |ANALYST  |7566|1982-12-08|3000|null|20    |
|7902 |FORD  |ANALYST  |7566|1981-12-13|3000|null|20    |
|7839 |KING  |PRESIDENT|null|1981-11-17|5000|null|10    |
+-----+------+

In [55]:
emp.sort(emp.sal.desc()).show(truncate=False)

+-----+------+---------+----+----------+----+----+------+
|empno|ename |job      |mgr |hiredate  |sal |comm|deptno|
+-----+------+---------+----+----------+----+----+------+
|7839 |KING  |PRESIDENT|null|1981-11-17|5000|null|10    |
|7788 |SCOTT |ANALYST  |7566|1982-12-08|3000|null|20    |
|7902 |FORD  |ANALYST  |7566|1981-12-13|3000|null|20    |
|7566 |JONES |MANAGER  |7839|1981-03-02|2975|null|20    |
|7698 |BLAKE |MANAGER  |7839|1981-05-01|2850|null|30    |
|7782 |CLARK |MANAGER  |7839|1981-09-06|2450|null|10    |
|7499 |ALLEN |SALESMAN |7698|1981-02-20|1600|300 |30    |
|7844 |TURNER|SALESMAN |7698|1984-10-08|1500|null|30    |
|7934 |MILLER|CLERK    |7782|1982-01-25|1300|null|10    |
|7654 |MARTIN|SALESMAN |7698|1981-10-22|1250|1400|30    |
|7521 |WARD  |SALESMAN |7698|1981-02-03|1250|500 |30    |
|7876 |ADAMS |CLERK    |7788|1983-01-12|1100|null|20    |
|7900 |JAMES |CLERK    |7698|1981-12-03|950 |null|30    |
|7369 |SMITH |CLERK    |7902|1980-12-17|800 |null|20    |
+-----+------+

In [56]:
emp.sort("deptno", "sal").show(truncate=False)

+-----+------+---------+----+----------+----+----+------+
|empno|ename |job      |mgr |hiredate  |sal |comm|deptno|
+-----+------+---------+----+----------+----+----+------+
|7934 |MILLER|CLERK    |7782|1982-01-25|1300|null|10    |
|7782 |CLARK |MANAGER  |7839|1981-09-06|2450|null|10    |
|7839 |KING  |PRESIDENT|null|1981-11-17|5000|null|10    |
|7369 |SMITH |CLERK    |7902|1980-12-17|800 |null|20    |
|7876 |ADAMS |CLERK    |7788|1983-01-12|1100|null|20    |
|7566 |JONES |MANAGER  |7839|1981-03-02|2975|null|20    |
|7902 |FORD  |ANALYST  |7566|1981-12-13|3000|null|20    |
|7788 |SCOTT |ANALYST  |7566|1982-12-08|3000|null|20    |
|7900 |JAMES |CLERK    |7698|1981-12-03|950 |null|30    |
|7654 |MARTIN|SALESMAN |7698|1981-10-22|1250|1400|30    |
|7521 |WARD  |SALESMAN |7698|1981-02-03|1250|500 |30    |
|7844 |TURNER|SALESMAN |7698|1984-10-08|1500|null|30    |
|7499 |ALLEN |SALESMAN |7698|1981-02-20|1600|300 |30    |
|7698 |BLAKE |MANAGER  |7839|1981-05-01|2850|null|30    |
+-----+------+

In [57]:
emp.sort(emp.deptno.desc(), emp.sal.desc()).show(truncate=False)

+-----+------+---------+----+----------+----+----+------+
|empno|ename |job      |mgr |hiredate  |sal |comm|deptno|
+-----+------+---------+----+----------+----+----+------+
|7698 |BLAKE |MANAGER  |7839|1981-05-01|2850|null|30    |
|7499 |ALLEN |SALESMAN |7698|1981-02-20|1600|300 |30    |
|7844 |TURNER|SALESMAN |7698|1984-10-08|1500|null|30    |
|7654 |MARTIN|SALESMAN |7698|1981-10-22|1250|1400|30    |
|7521 |WARD  |SALESMAN |7698|1981-02-03|1250|500 |30    |
|7900 |JAMES |CLERK    |7698|1981-12-03|950 |null|30    |
|7902 |FORD  |ANALYST  |7566|1981-12-13|3000|null|20    |
|7788 |SCOTT |ANALYST  |7566|1982-12-08|3000|null|20    |
|7566 |JONES |MANAGER  |7839|1981-03-02|2975|null|20    |
|7876 |ADAMS |CLERK    |7788|1983-01-12|1100|null|20    |
|7369 |SMITH |CLERK    |7902|1980-12-17|800 |null|20    |
|7839 |KING  |PRESIDENT|null|1981-11-17|5000|null|10    |
|7782 |CLARK |MANAGER  |7839|1981-09-06|2450|null|10    |
|7934 |MILLER|CLERK    |7782|1982-01-25|1300|null|10    |
+-----+------+

In [58]:
emp.orderBy(emp.deptno.desc(), emp.sal.desc()).show(truncate=False)

+-----+------+---------+----+----------+----+----+------+
|empno|ename |job      |mgr |hiredate  |sal |comm|deptno|
+-----+------+---------+----+----------+----+----+------+
|7698 |BLAKE |MANAGER  |7839|1981-05-01|2850|null|30    |
|7499 |ALLEN |SALESMAN |7698|1981-02-20|1600|300 |30    |
|7844 |TURNER|SALESMAN |7698|1984-10-08|1500|null|30    |
|7654 |MARTIN|SALESMAN |7698|1981-10-22|1250|1400|30    |
|7521 |WARD  |SALESMAN |7698|1981-02-03|1250|500 |30    |
|7900 |JAMES |CLERK    |7698|1981-12-03|950 |null|30    |
|7902 |FORD  |ANALYST  |7566|1981-12-13|3000|null|20    |
|7788 |SCOTT |ANALYST  |7566|1982-12-08|3000|null|20    |
|7566 |JONES |MANAGER  |7839|1981-03-02|2975|null|20    |
|7876 |ADAMS |CLERK    |7788|1983-01-12|1100|null|20    |
|7369 |SMITH |CLERK    |7902|1980-12-17|800 |null|20    |
|7839 |KING  |PRESIDENT|null|1981-11-17|5000|null|10    |
|7782 |CLARK |MANAGER  |7839|1981-09-06|2450|null|10    |
|7934 |MILLER|CLERK    |7782|1982-01-25|1300|null|10    |
+-----+------+

In [59]:
emp.sort(col("hiredate").asc(),col("sal").asc()).show(truncate=False)
emp.orderBy(col("hiredate").asc(),col("sal").asc()).show(truncate=False)

+-----+------+---------+----+----------+----+----+------+
|empno|ename |job      |mgr |hiredate  |sal |comm|deptno|
+-----+------+---------+----+----------+----+----+------+
|7369 |SMITH |CLERK    |7902|1980-12-17|800 |null|20    |
|7521 |WARD  |SALESMAN |7698|1981-02-03|1250|500 |30    |
|7499 |ALLEN |SALESMAN |7698|1981-02-20|1600|300 |30    |
|7566 |JONES |MANAGER  |7839|1981-03-02|2975|null|20    |
|7698 |BLAKE |MANAGER  |7839|1981-05-01|2850|null|30    |
|7782 |CLARK |MANAGER  |7839|1981-09-06|2450|null|10    |
|7654 |MARTIN|SALESMAN |7698|1981-10-22|1250|1400|30    |
|7839 |KING  |PRESIDENT|null|1981-11-17|5000|null|10    |
|7900 |JAMES |CLERK    |7698|1981-12-03|950 |null|30    |
|7902 |FORD  |ANALYST  |7566|1981-12-13|3000|null|20    |
|7934 |MILLER|CLERK    |7782|1982-01-25|1300|null|10    |
|7788 |SCOTT |ANALYST  |7566|1982-12-08|3000|null|20    |
|7876 |ADAMS |CLERK    |7788|1983-01-12|1100|null|20    |
|7844 |TURNER|SALESMAN |7698|1984-10-08|1500|null|30    |
+-----+------+

## groupBy()

In [60]:
emp.groupBy("deptno").sum("sal").show(truncate=False)

+------+--------+
|deptno|sum(sal)|
+------+--------+
|20    |10875   |
|10    |8750    |
|30    |9400    |
+------+--------+



In [61]:
emp.groupBy("deptno").min("sal").show(truncate=False)

+------+--------+
|deptno|min(sal)|
+------+--------+
|20    |800     |
|10    |1300    |
|30    |950     |
+------+--------+



In [62]:
emp.groupBy("deptno").max("sal").show(truncate=False)

+------+--------+
|deptno|max(sal)|
+------+--------+
|20    |3000    |
|10    |5000    |
|30    |2850    |
+------+--------+



In [63]:
emp.groupBy("deptno").avg("sal").show(truncate=False)

+------+------------------+
|deptno|avg(sal)          |
+------+------------------+
|20    |2175.0            |
|10    |2916.6666666666665|
|30    |1566.6666666666667|
+------+------------------+



In [64]:
emp.groupBy("deptno").avg("sal").show(truncate=False)

+------+------------------+
|deptno|avg(sal)          |
+------+------------------+
|20    |2175.0            |
|10    |2916.6666666666665|
|30    |1566.6666666666667|
+------+------------------+



In [65]:
emp.groupBy("deptno", "job").sum("sal").show(truncate=False)

+------+---------+--------+
|deptno|job      |sum(sal)|
+------+---------+--------+
|20    |ANALYST  |6000    |
|20    |MANAGER  |2975    |
|30    |MANAGER  |2850    |
|30    |SALESMAN |5600    |
|30    |CLERK    |950     |
|20    |CLERK    |1900    |
|10    |PRESIDENT|5000    |
|10    |CLERK    |1300    |
|10    |MANAGER  |2450    |
+------+---------+--------+



In [66]:
emp.groupBy("deptno").sum("sal", "comm").show(truncate=False)

+------+--------+---------+
|deptno|sum(sal)|sum(comm)|
+------+--------+---------+
|20    |10875   |null     |
|10    |8750    |null     |
|30    |9400    |2200     |
+------+--------+---------+



In [67]:
from pyspark.sql.functions import sum,avg,max,min,mean,count
emp.groupBy("deptno").agg(sum("sal"), avg("sal"), max("sal"), min("sal"), mean("sal")).show(truncate=False)

+------+--------+------------------+--------+--------+------------------+
|deptno|sum(sal)|avg(sal)          |max(sal)|min(sal)|avg(sal)          |
+------+--------+------------------+--------+--------+------------------+
|20    |10875   |2175.0            |3000    |800     |2175.0            |
|10    |8750    |2916.6666666666665|5000    |1300    |2916.6666666666665|
|30    |9400    |1566.6666666666667|2850    |950     |1566.6666666666667|
+------+--------+------------------+--------+--------+------------------+



In [68]:
emp.groupBy("deptno") \
    .agg(sum("sal").alias("sum_salary"), \
         avg("sal").alias("avg_salary"), \
         max("sal").alias("max_salary"), \
         min("sal").alias("min_salary"), \
         mean("sal").alias("mean_salary"), \
     ) \
    .show(truncate=False)

+------+----------+------------------+----------+----------+------------------+
|deptno|sum_salary|avg_salary        |max_salary|min_salary|mean_salary       |
+------+----------+------------------+----------+----------+------------------+
|20    |10875     |2175.0            |3000      |800       |2175.0            |
|10    |8750      |2916.6666666666665|5000      |1300      |2916.6666666666665|
|30    |9400      |1566.6666666666667|2850      |950       |1566.6666666666667|
+------+----------+------------------+----------+----------+------------------+



In [69]:
emp.groupBy("deptno") \
    .agg(sum("sal").alias("sum_salary"), \
         avg("sal").alias("avg_salary"), \
         max("sal").alias("max_salary"), \
         min("sal").alias("min_salary"), \
         mean("sal").alias("mean_salary"), \
     ) \
    .where(col("sum_salary") > 9000)\
    .show(truncate=False)

+------+----------+------------------+----------+----------+------------------+
|deptno|sum_salary|avg_salary        |max_salary|min_salary|mean_salary       |
+------+----------+------------------+----------+----------+------------------+
|20    |10875     |2175.0            |3000      |800       |2175.0            |
|30    |9400      |1566.6666666666667|2850      |950       |1566.6666666666667|
+------+----------+------------------+----------+----------+------------------+



In [70]:
deptdata = [(10, '영업부', '서울'), (20, '개발부', '대전'), (30, '기획부', '서울'), (40, '마케팅부', '서울')]
deptcolname = ['deptno', 'dname', 'loc']
dept = spark.createDataFrame(data=deptdata, schema=deptcolname)
dept.show(truncate=False)

+------+--------+----+
|deptno|dname   |loc |
+------+--------+----+
|10    |영업부  |서울|
|20    |개발부  |대전|
|30    |기획부  |서울|
|40    |마케팅부|서울|
+------+--------+----+



## join()

In [71]:
emp.join(dept,emp.deptno ==  dept.deptno,"inner") \
     .show(truncate=False)

+-----+------+---------+----+----------+----+----+------+------+------+----+
|empno|ename |job      |mgr |hiredate  |sal |comm|deptno|deptno|dname |loc |
+-----+------+---------+----+----------+----+----+------+------+------+----+
|7934 |MILLER|CLERK    |7782|1982-01-25|1300|null|10    |10    |영업부|서울|
|7839 |KING  |PRESIDENT|null|1981-11-17|5000|null|10    |10    |영업부|서울|
|7782 |CLARK |MANAGER  |7839|1981-09-06|2450|null|10    |10    |영업부|서울|
|7902 |FORD  |ANALYST  |7566|1981-12-13|3000|null|20    |20    |개발부|대전|
|7876 |ADAMS |CLERK    |7788|1983-01-12|1100|null|20    |20    |개발부|대전|
|7788 |SCOTT |ANALYST  |7566|1982-12-08|3000|null|20    |20    |개발부|대전|
|7566 |JONES |MANAGER  |7839|1981-03-02|2975|null|20    |20    |개발부|대전|
|7369 |SMITH |CLERK    |7902|1980-12-17|800 |null|20    |20    |개발부|대전|
|7900 |JAMES |CLERK    |7698|1981-12-03|950 |null|30    |30    |기획부|서울|
|7844 |TURNER|SALESMAN |7698|1984-10-08|1500|null|30    |30    |기획부|서울|
|7698 |BLAKE |MANAGER  |7839|1981-05-01|2850|null

In [72]:
emp.join(dept,emp.deptno ==  dept.deptno,"right") \
     .show(truncate=False)

+-----+------+---------+----+----------+----+----+------+------+--------+----+
|empno|ename |job      |mgr |hiredate  |sal |comm|deptno|deptno|dname   |loc |
+-----+------+---------+----+----------+----+----+------+------+--------+----+
|7934 |MILLER|CLERK    |7782|1982-01-25|1300|null|10    |10    |영업부  |서울|
|7839 |KING  |PRESIDENT|null|1981-11-17|5000|null|10    |10    |영업부  |서울|
|7782 |CLARK |MANAGER  |7839|1981-09-06|2450|null|10    |10    |영업부  |서울|
|7902 |FORD  |ANALYST  |7566|1981-12-13|3000|null|20    |20    |개발부  |대전|
|7876 |ADAMS |CLERK    |7788|1983-01-12|1100|null|20    |20    |개발부  |대전|
|7788 |SCOTT |ANALYST  |7566|1982-12-08|3000|null|20    |20    |개발부  |대전|
|7566 |JONES |MANAGER  |7839|1981-03-02|2975|null|20    |20    |개발부  |대전|
|7369 |SMITH |CLERK    |7902|1980-12-17|800 |null|20    |20    |개발부  |대전|
|7900 |JAMES |CLERK    |7698|1981-12-03|950 |null|30    |30    |기획부  |서울|
|7844 |TURNER|SALESMAN |7698|1984-10-08|1500|null|30    |30    |기획부  |서울|
|7698 |BLAKE |MANAGER  

## union()

In [73]:
emp1 = emp.filter("job == 'MANAGER'").select("ename", "sal")
emp2 = emp.filter("deptno == 30").select("ename", "sal")
emp1.show()
emp2.show()
emp1.union(emp2).show()

+-----+----+
|ename| sal|
+-----+----+
|JONES|2975|
|BLAKE|2850|
|CLARK|2450|
+-----+----+

+------+----+
| ename| sal|
+------+----+
| ALLEN|1600|
|  WARD|1250|
|MARTIN|1250|
| BLAKE|2850|
|TURNER|1500|
| JAMES| 950|
+------+----+

+------+----+
| ename| sal|
+------+----+
| JONES|2975|
| BLAKE|2850|
| CLARK|2450|
| ALLEN|1600|
|  WARD|1250|
|MARTIN|1250|
| BLAKE|2850|
|TURNER|1500|
| JAMES| 950|
+------+----+



In [74]:
emp1 = emp.filter("job == 'MANAGER'").select("ename", "sal")
emp2 = emp.filter("deptno == 30").select("ename", "sal")
emp1.show()
emp2.show()
emp1.union(emp2).dropDuplicates().show()

+-----+----+
|ename| sal|
+-----+----+
|JONES|2975|
|BLAKE|2850|
|CLARK|2450|
+-----+----+

+------+----+
| ename| sal|
+------+----+
| ALLEN|1600|
|  WARD|1250|
|MARTIN|1250|
| BLAKE|2850|
|TURNER|1500|
| JAMES| 950|
+------+----+

+------+----+
| ename| sal|
+------+----+
| BLAKE|2850|
|MARTIN|1250|
|TURNER|1500|
| CLARK|2450|
| JAMES| 950|
| ALLEN|1600|
| JONES|2975|
|  WARD|1250|
+------+----+



## **map() 과 flatMap()**

### lines = [['w1',  'w2', 'w3'], ['w4', 'w5', 'w6']]
### lines를 map/flatmap을 이용하여 split하게 되면 아래와 같다.
### map: one2one mapping
###	Array(Array('w1', 'w2', 'w3'), Array('w4', 'w5', 'w6'))

### flatmap: one example → one result(flatten)
### Array('w1', 'w2', 'w3', 'w4', 'w5', 'w6')

In [75]:
data = ["둘리 또치 도우너 희동이 고길동 마이콜",
        "피카츄 꼬부기 잠만보",
        "듀크 턱시",
        "프로도 간달프 스미골",
        "코코"]
rdd=spark.sparkContext.parallelize(data)
for element in rdd.collect():
    print(element)

둘리 또치 도우너 희동이 고길동 마이콜
피카츄 꼬부기 잠만보
듀크 턱시
프로도 간달프 스미골
코코


In [76]:
rdd2=rdd.map(lambda x: x.split(" "))
rdd2.collect()

[['둘리', '또치', '도우너', '희동이', '고길동', '마이콜'],
 ['피카츄', '꼬부기', '잠만보'],
 ['듀크', '턱시'],
 ['프로도', '간달프', '스미골'],
 ['코코']]

In [77]:
rdd2=rdd.flatMap(lambda x: x.split(" "))
rdd2.collect()

['둘리',
 '또치',
 '도우너',
 '희동이',
 '고길동',
 '마이콜',
 '피카츄',
 '꼬부기',
 '잠만보',
 '듀크',
 '턱시',
 '프로도',
 '간달프',
 '스미골',
 '코코']

In [78]:
spark.sparkContext.parallelize([3,4,5]).map(lambda x: range(1,x)).collect() 

[range(1, 3), range(1, 4), range(1, 5)]

In [79]:
spark.sparkContext.parallelize([3,4,5]).flatMap(lambda x: range(1,x)).collect() 

[1, 2, 1, 2, 3, 1, 2, 3, 4]

In [80]:
spark.sparkContext.parallelize([3,4,5]).map(lambda x: [x,  x*x]).collect() 

[[3, 9], [4, 16], [5, 25]]

In [81]:
spark.sparkContext.parallelize([3,4,5]).flatMap(lambda x: [x,  x*x]).collect() 

[3, 9, 4, 16, 5, 25]

In [82]:
lines = spark.sparkContext.textFile("data/greeting.txt")
sorted(lines.flatMap(lambda line: line.split()).map(lambda w: (w,1)).reduceByKey(lambda v1, v2: v1+v2).collect())

[('Birthday', 1),
 ('Day', 1),
 ('Evening', 1),
 ('Good', 3),
 ('Happy', 2),
 ('Morning', 1),
 ('New', 1),
 ('Year', 1)]

In [83]:
rdd1 = spark.sparkContext.textFile("data/greeting.txt")
print(type(rdd1))
print(rdd1)
print(rdd1.collect())
print("------------------------------------------------------------------------------")
rdd2 = rdd1.flatMap(lambda line: line.split())
print(type(rdd2))
print(rdd2)
print(rdd2.collect())
print("------------------------------------------------------------------------------")
rdd3 = rdd2.map(lambda w: (w,1))
print(type(rdd3))
print(rdd3)      
print(rdd3.collect())
print("------------------------------------------------------------------------------")
rdd4 = rdd3.reduceByKey(lambda v1, v2: v1+v2)
print(type(rdd4))
print(rdd4)
print(rdd4.collect())
print("------------------------------------------------------------------------------")
result = rdd4.collect()
print(type(result))
print(result)

<class 'pyspark.rdd.RDD'>
data/greeting.txt MapPartitionsRDD[374] at textFile at NativeMethodAccessorImpl.java:0
['Good Morning', 'Good Evening', 'Good Day', 'Happy Birthday', 'Happy New Year']
------------------------------------------------------------------------------
<class 'pyspark.rdd.PipelinedRDD'>
PythonRDD[375] at RDD at PythonRDD.scala:53
['Good', 'Morning', 'Good', 'Evening', 'Good', 'Day', 'Happy', 'Birthday', 'Happy', 'New', 'Year']
------------------------------------------------------------------------------
<class 'pyspark.rdd.PipelinedRDD'>
PythonRDD[376] at RDD at PythonRDD.scala:53
[('Good', 1), ('Morning', 1), ('Good', 1), ('Evening', 1), ('Good', 1), ('Day', 1), ('Happy', 1), ('Birthday', 1), ('Happy', 1), ('New', 1), ('Year', 1)]
------------------------------------------------------------------------------
<class 'pyspark.rdd.PipelinedRDD'>
PythonRDD[381] at RDD at PythonRDD.scala:53
[('Good', 3), ('Morning', 1), ('Evening', 1), ('Birthday', 1), ('New', 1), ('Ye