In [2]:
from pyspark.sql import SparkSession
# Start (or reuse) a Spark session that runs locally for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("spark-sql")
    .getOrCreate()
)

In [2]:
# Load the 2015 summary JSON file into a DataFrame.
df = spark.read.json("learning_spark_data/2015-summary.json")

In [3]:
# List each column name together with its Spark SQL type name.
df.dtypes

[('DEST_COUNTRY_NAME', 'string'),
 ('ORIGIN_COUNTRY_NAME', 'string'),
 ('count', 'bigint')]

In [4]:

# Print the schema as a tree, including each column's type and nullability.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [5]:
df.take(3) # take(n) returns a Python list of Row objects (not RDDs)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [6]:
df.select('count').show(5) # use select() when the result should keep its DataFrame (tabular) shape

+-----+
|count|
+-----+
|   15|
|    1|
|  344|
|   15|
|   62|
+-----+
only showing top 5 rows



In [7]:
# distinct() removes duplicate rows
df.select('DEST_COUNTRY_NAME').distinct().show(5) 

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|         Anguilla|
|           Russia|
|         Paraguay|
|          Senegal|
|           Sweden|
+-----------------+
only showing top 5 rows



In [8]:
# cache(): mark the result to be kept in memory so later actions reuse it instead of recomputing it.
# Caching is lazy; the count() below is the action that actually runs the query.
df1 = df.select('DEST_COUNTRY_NAME').distinct().cache()
df1.count()

132

In [9]:
# Build a single record by hand with the Row class

from pyspark.sql import Row
myRow = Row('Hello', None, 1, False)

In [10]:
# Add a new column
from pyspark.sql.functions import expr

# expr() turns a SQL expression string into a Column; this one is true when origin and destination match.
is_within_country = expr('ORIGIN_COUNTRY_NAME==DEST_COUNTRY_NAME')
df3 = df.withColumn('withinCountry', is_within_country)
df3

DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint, withinCountry: boolean]

In [11]:
# Keep only the rows whose boolean withinCountry flag is true.
df3.filter(df3['withinCountry']).show()

+-----------------+-------------------+------+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|withinCountry|
+-----------------+-------------------+------+-------------+
|    United States|      United States|370002|         true|
+-----------------+-------------------+------+-------------+



In [12]:
# CASE WHEN: add a 'category' column that is 'under' when count is 10 or less, otherwise 'upper'.

# SQL expression form.
# show() returns None, so the DataFrame is assigned to df4 first and displayed as a separate step;
# chaining .show() onto the assignment would leave df4 bound to None.
df4 = df.withColumn(
    'category',
    expr("CASE WHEN count <= 10 THEN 'under' ELSE 'upper' END"),
)
df4.show()

+--------------------+-------------------+-----+--------+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|category|
+--------------------+-------------------+-----+--------+
|       United States|            Romania|   15|   upper|
|       United States|            Croatia|    1|   under|
|       United States|            Ireland|  344|   upper|
|               Egypt|      United States|   15|   upper|
|       United States|              India|   62|   upper|
|       United States|          Singapore|    1|   under|
|       United States|            Grenada|   62|   upper|
|          Costa Rica|      United States|  588|   upper|
|             Senegal|      United States|   40|   upper|
|             Moldova|      United States|    1|   under|
|       United States|       Sint Maarten|  325|   upper|
|       United States|   Marshall Islands|   39|   upper|
|              Guyana|      United States|   64|   upper|
|               Malta|      United States|    1|   under|
|            A

In [13]:
# DataFrame select(), where(), filter() -> transformations (lazy; they only describe the result)
# show(), count() -> actions (they trigger execution)

In [14]:
# Shared reader options: treat the first row as column names and infer column types from the data.
csv_options = {"header": True, "inferSchema": True}

# Load emp.csv
emp_df = spark.read.csv("learning_spark_data/emp.csv", **csv_options)

# Load dept.csv
dept_df = spark.read.csv("learning_spark_data/dept.csv", **csv_options)

In [15]:
# Check the loaded data: employees first, then departments.
for frame in (emp_df, dept_df):
    frame.show()

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250|1400|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|
| 7876| ADAMS|    CLERK|7788|1987-05-23|1100|NULL|    20|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950|NULL|    30|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|
| 9292|  JACK|

In [16]:
# Print the inferred schema of both DataFrames (types come from inferSchema at load time).
emp_df.printSchema()
dept_df.printSchema()

root
 |-- empno: integer (nullable = true)
 |-- ename: string (nullable = true)
 |-- job: string (nullable = true)
 |-- mgr: integer (nullable = true)
 |-- hiredate: date (nullable = true)
 |-- sal: integer (nullable = true)
 |-- comm: integer (nullable = true)
 |-- deptno: integer (nullable = true)

root
 |-- deptno: integer (nullable = true)
 |-- dname: string (nullable = true)
 |-- loc: string (nullable = true)



In [17]:
# Row counts of the employee and department DataFrames, displayed as a tuple.
emp_df.count(), dept_df.count()

(15, 4)

In [18]:
# Column names are matched case-insensitively here: 'ENAME' resolves to the 'ename' column
emp_df.select('ENAME', 'DEPTNO').show()

+------+------+
| ENAME|DEPTNO|
+------+------+
| SMITH|    20|
| ALLEN|    30|
|  WARD|    30|
| JONES|    20|
|MARTIN|    30|
| BLAKE|    30|
| CLARK|    10|
| SCOTT|    20|
|  KING|    10|
|TURNER|    30|
| ADAMS|    20|
| JAMES|    30|
|  FORD|    20|
|MILLER|    10|
|  JACK|    70|
+------+------+



In [19]:
# where() is equivalent to filter(); selecting '*' first is unnecessary because all columns are kept anyway.
emp_df.where('deptno=20').show()

+-----+-----+-------+----+----------+----+----+------+
|empno|ename|    job| mgr|  hiredate| sal|comm|deptno|
+-----+-----+-------+----+----------+----+----+------+
| 7369|SMITH|  CLERK|7902|1980-12-17| 800|NULL|    20|
| 7566|JONES|MANAGER|7839|1981-04-02|2975|NULL|    20|
| 7788|SCOTT|ANALYST|7566|1987-04-19|3000|NULL|    20|
| 7876|ADAMS|  CLERK|7788|1987-05-23|1100|NULL|    20|
| 7902| FORD|ANALYST|7566|1981-12-03|3000|NULL|    20|
+-----+-----+-------+----+----------+----+----+------+



In [20]:
# selectExpr() takes SQL expression strings; count(*) is reported under the column name count(1).
emp_df.selectExpr('count(*)').show()

+--------+
|count(1)|
+--------+
|      15|
+--------+



In [21]:
# The functions API can be used instead of a SQL expression string.
# NOTE(review): the original note claims this is faster — unverified; check the API docs.
from pyspark.sql.functions import countDistinct
emp_df.select(countDistinct('job')).show()

+-------------------+
|count(DISTINCT job)|
+-------------------+
|                  5|
+-------------------+



In [22]:
# Approximate distinct count; rsd is the error tolerance (maximum relative standard deviation) accepted.
from pyspark.sql.functions import approx_count_distinct
emp_df.select(approx_count_distinct('job', rsd=0.1)).show()

+--------------------------+
|approx_count_distinct(job)|
+--------------------------+
|                         5|
+--------------------------+



In [23]:
# first, last, min, max, sum, avg -> handled with functions only, without expr

In [24]:
# NOTE: min, max, sum and round imported here shadow Python's built-ins of the same name.
# From this cell onward those names build Spark Column expressions instead.
from pyspark.sql.functions import count, first, last, min, max, sum, avg, round

In [25]:
# count(): number of sal values, computed over the whole DataFrame.
emp_df.select(count("sal")).show()

+----------+
|count(sal)|
+----------+
|        15|
+----------+



In [26]:
# first(): the first sal value encountered.
# NOTE(review): no ordering is specified, so this presumably follows the file's row order — confirm.
emp_df.select(first('sal')).show()

+----------+
|first(sal)|
+----------+
|       800|
+----------+



In [27]:
# last(): the last sal value encountered.
# NOTE(review): no ordering is specified, so this presumably follows the file's row order — confirm.
emp_df.select(last('sal')).show()

+---------+
|last(sal)|
+---------+
|     3200|
+---------+



In [28]:
# min(): smallest sal value (this is the Spark function imported above, not Python's built-in min)
emp_df.select(min('sal')).show()

+--------+
|min(sal)|
+--------+
|     800|
+--------+



In [29]:
# max(): largest sal value (this is the Spark function imported above, not Python's built-in max)
emp_df.select(max('sal')).show()

+--------+
|max(sal)|
+--------+
|    5000|
+--------+



In [30]:
# sum(): total of all sal values (this is the Spark function imported above, not Python's built-in sum)
emp_df.select(sum('sal')).show()

+--------+
|sum(sal)|
+--------+
|   32225|
+--------+



In [31]:
# avg(): mean of sal, rounded to 2 decimal places with Spark's round()
emp_df.select(round(avg('sal'), 2)).show()

+------------------+
|round(avg(sal), 2)|
+------------------+
|           2148.33|
+------------------+



In [32]:
# The aggregate also accepts a Column object built with col() instead of a column-name string.
from pyspark.sql.functions import col
emp_df.select(sum(col('sal'))).show()

+--------+
|sum(sal)|
+--------+
|   32225|
+--------+



In [33]:
# Sum over distinct sal values only (a repeated salary is added once), written as a SQL expression.
emp_df.selectExpr('sum(distinct sal)').show()

+-----------------+
|sum(DISTINCT sal)|
+-----------------+
|            27975|
+-----------------+



In [34]:
# Aggregate once, then report total_salary / total_transaction (the count of sal values)
# next to avg_salary and mean_salary, which are both the rounded average of sal.
salary_summary = emp_df.select(
    sum("sal").alias("total_salary"),
    count("sal").alias("total_transaction"),
    round(avg("sal")).alias("avg_salary"),
    round(avg("sal")).alias("mean_salary"),
)
salary_summary.selectExpr(
    'total_salary/total_transaction',
    'avg_salary',
    'mean_salary',
).show()

+----------------------------------+----------+-----------+
|(total_salary / total_transaction)|avg_salary|mean_salary|
+----------------------------------+----------+-----------+
|                2148.3333333333335|    2148.0|     2148.0|
+----------------------------------+----------+-----------+



In [35]:
# Grouping: number of rows per job
emp_df.groupBy('job').count().show()

+---------+-----+
|      job|count|
+---------+-----+
|  ANALYST|    2|
| SALESMAN|    4|
|    CLERK|    5|
|  MANAGER|    3|
|PRESIDENT|    1|
+---------+-----+



In [36]:
# SQL equivalent:
#   SELECT job, count(job), sum(sal) ... GROUP BY job
# count(job) appears twice: once through the functions API (aliased 'quantity') and once through expr().
rows_by_job = emp_df.groupBy('job')
group_df = rows_by_job.agg(
    count('job').alias('quantity'),
    expr('count(job)'),
    sum('sal'),
)

group_df.show()

+---------+--------+----------+--------+
|      job|quantity|count(job)|sum(sal)|
+---------+--------+----------+--------+
|  ANALYST|       2|         2|    6000|
| SALESMAN|       4|         4|    5600|
|    CLERK|       5|         5|    7350|
|  MANAGER|       3|         3|    8275|
|PRESIDENT|       1|         1|    5000|
+---------+--------+----------+--------+



In [37]:
# Per job: mean of sal as SAL_AVG and standard deviation of sal as SAL_STDEV, both rounded to 2 decimals.

from pyspark.sql.functions import stddev

sal_avg = round(avg('sal'), 2).alias("SAL_AVG")
sal_stdev = round(stddev('sal'), 2).alias("SAL_STDEV")
emp_df.groupBy('job').agg(sal_avg, sal_stdev).show()

+---------+-------+---------+
|      job|SAL_AVG|SAL_STDEV|
+---------+-------+---------+
|  ANALYST| 3000.0|      0.0|
| SALESMAN| 1400.0|   177.95|
|    CLERK| 1470.0|   984.63|
|  MANAGER|2758.33|   274.24|
|PRESIDENT| 5000.0|     NULL|
+---------+-------+---------+



In [38]:
# Top 10 rows by sal, highest first.
emp_df.sort(emp_df['sal'].desc()).limit(10).show()

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|
| 9292|  JACK|    CLERK|7782|1982-01-23|3200|NULL|    70|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|
+-----+------+---------+----+----------+----+----+------+



In [39]:
# Window functions
from pyspark.sql.window import Window 
from pyspark.sql.functions import desc, rank

# No partitionBy here: the window covers the whole DataFrame, ordered by sal descending.
windowspec = Window.orderBy(desc('sal'))
# rank() over that window; the bare name on the last line only displays the unevaluated Column expression.
salAllRank = rank().over(windowspec)
salAllRank

Column<'RANK() OVER (ORDER BY sal DESC NULLS LAST unspecifiedframe$())'>

In [40]:
# Preview the first 10 rows in their original order, for comparison with the ranked output.
emp_df.show(10)

+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250|1400|    30|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|
+-----+------+---------+----+----------+----+----+------+
only showing top 10 rows



In [41]:
# Attach the overall salary rank; equal salaries share a rank and the following rank number is skipped.
emp_df.withColumn('salary_rank', salAllRank).show(10)

+-----+------+---------+----+----------+----+----+------+-----------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|salary_rank|
+-----+------+---------+----+----------+----+----+------+-----------+
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|          1|
| 9292|  JACK|    CLERK|7782|1982-01-23|3200|NULL|    70|          2|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|          3|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|          3|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|          5|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|          6|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|          7|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|          8|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|          9|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|         10|
+-----+------+---------+----+----------+----+----+------+-----------+
only showing top 10 

In [42]:
# Rank salaries within each job.
# Window.partitionBy() restarts the ranking for every distinct job value.
windowSpec = Window.partitionBy('job').orderBy(desc('sal'))
rank_within_job = rank().over(windowSpec)

# Build job_rank_df, then list it job by job in rank order.
job_rank_df = emp_df.withColumn("rank", rank_within_job)
job_rank_df.orderBy('job', 'rank').show()


+-----+------+---------+----+----------+----+----+------+----+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|rank|
+-----+------+---------+----+----------+----+----+------+----+
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|   1|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|   1|
| 9292|  JACK|    CLERK|7782|1982-01-23|3200|NULL|    70|   1|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|   2|
| 7876| ADAMS|    CLERK|7788|1987-05-23|1100|NULL|    20|   3|
| 7900| JAMES|    CLERK|7698|1981-12-03| 950|NULL|    30|   4|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|   5|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|   1|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|   2|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|   3|
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|   1|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|   1|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    3

In [47]:
# Rank within each department, highest salary first.
window_rank = Window.partitionBy('deptno').orderBy(desc('sal'))

# Add the rank column to emp_df and list the rows department by department.
(
    emp_df
    .withColumn("dept_rank", rank().over(window_rank))
    .orderBy("deptno", "dept_rank")
    .show()
)


+-----+------+---------+----+----------+----+----+------+---------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|dept_rank|
+-----+------+---------+----+----------+----+----+------+---------+
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|        1|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|        2|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|        3|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|        1|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|        1|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|        3|
| 7876| ADAMS|    CLERK|7788|1987-05-23|1100|NULL|    20|        4|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|        5|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|        1|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|        2|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|        3|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 50

In [48]:
# Cumulative salary: sum('sal').over(<window>)

# Running total within each department, highest salary first.
# empno is added as a tie-breaker: with a ROWS frame, employees sharing the same sal would otherwise
# receive their running totals (and be displayed) in an arbitrary order that can change between runs.
window_cum = (
    Window.partitionBy("deptno")
    .orderBy(desc("sal"), "empno")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

(
    emp_df
    .withColumn("cum_salary", sum("sal").over(window_cum))
    .orderBy("deptno", desc("sal"), "empno")
    .show()
)


+-----+------+---------+----+----------+----+----+------+----------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|cum_salary|
+-----+------+---------+----+----------+----+----+------+----------+
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|      5000|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|      7450|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|      8750|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|      3000|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|      6000|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|      8975|
| 7876| ADAMS|    CLERK|7788|1987-05-23|1100|NULL|    20|     10075|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|     10875|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|      2850|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|      4450|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|      5950|
| 7521|  WARD| SALESMAN|7698|1981-

In [49]:
# Cumulative salary per department

# Window spec for the per-department running sum: from the first row of the partition up to the current row.
# empno breaks ties between equal salaries so the running totals and the display order are reproducible.
window_cum = (
    Window.partitionBy("deptno")
    .orderBy(desc("sal"), "empno")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Add the cumulative salary column
(
    emp_df
    .withColumn("cum_salary", sum("sal").over(window_cum))
    .orderBy("deptno", desc("sal"), "empno")
    .show()
)

+-----+------+---------+----+----------+----+----+------+----------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|cum_salary|
+-----+------+---------+----+----------+----+----+------+----------+
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|      5000|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|      7450|
| 7934|MILLER|    CLERK|7782|1982-01-23|1300|NULL|    10|      8750|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|      3000|
| 7902|  FORD|  ANALYST|7566|1981-12-03|3000|NULL|    20|      6000|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|      8975|
| 7876| ADAMS|    CLERK|7788|1987-05-23|1100|NULL|    20|     10075|
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|     10875|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|      2850|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|      4450|
| 7844|TURNER| SALESMAN|7698|1981-09-08|1500|   0|    30|      5950|
| 7521|  WARD| SALESMAN|7698|1981-

In [50]:
# Average salary per department, rounded to 2 decimals.
from pyspark.sql.functions import avg, round

dept_avg_salary = round(avg("sal"), 2).alias("avg_salary")
emp_df.groupBy("deptno").agg(dept_avg_salary).orderBy("deptno").show()

+------+----------+
|deptno|avg_salary|
+------+----------+
|    10|   2916.67|
|    20|    2175.0|
|    30|   1566.67|
|    70|    3200.0|
+------+----------+



In [None]:
# Subtotals per (department, job) pair.
# groupBy().agg() alone only builds a lazy DataFrame; orderBy() + show() run it and print the rows.
emp_df.groupBy('deptno', 'job').agg(count('*')).orderBy('deptno', 'job').show()

In [51]:
# cube() adds subtotal rows for every combination of the grouping columns;
# a NULL in deptno and/or job marks a row aggregated across that column.
(
    emp_df
    .cube('deptno', 'job')
    .agg(count('*'), sum('sal'))
    .orderBy('deptno', 'job')
    .show()
)

+------+---------+--------+--------+
|deptno|      job|count(1)|sum(sal)|
+------+---------+--------+--------+
|  NULL|     NULL|      15|   32225|
|  NULL|  ANALYST|       2|    6000|
|  NULL|    CLERK|       5|    7350|
|  NULL|  MANAGER|       3|    8275|
|  NULL|PRESIDENT|       1|    5000|
|  NULL| SALESMAN|       4|    5600|
|    10|     NULL|       3|    8750|
|    10|    CLERK|       1|    1300|
|    10|  MANAGER|       1|    2450|
|    10|PRESIDENT|       1|    5000|
|    20|     NULL|       5|   10875|
|    20|  ANALYST|       2|    6000|
|    20|    CLERK|       2|    1900|
|    20|  MANAGER|       1|    2975|
|    30|     NULL|       6|    9400|
|    30|    CLERK|       1|     950|
|    30|  MANAGER|       1|    2850|
|    30| SALESMAN|       4|    5600|
|    70|     NULL|       1|    3200|
|    70|    CLERK|       1|    3200|
+------+---------+--------+--------+



In [56]:
# Join on an explicit column-equality condition; the result keeps the deptno column from both sides.
same_dept = emp_df.deptno == dept_df.deptno
emp_dept_df = emp_df.join(dept_df, same_dept)
emp_dept_df.show()

+-----+------+---------+----+----------+----+----+------+------+----------+--------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|deptno|     dname|     loc|
+-----+------+---------+----+----------+----+----+------+------+----------+--------+
| 7369| SMITH|    CLERK|7902|1980-12-17| 800|NULL|    20|    20|  RESEARCH|  DALLAS|
| 7499| ALLEN| SALESMAN|7698|1981-02-20|1600| 300|    30|    30|     SALES| CHICAGO|
| 7521|  WARD| SALESMAN|7698|1981-02-22|1250| 500|    30|    30|     SALES| CHICAGO|
| 7566| JONES|  MANAGER|7839|1981-04-02|2975|NULL|    20|    20|  RESEARCH|  DALLAS|
| 7654|MARTIN| SALESMAN|7698|1981-09-28|1250|1400|    30|    30|     SALES| CHICAGO|
| 7698| BLAKE|  MANAGER|7839|1981-05-01|2850|NULL|    30|    30|     SALES| CHICAGO|
| 7782| CLARK|  MANAGER|7839|1981-06-09|2450|NULL|    10|    10|ACCOUNTING|NEW YORK|
| 7788| SCOTT|  ANALYST|7566|1987-04-19|3000|NULL|    20|    20|  RESEARCH|  DALLAS|
| 7839|  KING|PRESIDENT|NULL|1981-11-17|5000|NULL|    10|    10|A

In [59]:
# Inner join on the shared column name, so 'deptno' can be selected by name without ambiguity.
join_df = emp_df.join(dept_df, 'deptno', 'inner')
join_df.select('ename', 'deptno', 'dname').show()

+------+------+----------+
| ename|deptno|     dname|
+------+------+----------+
| SMITH|    20|  RESEARCH|
| ALLEN|    30|     SALES|
|  WARD|    30|     SALES|
| JONES|    20|  RESEARCH|
|MARTIN|    30|     SALES|
| BLAKE|    30|     SALES|
| CLARK|    10|ACCOUNTING|
| SCOTT|    20|  RESEARCH|
|  KING|    10|ACCOUNTING|
|TURNER|    30|     SALES|
| ADAMS|    20|  RESEARCH|
| JAMES|    30|     SALES|
|  FORD|    20|  RESEARCH|
|MILLER|    10|ACCOUNTING|
+------+------+----------+



In [60]:
# Stop the Spark session created at the top of the notebook.
spark.stop()

In [None]:
# SQL
