## RDD를 이용해서 데이터프레임 생성

In [1]:
from pyspark.sql.types import StringType, IntegerType

### 리스트를 이용해서 데이터프레임 만들기

In [2]:
# 리스트로 RDD 객체 생성
words = sc.parallelize(['apple','peach','banana','mango','pineapple'])
words.collect()

                                                                                

['apple', 'peach', 'banana', 'mango', 'pineapple']

In [3]:
# createDataFrame(RDD객체, 타입)
df = spark.createDataFrame(words, StringType())
df.show()

                                                                                

+---------+
|    value|
+---------+
|    apple|
|    peach|
|   banana|
|    mango|
|pineapple|
+---------+



In [4]:
# 리스트로 데이터프레임 객체 생성 - (과일명, 가격)
data = [('apple',1500),('peach',2000),('banana',1500),('mango',2500),('pineapple',3000)]

fruits = spark.createDataFrame(data)
fruits.collect()

[Row(_1='apple', _2=1500),
 Row(_1='peach', _2=2000),
 Row(_1='banana', _2=1500),
 Row(_1='mango', _2=2500),
 Row(_1='pineapple', _2=3000)]

In [5]:
# 컬럼명을 지정하면서 데이터프레임 객체 생성
fruits = spark.createDataFrame(data, ['fruit','price'])
fruits.collect()

[Row(fruit='apple', price=1500),
 Row(fruit='peach', price=2000),
 Row(fruit='banana', price=1500),
 Row(fruit='mango', price=2500),
 Row(fruit='pineapple', price=3000)]

In [6]:
# 컬럼명, 데이터타입을 지정하면서 데이터프레임 객체 생성
# 컬럼명:데이터타입 으로 정의
fruits = spark.createDataFrame(data, "fruit:string, price:int")
fruits.collect()

[Row(fruit='apple', price=1500),
 Row(fruit='peach', price=2000),
 Row(fruit='banana', price=1500),
 Row(fruit='mango', price=2500),
 Row(fruit='pineapple', price=3000)]

In [7]:
# 특정 컬럼만 출력 : select
fruits.select('fruit').collect()

[Row(fruit='apple'),
 Row(fruit='peach'),
 Row(fruit='banana'),
 Row(fruit='mango'),
 Row(fruit='pineapple')]

In [3]:
## 스파크세션을 이용한 고급 데이터프레임 생성
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

In [4]:
#데이터프레임 스키마 정의 - employees
# 스파크 세션 객체 직접 생성
spark = SparkSession.builder.appName("emp").getOrCreate()

In [5]:
# 데이터프레임 생성 전 스키마 정의
# add(컬럼명, 데이터타입)
emp_schema = StructType().add("empno", "integer").add("fname", "string").add("lname", "string")\
                        .add("hdate", "string").add("sal", "integer").add("deptid", "integer")

In [6]:
# 지정한 스키마를 이용해서 데이터프레임 생성
# 데이터프레임의 각 행은 set 객체로 정의
# 위에서 정의한 스키마는 schema 속성으로 지정
df = spark.createDataFrame([(123,'steve','king','',35000,None),\
                            (456,'john','seo','2005-12-15',20000,50),\
                            (789,'david',None,'2004-03-01',22000,90)], schema=emp_schema)
df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-----+-----+-----+----------+-----+------+
|empno|fname|lname|     hdate|  sal|deptid|
+-----+-----+-----+----------+-----+------+
|  123|steve| king|          |35000|  null|
|  456| john|  seo|2005-12-15|20000|    50|
|  789|david| null|2004-03-01|22000|    90|
+-----+-----+-----+----------+-----+------+



                                                                                

In [7]:
# 데이터프레임의 스키마 확인
df.printSchema()

root
 |-- empno: integer (nullable = true)
 |-- fname: string (nullable = true)
 |-- lname: string (nullable = true)
 |-- hdate: string (nullable = true)
 |-- sal: integer (nullable = true)
 |-- deptid: integer (nullable = true)



In [8]:
# 결측치 처리
# fillna(대체값)
df.fillna('X').show()    # 문자형 결측치 대체
df.fillna(999).show()    # 숫자형 결측치 대체

+-----+-----+-----+----------+-----+------+
|empno|fname|lname|     hdate|  sal|deptid|
+-----+-----+-----+----------+-----+------+
|  123|steve| king|          |35000|  null|
|  456| john|  seo|2005-12-15|20000|    50|
|  789|david|    X|2004-03-01|22000|    90|
+-----+-----+-----+----------+-----+------+

+-----+-----+-----+----------+-----+------+
|empno|fname|lname|     hdate|  sal|deptid|
+-----+-----+-----+----------+-----+------+
|  123|steve| king|          |35000|   999|
|  456| john|  seo|2005-12-15|20000|    50|
|  789|david| null|2004-03-01|22000|    90|
+-----+-----+-----+----------+-----+------+



In [9]:
# 여러 컬럼에 대해 결측치 처리 : dict 이용
df.fillna({'lname':'X', 'deptid':999}).show()

+-----+-----+-----+----------+-----+------+
|empno|fname|lname|     hdate|  sal|deptid|
+-----+-----+-----+----------+-----+------+
|  123|steve| king|          |35000|   999|
|  456| john|  seo|2005-12-15|20000|    50|
|  789|david|    X|2004-03-01|22000|    90|
+-----+-----+-----+----------+-----+------+



In [10]:
# 결측치 제거
# na.drop()
df2 = spark.createDataFrame([(123,'steve','king','2003-06-17',35000,None),\
                            (456,'john','seo','2005-12-15',20000,50),\
                            (789,'david',None,'2004-03-01',22000,90)], schema=emp_schema)
df2.show()

+-----+-----+-----+----------+-----+------+
|empno|fname|lname|     hdate|  sal|deptid|
+-----+-----+-----+----------+-----+------+
|  123|steve| king|2003-06-17|35000|  null|
|  456| john|  seo|2005-12-15|20000|    50|
|  789|david| null|2004-03-01|22000|    90|
+-----+-----+-----+----------+-----+------+



In [11]:
df2.na.drop().show()

+-----+-----+-----+----------+-----+------+
|empno|fname|lname|     hdate|  sal|deptid|
+-----+-----+-----+----------+-----+------+
|  456| john|  seo|2005-12-15|20000|    50|
+-----+-----+-----+----------+-----+------+



In [12]:
# 특정 컬럼 제거
# drop(컬럼명)
df2.drop('fname').show()

+-----+-----+----------+-----+------+
|empno|lname|     hdate|  sal|deptid|
+-----+-----+----------+-----+------+
|  123| king|2003-06-17|35000|  null|
|  456|  seo|2005-12-15|20000|    50|
|  789| null|2004-03-01|22000|    90|
+-----+-----+----------+-----+------+



In [13]:
# 여러 컬럼 제거
df2.drop('hdate','sal').show()

+-----+-----+-----+------+
|empno|fname|lname|deptid|
+-----+-----+-----+------+
|  123|steve| king|  null|
|  456| john|  seo|    50|
|  789|david| null|    90|
+-----+-----+-----+------+



## csv 파일을 이용해서 데이터프레임 만들기
+ read.csv(경로, 헤더여부, 스키마여부)

In [14]:
emp = spark.read.csv("employees.csv", header=True, inferSchema=True)
emp.printSchema()    # 데이터프레임 스키마 출력

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: timestamp (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- COMMISSION_PCT: double (nullable = true)
 |-- MANAGER_ID: integer (nullable = true)
 |-- DEPARTMENT_ID: integer (nullable = true)



In [15]:
emp.count()

107

In [16]:
emp.columns    # 데이터프레임 컬럼 목록 출력

['EMPLOYEE_ID',
 'FIRST_NAME',
 'LAST_NAME',
 'EMAIL',
 'PHONE_NUMBER',
 'HIRE_DATE',
 'JOB_ID',
 'SALARY',
 'COMMISSION_PCT',
 'MANAGER_ID',
 'DEPARTMENT_ID']

In [17]:
emp.show(5)

+-----------+----------+---------+--------+------------+-------------------+-------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|          HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+-------------------+-------+------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|2003-06-17 00:00:00|AD_PRES| 24000|          null|      null|           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|2005-09-21 00:00:00|  AD_VP| 17000|          null|       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|2001-01-13 00:00:00|  AD_VP| 17000|          null|       100|           90|
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|2006-01-03 00:00:00|IT_PROG|  9000|          null|       102|           60|
|        104|     Bruce|    Ernst|  BERNST|590.423.4568|2007-05-21 00:00:00|

In [18]:
# 데이터 요약 보기
# summary(통계항목)
emp.summary().show()
emp.summary('count','min','max').show()

                                                                                

+-------+----------------+----------+---------+-------+------------------+----------+------------------+-------------------+------------------+------------------+
|summary|     EMPLOYEE_ID|FIRST_NAME|LAST_NAME|  EMAIL|      PHONE_NUMBER|    JOB_ID|            SALARY|     COMMISSION_PCT|        MANAGER_ID|     DEPARTMENT_ID|
+-------+----------------+----------+---------+-------+------------------+----------+------------------+-------------------+------------------+------------------+
|  count|             107|       107|      107|    107|               107|       107|               107|                 35|               106|               106|
|   mean|           153.0|      null|     null|   null|              null|      null|6461.8317757009345|0.22285714285714286|124.76415094339623| 63.20754716981132|
| stddev|31.0322412983658|      null|     null|   null|              null|      null|3909.5797305524825|0.08518393346757594|20.315395000692018|20.910110100200708|
|    min|             

In [19]:
emp.select('SALARY','COMMISSION_PCT').summary().show()

+-------+------------------+-------------------+
|summary|            SALARY|     COMMISSION_PCT|
+-------+------------------+-------------------+
|  count|               107|                 35|
|   mean|6461.8317757009345|0.22285714285714286|
| stddev|3909.5797305524825|0.08518393346757594|
|    min|              2100|                0.1|
|    25%|              3100|               0.15|
|    50%|              6200|                0.2|
|    75%|              9000|                0.3|
|    max|             24000|                0.4|
+-------+------------------+-------------------+



## 데이터프레임 데이터 탐색
+ select : 컬럼 선택
+ filter : 조건 검색
+ where : 고급 조건 검색
+ orderBy : 정렬
+ groupBy : 그룹화

In [20]:
# 모든 사원의 이름 조회
emp.select(['FIRST_NAME','LAST_NAME']).show(5)

+----------+---------+
|FIRST_NAME|LAST_NAME|
+----------+---------+
|    Steven|     King|
|     Neena|  Kochhar|
|       Lex|  De Haan|
| Alexander|   Hunold|
|     Bruce|    Ernst|
+----------+---------+
only showing top 5 rows



In [21]:
# 급여가 7000 이상인 사원 조회
emp.filter(emp['SALARY'] >= 7000).show(5)

+-----------+----------+---------+--------+------------+-------------------+-------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|          HIRE_DATE| JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+-------------------+-------+------+--------------+----------+-------------+
|        100|    Steven|     King|   SKING|515.123.4567|2003-06-17 00:00:00|AD_PRES| 24000|          null|      null|           90|
|        101|     Neena|  Kochhar|NKOCHHAR|515.123.4568|2005-09-21 00:00:00|  AD_VP| 17000|          null|       100|           90|
|        102|       Lex|  De Haan| LDEHAAN|515.123.4569|2001-01-13 00:00:00|  AD_VP| 17000|          null|       100|           90|
|        103| Alexander|   Hunold| AHUNOLD|590.423.4567|2006-01-03 00:00:00|IT_PROG|  9000|          null|       102|           60|
|        108|     Nancy|Greenberg|NGREENBE|515.124.4569|2002-08-17 00:00:00|

In [22]:
# 급여가 7000 이상인 사원의 수 조회
emp.filter(emp['SALARY'] >= 7000).count()

47

In [23]:
# 2006-02-05부터 2006-11-15 사이에 고용된 사원 조회
emp.filter((emp['HIRE_DATE'] >= '2006-02-05')&(emp['HIRE_DATE'] <= '2006-11-15')).show(5)

+-----------+-----------+-----------+--------+------------+-------------------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID| FIRST_NAME|  LAST_NAME|   EMAIL|PHONE_NUMBER|          HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+-----------+-----------+--------+------------+-------------------+----------+------+--------------+----------+-------------+
|        106|      Valli|  Pataballa|VPATABAL|590.423.4560|2006-02-05 00:00:00|   IT_PROG|  4800|          null|       103|           60|
|        112|Jose Manuel|      Urman| JMURMAN|515.124.4469|2006-03-07 00:00:00|FI_ACCOUNT|  7800|          null|       108|          100|
|        126|      Irene|Mikkilineni|IMIKKILI|650.124.1224|2006-09-28 00:00:00|  ST_CLERK|  2700|          null|       120|           50|
|        134|    Michael|     Rogers| MROGERS|650.127.1834|2006-08-26 00:00:00|  ST_CLERK|  2900|          null|       122|           50|
|        139|       John|        S

In [24]:
emp.where((emp['HIRE_DATE'] >= '2006-02-05')&(emp['HIRE_DATE'] <= '2006-11-15')).show(5)

+-----------+-----------+-----------+--------+------------+-------------------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID| FIRST_NAME|  LAST_NAME|   EMAIL|PHONE_NUMBER|          HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+-----------+-----------+--------+------------+-------------------+----------+------+--------------+----------+-------------+
|        106|      Valli|  Pataballa|VPATABAL|590.423.4560|2006-02-05 00:00:00|   IT_PROG|  4800|          null|       103|           60|
|        112|Jose Manuel|      Urman| JMURMAN|515.124.4469|2006-03-07 00:00:00|FI_ACCOUNT|  7800|          null|       108|          100|
|        126|      Irene|Mikkilineni|IMIKKILI|650.124.1224|2006-09-28 00:00:00|  ST_CLERK|  2700|          null|       120|           50|
|        134|    Michael|     Rogers| MROGERS|650.127.1834|2006-08-26 00:00:00|  ST_CLERK|  2900|          null|       122|           50|
|        139|       John|        S

In [25]:
# 부서번호별 사원수 조회
emp.groupBy('DEPARTMENT_ID').count().show()



+-------------+-----+
|DEPARTMENT_ID|count|
+-------------+-----+
|         null|    1|
|           20|    2|
|           40|    1|
|          100|    6|
|           10|    1|
|           50|   45|
|           80|   34|
|           70|    1|
|           90|    3|
|           60|    5|
|          110|    2|
|           30|    6|
+-------------+-----+



                                                                                

In [26]:
# 부서번호별 사원수 조회 후 부서번호 순으로 정렬
emp.groupBy('DEPARTMENT_ID').count().orderBy('DEPARTMENT_ID').show()



+-------------+-----+
|DEPARTMENT_ID|count|
+-------------+-----+
|         null|    1|
|           10|    1|
|           20|    2|
|           30|    6|
|           40|    1|
|           50|   45|
|           60|    5|
|           70|    1|
|           80|   34|
|           90|    3|
|          100|    6|
|          110|    2|
+-------------+-----+



                                                                                

In [27]:
# 직책별 사원수 조회
emp.groupBy('JOB_ID').count().show()

                                                                                

+----------+-----+
|    JOB_ID|count|
+----------+-----+
|FI_ACCOUNT|    5|
|    MK_MAN|    1|
|   IT_PROG|    5|
|    FI_MGR|    1|
|AC_ACCOUNT|    1|
|    HR_REP|    1|
|  PU_CLERK|    5|
|    AC_MGR|    1|
|    PR_REP|    1|
|    ST_MAN|    5|
|    MK_REP|    1|
|    SA_REP|   30|
|    SA_MAN|    5|
|    PU_MAN|    1|
|  SH_CLERK|   20|
|   AD_PRES|    1|
|  ST_CLERK|   20|
|   AD_ASST|    1|
|     AD_VP|    2|
+----------+-----+



In [28]:
# 직책별 사원수 조회 후 직책 순으로 정렬
emp.groupBy('JOB_ID').count().orderBy('JOB_ID').show()



+----------+-----+
|    JOB_ID|count|
+----------+-----+
|AC_ACCOUNT|    1|
|    AC_MGR|    1|
|   AD_ASST|    1|
|   AD_PRES|    1|
|     AD_VP|    2|
|FI_ACCOUNT|    5|
|    FI_MGR|    1|
|    HR_REP|    1|
|   IT_PROG|    5|
|    MK_MAN|    1|
|    MK_REP|    1|
|    PR_REP|    1|
|  PU_CLERK|    5|
|    PU_MAN|    1|
|    SA_MAN|    5|
|    SA_REP|   30|
|  SH_CLERK|   20|
|  ST_CLERK|   20|
|    ST_MAN|    5|
+----------+-----+



                                                                                

In [29]:
# 직책별 사원수 조회 후 사원수 순으로 내림정렬
emp.groupBy('JOB_ID').count().orderBy('count', ascending=False).show()



+----------+-----+
|    JOB_ID|count|
+----------+-----+
|    SA_REP|   30|
|  SH_CLERK|   20|
|  ST_CLERK|   20|
|   IT_PROG|    5|
|    ST_MAN|    5|
|    SA_MAN|    5|
|  PU_CLERK|    5|
|FI_ACCOUNT|    5|
|     AD_VP|    2|
|    MK_MAN|    1|
|    FI_MGR|    1|
|    HR_REP|    1|
|    AC_MGR|    1|
|AC_ACCOUNT|    1|
|    PU_MAN|    1|
|   AD_PRES|    1|
|    PR_REP|    1|
|    MK_REP|    1|
|   AD_ASST|    1|
+----------+-----+



                                                                                

## 집계함수 사용하기
+ agg(집계함수명)

In [30]:
import pyspark.sql.functions as F

In [31]:
# 직책별 평균 급여를 조회
rs = emp.groupBy('JOB_ID').agg(F.avg('SALARY'))
rs.show()
rs.orderBy('avg(SALARY)', ascending=False).show()    # 내림차순으로 정렬

rs = emp.groupBy('JOB_ID').agg(F.avg('SALARY').alias('mean sal'))    # 별칭 부여
rs.orderBy('mean sal', ascending=False).show()

                                                                                

+----------+-----------+
|    JOB_ID|avg(SALARY)|
+----------+-----------+
|FI_ACCOUNT|     7920.0|
|    MK_MAN|    13000.0|
|   IT_PROG|     5760.0|
|    FI_MGR|    12008.0|
|AC_ACCOUNT|     8300.0|
|    HR_REP|     6500.0|
|  PU_CLERK|     2780.0|
|    AC_MGR|    12008.0|
|    PR_REP|    10000.0|
|    ST_MAN|     7280.0|
|    MK_REP|     6000.0|
|    SA_REP|     8350.0|
|    SA_MAN|    12200.0|
|    PU_MAN|    11000.0|
|  SH_CLERK|     3215.0|
|   AD_PRES|    24000.0|
|  ST_CLERK|     2785.0|
|   AD_ASST|     4400.0|
|     AD_VP|    17000.0|
+----------+-----------+



                                                                                

+----------+-----------+
|    JOB_ID|avg(SALARY)|
+----------+-----------+
|   AD_PRES|    24000.0|
|     AD_VP|    17000.0|
|    MK_MAN|    13000.0|
|    SA_MAN|    12200.0|
|    FI_MGR|    12008.0|
|    AC_MGR|    12008.0|
|    PU_MAN|    11000.0|
|    PR_REP|    10000.0|
|    SA_REP|     8350.0|
|AC_ACCOUNT|     8300.0|
|FI_ACCOUNT|     7920.0|
|    ST_MAN|     7280.0|
|    HR_REP|     6500.0|
|    MK_REP|     6000.0|
|   IT_PROG|     5760.0|
|   AD_ASST|     4400.0|
|  SH_CLERK|     3215.0|
|  ST_CLERK|     2785.0|
|  PU_CLERK|     2780.0|
+----------+-----------+





+----------+--------+
|    JOB_ID|mean sal|
+----------+--------+
|   AD_PRES| 24000.0|
|     AD_VP| 17000.0|
|    MK_MAN| 13000.0|
|    SA_MAN| 12200.0|
|    FI_MGR| 12008.0|
|    AC_MGR| 12008.0|
|    PU_MAN| 11000.0|
|    PR_REP| 10000.0|
|    SA_REP|  8350.0|
|AC_ACCOUNT|  8300.0|
|FI_ACCOUNT|  7920.0|
|    ST_MAN|  7280.0|
|    HR_REP|  6500.0|
|    MK_REP|  6000.0|
|   IT_PROG|  5760.0|
|   AD_ASST|  4400.0|
|  SH_CLERK|  3215.0|
|  ST_CLERK|  2785.0|
|  PU_CLERK|  2780.0|
+----------+--------+



                                                                                

In [34]:
# 사원들의 직책을 모두 출력하세요,
# 단 중복없이 하나씩만 표시되도록 합니다.(distinct)

emp.select('JOB_ID').distinct().show()

+----------+
|    JOB_ID|
+----------+
|FI_ACCOUNT|
|    MK_MAN|
|   IT_PROG|
|    FI_MGR|
|AC_ACCOUNT|
|    HR_REP|
|  PU_CLERK|
|    AC_MGR|
|    PR_REP|
|    ST_MAN|
|    MK_REP|
|    SA_REP|
|    SA_MAN|
|    PU_MAN|
|  SH_CLERK|
|   AD_PRES|
|  ST_CLERK|
|   AD_ASST|
|     AD_VP|
+----------+



In [35]:
# 모든 직책 수는? (중복 제외하고 카운팅)

emp.select('JOB_ID').distinct().count()

                                                                                

19

In [36]:
emp.select(F.countDistinct('JOB_ID').alias('JOB_ID')).show()



+------+
|JOB_ID|
+------+
|    19|
+------+



                                                                                

In [41]:
# 사원의 이름, 직책, 급여 출력하세요
# 단, 5% 인상한 급여도 같이 출력합니다.

# select sal, sal * 1.05 from emp
# emp.select('FIRST_NAME','JOB_ID','SALARY', emp.SALARY * 1.05).show()

emp.select('FIRST_NAME','JOB_ID','SALARY', (emp.SALARY * 1.05).alias('5%sal')).show(5)

+----------+-------+------+-------+
|FIRST_NAME| JOB_ID|SALARY|  5%sal|
+----------+-------+------+-------+
|    Steven|AD_PRES| 24000|25200.0|
|     Neena|  AD_VP| 17000|17850.0|
|       Lex|  AD_VP| 17000|17850.0|
| Alexander|IT_PROG|  9000| 9450.0|
|     Bruce|IT_PROG|  6000| 6300.0|
+----------+-------+------+-------+
only showing top 5 rows



In [50]:
# 20 번 또는 50 번 부서에 근무하며, 급여가 5000 ~ 12,000 사이인 사원들의 LAST_NAME 및 급여을 조회하세요

# emp.filter((emp.DEPARTMENT_ID == 20) | (emp.DEPARTMENT_ID == 50)).filter((emp.SALARY >= 5000 ) & (emp.SALARY <= 12000)).show(5)

emp.filter((emp.DEPARTMENT_ID == 20) | (emp.DEPARTMENT_ID == 50)).filter((emp.SALARY >= 5000 ) & (emp.SALARY <= 12000)) \
.orderBy('SALARY').show(5)

+-----------+----------+---------+--------+------------+-------------------+------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|          HIRE_DATE|JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+-------------------+------+------+--------------+----------+-------------+
|        124|     Kevin|  Mourgos|KMOURGOS|650.123.5234|2007-11-16 00:00:00|ST_MAN|  5800|          null|       100|           50|
|        202|       Pat|      Fay|    PFAY|603.123.6666|2005-08-17 00:00:00|MK_REP|  6000|          null|       201|           20|
|        123|    Shanta|  Vollman|SVOLLMAN|650.123.4234|2005-10-10 00:00:00|ST_MAN|  6500|          null|       100|           50|
|        122|     Payam| Kaufling|PKAUFLIN|650.123.3234|2003-05-01 00:00:00|ST_MAN|  7900|          null|       100|           50|
|        120|   Matthew|    Weiss|  MWEISS|650.123.1234|2004-07-18 00:00:00|ST_MAN|