# ch 6_ SET OPERATION
****
#### KEY WORD
* UNION
* INTERSECT
* MINUS
* ROLLUP
* CUBE
* GROUPING SETS
***

## 6-1 UNION

### < UNION >
* union은 중복된 데이터를 제거하고 하나만 출력
* 중복 체크를 위해 SORT operation 발생 (ORDER BY절을 쓰지 않아도 내부적으로 정렬 작업이 수행됨)

In [1]:
import cx_Oracle
import pandas as pd
# Open the Oracle connection used by every pd.read_sql call in the cells below.
# NOTE(review): credentials are hard-coded in the connect string — acceptable
# for a local tutorial database only.
xedb = cx_Oracle.connect('hr/hr@localhost/xe')
# Cursor on the same connection; the visible cells never execute on it and
# only close it in the final cell.
cur = xedb.cursor()

In [2]:
# UNION: combine (employee_id, job_id) pairs from employees and job_history,
# keeping each distinct pair once, then print the resulting DataFrame.
df = pd.read_sql(
    sql="""
        SELECT employee_id, job_id
        FROM employees
        
        UNION
        
        SELECT employee_id, job_id
        FROM job_history
        """,
    con=xedb,
)
print(df)

     EMPLOYEE_ID      JOB_ID
0            100     AD_PRES
1            101  AC_ACCOUNT
2            101      AC_MGR
3            101       AD_VP
4            102       AD_VP
5            102     IT_PROG
6            103     IT_PROG
7            104     IT_PROG
8            105     IT_PROG
9            106     IT_PROG
10           107     IT_PROG
11           108      FI_MGR
12           109  FI_ACCOUNT
13           110  FI_ACCOUNT
14           111  FI_ACCOUNT
15           112  FI_ACCOUNT
16           113  FI_ACCOUNT
17           114      PU_MAN
18           114    ST_CLERK
19           115    PU_CLERK
20           116    PU_CLERK
21           117    PU_CLERK
22           118    PU_CLERK
23           119    PU_CLERK
24           120      ST_MAN
25           121      ST_MAN
26           122    ST_CLERK
27           122      ST_MAN
28           123      ST_MAN
29           124      ST_MAN
..           ...         ...
85           179      SA_REP
86           180    SH_CLERK
87           1

###  < UNION ALL>
* Union all 합집합이되 중복된 데이터도 출력
* sort operation 발생하지 않음 
* 인위적으로 sort 가능(order by절)
* ORDER BY절에는 첫번째 쿼리 문장의 컬럼명을 사용하거나 위치표기법(예: ORDER BY 1)을 사용해야 한다

In [3]:
# UNION ALL: concatenate both result sets without removing duplicates.
# The trailing ORDER BY 1 sorts the combined rows by the first selected
# column (employee_id) using positional notation.
df = pd.read_sql(
    sql="""
        SELECT employee_id, job_id
        FROM employees
        
        UNION ALL
        
        SELECT employee_id, job_id
        FROM job_history
        ORDER BY 1
        """,
    con=xedb,
)
print(df)

     EMPLOYEE_ID      JOB_ID
0            100     AD_PRES
1            101       AD_VP
2            101  AC_ACCOUNT
3            101      AC_MGR
4            102       AD_VP
5            102     IT_PROG
6            103     IT_PROG
7            104     IT_PROG
8            105     IT_PROG
9            106     IT_PROG
10           107     IT_PROG
11           108      FI_MGR
12           109  FI_ACCOUNT
13           110  FI_ACCOUNT
14           111  FI_ACCOUNT
15           112  FI_ACCOUNT
16           113  FI_ACCOUNT
17           114      PU_MAN
18           114    ST_CLERK
19           115    PU_CLERK
20           116    PU_CLERK
21           117    PU_CLERK
22           118    PU_CLERK
23           119    PU_CLERK
24           120      ST_MAN
25           121      ST_MAN
26           122      ST_MAN
27           122    ST_CLERK
28           123      ST_MAN
29           124      ST_MAN
..           ...         ...
87           180    SH_CLERK
88           181    SH_CLERK
89           1

## 6-2 INTERSECTION
* SORT operation발생

In [4]:
# INTERSECT: keep only the (employee_id, job_id) pairs that appear in both
# employees and job_history.
df = pd.read_sql(
    sql="""
        SELECT employee_id, job_id
        FROM employees
        
        INTERSECT
        
        SELECT employee_id, job_id
        FROM job_history
        """,
    con=xedb,
)
print(df)

   EMPLOYEE_ID   JOB_ID
0          176   SA_REP
1          200  AD_ASST


## 6-3 MINUS

In [5]:
# MINUS: (employee_id, job_id) pairs present in employees but absent from
# job_history.
df = pd.read_sql(
    sql="""
        SELECT employee_id, job_id
        FROM employees
        
        MINUS
        
        SELECT employee_id, job_id
        FROM job_history
        """,
    con=xedb,
)
print(df)

     EMPLOYEE_ID      JOB_ID
0            100     AD_PRES
1            101       AD_VP
2            102       AD_VP
3            103     IT_PROG
4            104     IT_PROG
5            105     IT_PROG
6            106     IT_PROG
7            107     IT_PROG
8            108      FI_MGR
9            109  FI_ACCOUNT
10           110  FI_ACCOUNT
11           111  FI_ACCOUNT
12           112  FI_ACCOUNT
13           113  FI_ACCOUNT
14           114      PU_MAN
15           115    PU_CLERK
16           116    PU_CLERK
17           117    PU_CLERK
18           118    PU_CLERK
19           119    PU_CLERK
20           120      ST_MAN
21           121      ST_MAN
22           122      ST_MAN
23           123      ST_MAN
24           124      ST_MAN
25           125    ST_CLERK
26           126    ST_CLERK
27           127    ST_CLERK
28           128    ST_CLERK
29           129    ST_CLERK
..           ...         ...
75           175      SA_REP
76           177      SA_REP
77           1

## 6-4 ROLLUP
### < 열개수, 타입 맞추기 >

    SELECT department_id, job_id, manager_id, sum(salary)
    FROM employees
    GROUP BY department_id, job_id, manager_id

    UNION ALL

    SELECT department_id, job_id, NULL, sum(salary)
    FROM employees
    GROUP BY department_id, job_id

    UNION ALL

    SELECT department_id, NULL, NULL, sum(salary)
    FROM employees
    GROUP BY department_id

    UNION ALL

    SELECT NULL, NULL, NULL, sum(salary)
    FROM employees 
    
#### 문제점 : 동일한 테이블(employees)을 4번씩이나 반복적으로 작업 ( ROLLUP으로 해결)

###  < ROLLUP >
* GROUP BY 절의 UNION ALL과 같다.

      GROUP BY ROLLUP(a, b, c) 는

      GROUP BY a,b,c
      UNION ALL
      GROUP BY a,b
      UNION ALL
      GROUP BY a
      UNION ALL
      GROUP BY ()


* ROLLUP은 컬럼의 순서가 중요하다.
    
        {a,b,c}
        {a,b}
        {a}
        {전체}
        4개의 집계값을 차례로 출력하는 것

In [6]:
# ROLLUP: one statement yields salary sums for (department_id, job_id,
# manager_id), (department_id, job_id), (department_id) and the grand total.
df = pd.read_sql(
    sql="""
        SELECT department_id, job_id, manager_id, sum(salary)
        FROM employees
        GROUP BY ROLLUP(department_id, job_id, manager_id)
        """,
    con=xedb,
)
print(df)

    DEPARTMENT_ID      JOB_ID  MANAGER_ID  SUM(SALARY)
0             NaN      SA_REP       149.0         7000
1             NaN      SA_REP         NaN         7000
2             NaN        None         NaN         7000
3            10.0     AD_ASST       101.0         4400
4            10.0     AD_ASST         NaN         4400
5            10.0        None         NaN         4400
6            20.0      MK_MAN       100.0        13000
7            20.0      MK_MAN         NaN        13000
8            20.0      MK_REP       201.0         6000
9            20.0      MK_REP         NaN         6000
10           20.0        None         NaN        19000
11           30.0      PU_MAN       100.0        11000
12           30.0      PU_MAN         NaN        11000
13           30.0    PU_CLERK       114.0        13900
14           30.0    PU_CLERK         NaN        13900
15           30.0        None         NaN        24900
16           40.0      HR_REP       101.0         6500
17        

## 6-5 CUBE
* rollup기능 포함, 가능한 모든 조합을 집계

        {a,b,c}
        {a,b}
        {a}
        {전체}
        + {b}, {c}, {a,c}, {b,c}
* GROUP BY 절의 UNION ALL과 같다  
      GROUP BY CUBE(a, b, c) 는
 
      GROUP BY ()
      UNION ALL
      GROUP BY c
      UNION ALL
      GROUP BY b
      UNION ALL
      GROUP BY b,c
      UNION ALL
      GROUP BY a
      UNION ALL
      GROUP BY a,c
      UNION ALL
      GROUP BY a,b
      UNION ALL
      GROUP BY a,b,c
* CUBE는 컬럼의 순서가 중요하지 않다.

In [7]:
# CUBE: salary sums for every combination of department_id, job_id and
# manager_id, including the grand total.
df = pd.read_sql(
    sql="""
        SELECT department_id, job_id, manager_id, sum(salary)
        FROM employees
        GROUP BY CUBE(department_id, job_id, manager_id)
        """,
    con=xedb,
)
print(df)

     DEPARTMENT_ID      JOB_ID  MANAGER_ID  SUM(SALARY)
0              NaN        None         NaN         7000
1              NaN        None         NaN        24000
2              NaN        None         NaN       691416
3              NaN        None       100.0       155400
4              NaN        None       101.0        44916
5              NaN        None       102.0         9000
6              NaN        None       103.0        19800
7              NaN        None       108.0        39600
8              NaN        None       114.0        13900
9              NaN        None       120.0        22100
10             NaN        None       121.0        25400
11             NaN        None       122.0        23600
12             NaN        None       123.0        25900
13             NaN        None       124.0        23000
14             NaN        None       145.0        51000
15             NaN        None       146.0        51000
16             NaN        None       147.0      

## 6-6 GROUPING SETS
* 불필요한 집계값 말고, {a,b}, {a,c}와 같이 원하는 집계값만 구할 때 사용

In [8]:
# GROUPING SETS: request only the two aggregations of interest —
# (department_id, job_id) and (department_id, manager_id).
df = pd.read_sql(
    sql="""
        SELECT department_id, job_id, manager_id, sum(salary)
        FROM employees
        GROUP BY GROUPING SETS( (department_id, job_id), (department_id,manager_id) )
        """,
    con=xedb,
)
print(df)

    DEPARTMENT_ID      JOB_ID  MANAGER_ID  SUM(SALARY)
0           110.0  AC_ACCOUNT         NaN         8300
1            90.0       AD_VP         NaN        34000
2            50.0    ST_CLERK         NaN        55700
3            80.0      SA_REP         NaN       243500
4            50.0      ST_MAN         NaN        36400
5            80.0      SA_MAN         NaN        61000
6           110.0      AC_MGR         NaN        12008
7            90.0     AD_PRES         NaN        24000
8            60.0     IT_PROG         NaN        28800
9           100.0      FI_MGR         NaN        12008
10           30.0    PU_CLERK         NaN        13900
11           50.0    SH_CLERK         NaN        64300
12           20.0      MK_MAN         NaN        13000
13          100.0  FI_ACCOUNT         NaN        39600
14            NaN      SA_REP         NaN         7000
15           70.0      PR_REP         NaN        10000
16           30.0      PU_MAN         NaN        11000
17        

In [9]:
# GROUPING SETS with an extra empty set: () stands for the whole table,
# so a grand-total row is added to the two requested aggregations.
df = pd.read_sql(
    sql="""
        SELECT department_id, job_id, manager_id, sum(salary)
        FROM employees
        GROUP BY GROUPING SETS((department_id, job_id), (department_id,manager_id),() )
        """,
    con=xedb,
)
print(df)

    DEPARTMENT_ID      JOB_ID  MANAGER_ID  SUM(SALARY)
0           110.0  AC_ACCOUNT         NaN         8300
1            90.0       AD_VP         NaN        34000
2            50.0    ST_CLERK         NaN        55700
3            80.0      SA_REP         NaN       243500
4            50.0      ST_MAN         NaN        36400
5            80.0      SA_MAN         NaN        61000
6           110.0      AC_MGR         NaN        12008
7            90.0     AD_PRES         NaN        24000
8            60.0     IT_PROG         NaN        28800
9           100.0      FI_MGR         NaN        12008
10           30.0    PU_CLERK         NaN        13900
11           50.0    SH_CLERK         NaN        64300
12           20.0      MK_MAN         NaN        13000
13          100.0  FI_ACCOUNT         NaN        39600
14            NaN      SA_REP         NaN         7000
15           70.0      PR_REP         NaN        10000
16           30.0      PU_MAN         NaN        11000
17        

***
# [ 연습문제 ]
***
### (1)  union 연산자 없이 union all를 이용하여 다음 문장을 수정하세요.
* union은 sort 작업이 발생하기 때문에, 중복이 생기지 않도록 쿼리를 작성하고 union all을 사용하기
* 첫번째 쿼리: 모든 사원 (부서가 없는 사원 포함) / 두번째 쿼리: 모든 부서 (소속사원이 없는 부서 포함)
    
      SELECT e.employee_id, e.last_name, d.department_name
      FROM employees e, departments d
      WHERE e.department_id = d.department_id(+)

      UNION

      SELECT e.employee_id, e.last_name, d.department_name
      FROM employees e, departments d
      WHERE e.department_id(+) = d.department_id

####  < UNION   =    UNION ALL    +    NOT EXISTS  >

In [10]:
# UNION rewritten as UNION ALL + NOT EXISTS.
# First branch: every employee, outer-joined to departments so employees
# without a department are kept. Second branch: only departments that have
# no employee at all, so the two branches cannot overlap.
df = pd.read_sql(
    sql="""
        SELECT e.employee_id, e.last_name, d.department_name
        FROM employees e, departments d
        WHERE e.department_id = d.department_id(+)
        
        UNION ALL
        
        SELECT NULL, NULL, d.department_name
        FROM departments d
        WHERE NOT EXISTS (SELECT 'x' 
                          FROM employees e 
                          WHERE e.department_id = d.department_id)
        """,
    con=xedb,
)
print(df)

     EMPLOYEE_ID   LAST_NAME       DEPARTMENT_NAME
0          200.0      Whalen        Administration
1          202.0         Fay             Marketing
2          201.0   Hartstein             Marketing
3          119.0  Colmenares            Purchasing
4          118.0      Himuro            Purchasing
5          117.0      Tobias            Purchasing
6          116.0       Baida            Purchasing
7          115.0        Khoo            Purchasing
8          114.0    Raphaely            Purchasing
9          203.0      Mavris       Human Resources
10         199.0       Grant              Shipping
11         198.0    OConnell              Shipping
12         197.0      Feeney              Shipping
13         196.0       Walsh              Shipping
14         195.0       Jones              Shipping
15         194.0      McCain              Shipping
16         193.0     Everett              Shipping
17         192.0        Bell              Shipping
18         191.0     Perkins   

#### < OUTER JOIN >

In [11]:
# Same result expressed with an ANSI FULL OUTER JOIN: unmatched rows from
# both employees and departments are kept.
df = pd.read_sql(
    sql="""
        SELECT e.employee_id, e.last_name, d.department_name
        FROM employees e 
        FULL OUTER JOIN departments d
        ON e.department_id = d.department_id
        """,
    con=xedb,
)
print(df)

     EMPLOYEE_ID    LAST_NAME       DEPARTMENT_NAME
0          100.0         King             Executive
1          101.0      Kochhar             Executive
2          102.0      De Haan             Executive
3          103.0       Hunold                    IT
4          104.0        Ernst                    IT
5          105.0       Austin                    IT
6          106.0    Pataballa                    IT
7          107.0      Lorentz                    IT
8          108.0    Greenberg               Finance
9          109.0       Faviet               Finance
10         110.0         Chen               Finance
11         111.0      Sciarra               Finance
12         112.0        Urman               Finance
13         113.0         Popp               Finance
14         114.0     Raphaely            Purchasing
15         115.0         Khoo            Purchasing
16         116.0        Baida            Purchasing
17         117.0       Tobias            Purchasing
18         1

### (2) Intersect 연산자 없이 다음 문장을 수정하세요.
* 중복을 사용하지 않고 교집합을 만들기

      SELECT employee_id, job_id
      FROM employees

      INTERSECT

      SELECT employee_id, job_id
      FROM job_history

#### < JOIN >

In [12]:
# INTERSECT rewritten as an equi-join on both employee_id and job_id.
# NOTE(review): unlike INTERSECT, a join does not de-duplicate. If
# job_history can hold the same (employee_id, job_id) pair more than once,
# that pair is returned once per matching row — confirm against the data or
# prefer the EXISTS form.
df = pd.read_sql(
    sql="""
        SELECT e.employee_id, e.job_id
        FROM employees e, job_history j
        WHERE e.employee_id = j.employee_id
        AND e.job_id = j.job_id
        """,
    con=xedb,
)
print(df)

   EMPLOYEE_ID   JOB_ID
0          200  AD_ASST
1          176   SA_REP


#### < EXISTS >

In [13]:
# INTERSECT rewritten with a correlated EXISTS: keep an employee row only
# when job_history contains the same employee_id and job_id.
df = pd.read_sql(
    sql="""
        SELECT employee_id, job_id
        FROM employees e
        WHERE EXISTS (SELECT 'x' 
                      FROM job_history 
                      WHERE employee_id = e.employee_id 
                      AND job_id = e.job_id)
        """,
    con=xedb,
)
print(df)

   EMPLOYEE_ID   JOB_ID
0          200  AD_ASST
1          176   SA_REP


### (3) Minus 연산자 없이 다음 문장을 수정하세요
* 중복을 사용하지 않고 A-B 차집합 만들기

       SELECT employee_id, job_id
       FROM employees
          
       MINUS

       SELECT employee_id, job_id
       FROM job_history

#### < NOT EXISTS >

In [14]:
# MINUS rewritten with a correlated NOT EXISTS: keep an employee row only
# when job_history has no row with the same employee_id and job_id.
df = pd.read_sql(
    sql="""
        SELECT employee_id, job_id
        FROM employees e
        WHERE NOT EXISTS (SELECT 'x' 
                          FROM job_history 
                          WHERE employee_id = e.employee_id 
                          AND job_id = e.job_id)
        """,
    con=xedb,
)
print(df)

     EMPLOYEE_ID      JOB_ID
0            148      SA_MAN
1            118    PU_CLERK
2            136    ST_CLERK
3            117    PU_CLERK
4            122      ST_MAN
5            158      SA_REP
6            175      SA_REP
7            142    ST_CLERK
8            101       AD_VP
9            168      SA_REP
10           115    PU_CLERK
11           131    ST_CLERK
12           173      SA_REP
13           193    SH_CLERK
14           105     IT_PROG
15           111  FI_ACCOUNT
16           205      AC_MGR
17           183    SH_CLERK
18           167      SA_REP
19           146      SA_MAN
20           186    SH_CLERK
21           149      SA_MAN
22           104     IT_PROG
23           124      ST_MAN
24           206  AC_ACCOUNT
25           171      SA_REP
26           174      SA_REP
27           128    ST_CLERK
28           134    ST_CLERK
29           164      SA_REP
..           ...         ...
75           103     IT_PROG
76           145      SA_MAN
77           1

In [15]:
# Release database resources: close the cursor first, then the connection
# it was created from.
cur.close()
xedb.close()