In [1]:
%load_ext sql
import os
from sqlalchemy import create_engine

pgconfig = {
    'host': 'db',
    'port': os.environ['PG_PORT'],
    'database': os.environ['PG_DATABASE'],
    'user': os.environ['PG_USER'],
    'password': os.environ['PG_PASSWORD'],
}
dsl = 'postgresql://{user}:{password}@{host}:{port}/{database}'.format(**pgconfig)
conn = create_engine(dsl)

# MagicコマンドでSQLを書くための設定
%sql conn

In [2]:
%%sql
-- Rebuild the demo table from scratch so that this cell can be re-run.
DROP TABLE IF EXISTS Products;

CREATE TABLE Products (
    id    INTEGER     PRIMARY KEY,
    name  VARCHAR(16) NOT NULL,
    price INTEGER     NOT NULL
);

-- Sample rows. Ids 2, 3 and 4 carry the same name and price, so they are
-- duplicates of one another.
INSERT INTO Products (id, name, price) VALUES (1, 'りんご', 50);
INSERT INTO Products (id, name, price) VALUES (2, 'みかん', 100);
INSERT INTO Products (id, name, price) VALUES (3, 'みかん', 100);
INSERT INTO Products (id, name, price) VALUES (4, 'みかん', 100);
INSERT INTO Products (id, name, price) VALUES (5, 'バナナ', 80);

*  postgresql://padawan:***@db:5432/dsdojo_db
Done.
Done.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.
1 rows affected.


[]

## やりたいこと
+ name と price の組み合わせが重複しているレコードだけを抽出したい

In [40]:
%%sql
-- Correlated subquery, variant 1.
-- A row is reported when its id is lower than the largest id among the rows
-- sharing its name and price, so the highest id of each group is left out.
SELECT p1.*
FROM Products AS p1
WHERE p1.id < (
    SELECT MAX(p2.id)
    FROM Products AS p2
    WHERE p2.name = p1.name
      AND p2.price = p1.price
)

*  postgresql://padawan:***@db:5432/dsdojo_db
2 rows affected.


id,name,price
2,みかん,100
3,みかん,100


In [41]:
%%sql
-- Correlated subquery, variant 2.
-- A row is reported only if some other row with the same name and price
-- has a larger id. EXISTS only tests for presence, so the inner select list
-- is a constant.
SELECT p1.*
FROM Products AS p1
WHERE EXISTS (
    SELECT 1
    FROM Products AS p2
    WHERE p2.name = p1.name
      AND p2.price = p1.price
      AND p2.id > p1.id
)

*  postgresql://padawan:***@db:5432/dsdojo_db
2 rows affected.


id,name,price
2,みかん,100
3,みかん,100


In [58]:
%%sql
-- Set difference with EXCEPT.
-- Start from every id and remove the largest id of each name and price group.
-- The ids that remain are the rows reported.
SELECT p.*
FROM Products AS p
WHERE p.id IN (
    SELECT every_row.id
    FROM Products AS every_row
    EXCEPT
    SELECT MAX(grouped.id)
    FROM Products AS grouped
    GROUP BY grouped.name, grouped.price
)

*  postgresql://padawan:***@db:5432/dsdojo_db
2 rows affected.


id,name,price
2,みかん,100
3,みかん,100


In [52]:
%%sql
-- NOT IN approach.
-- Report every row whose id is not the largest id of its name and price group.
SELECT p.*
FROM Products AS p
WHERE p.id NOT IN (
    SELECT MAX(grouped.id)
    FROM Products AS grouped
    GROUP BY grouped.name, grouped.price
)

*  postgresql://padawan:***@db:5432/dsdojo_db
2 rows affected.


id,name,price
2,みかん,100
3,みかん,100


In [5]:
%%sql
-- GROUP BY and HAVING approach.
-- Find the name and price pairs that occur more than once, then report every
-- row carrying one of those pairs. No member of a duplicated group is left out.
SELECT p.*
FROM Products AS p
WHERE (p.name, p.price) IN (
    SELECT grouped.name, grouped.price
    FROM Products AS grouped
    GROUP BY grouped.name, grouped.price
    HAVING COUNT(*) > 1
)

*  postgresql://padawan:***@db:5432/dsdojo_db
3 rows affected.


id,name,price
2,みかん,100
3,みかん,100
4,みかん,100


In [7]:
%%sql
-- Window function approach.
-- Number the rows inside each name and price group by ascending id and report
-- everything after the first, so the lowest id of each group is left out.
-- The row number is aliased where it is computed instead of through a
-- positional column list on the derived table. A positional list is tied to
-- the column order of the star expansion and would label the wrong column as
-- row_id if Products ever gained another column.
SELECT *
FROM (
    SELECT
        p.*,
        ROW_NUMBER() OVER (
            PARTITION BY p.name, p.price
            ORDER BY p.id
        ) AS row_id
    FROM Products AS p
) AS numbered
WHERE row_id <> 1

*  postgresql://padawan:***@db:5432/dsdojo_db
2 rows affected.


id,name,price,row_id
3,みかん,100,2
4,みかん,100,3
