## Data Validation for DVD Rental Dataset

This notebook loads all of the data sets used in this SQL tool and quickly QAs them to make sure the level of data dirtiness is as expected. Any unexpected irregularities should be noted and fixed accordingly.

In [1]:
%%capture
import pandas as pd
import sqlalchemy
import psycopg2
import sqlite3

%reload_ext sql
%config SqlMagic.displaylimit = 10
%sql sqlite://
    
%store -r actor
%store -r film_actor
%store -r customer
%store -r film_category
%store -r films
%store -r language
%store -r rental

%sql persist actor
%sql persist film_actor
%sql persist customer
%sql persist film_category
%sql persist films
%sql persist language
%sql persist rental

### Actor Dataset
* Looks at overall data schema and performs basic distinct counts
* Update dates can be all the same time or different (assume that actors are manually maintained by an intern heh)
* Expect dirty data (duplicate actors in the system)

In [2]:
%%sql
select * from actor

 * sqlite://
Done.


index,actor_id,first_name,last_name,last_update
0,1,Penelope,Guiness,2/15/06 10:05
1,2,Nick,Wahlberg,2/15/06 10:05
2,3,Ed,Chase,2/15/06 10:05
3,4,Jennifer,Davis,2/15/06 10:05
4,5,Johnny,Lollobrigida,2/15/06 10:05
5,6,Bette,Nicholson,2/15/06 10:05
6,7,Grace,Mostel,2/15/06 10:05
7,8,Matthew,Johansson,2/15/06 10:05
8,9,Joe,Swank,2/15/06 10:05
9,10,Christian,Gable,2/15/06 10:05


In [3]:
%%sql
select count(*) from actor

 * sqlite://
Done.


count(*)
200


In [4]:
%%sql
-- Would be good if we had more than one duplicate
select sum(1) from (select distinct first_name, last_name from actor) unique_actors

 * sqlite://
Done.


sum(1)
199


In [5]:
%%sql
select first_name, last_name from actor group by first_name, last_name having count(*) > 1

 * sqlite://
Done.


first_name,last_name
Laura,VerHulst


In [7]:
%%sql
-- Looks like the actor_ids are messed up!
select * from actor where first_name = 'Laura' and last_name = 'VerHulst'

 * sqlite://
Done.


index,actor_id,first_name,last_name,last_update
101,101,Laura,VerHulst,2/15/06 10:05
110,110,Laura,VerHulst,2/15/06 10:05


### Film and Actor Mapping Dataset
* Looks at overall data schema and performs basic distinct counts
* Make sure that the film and mapping include our dirty actor

In [9]:
%%sql
-- Note that film_id is a different data type than actor_id
select * from film_actor

 * sqlite://
Done.


index,actor_id,film_id,last_update
0,1,1.0,2006-02-15 10:05:03
1,1,23.0,2006-02-15 10:05:03
2,1,25.0,2006-02-15 10:05:03
3,1,106.0,2006-02-15 10:05:03
4,1,140.0,2006-02-15 10:05:03
5,1,166.0,2006-02-15 10:05:03
6,1,277.0,2006-02-15 10:05:03
7,1,361.0,2006-02-15 10:05:03
8,1,438.0,2006-02-15 10:05:03
9,1,499.0,2006-02-15 10:05:03


In [12]:
%%sql
select sum(1) from film_actor

 * sqlite://
Done.


sum(1)
5462


In [11]:
%%sql
select count(distinct film_id) from film_actor

 * sqlite://
Done.


count(distinct film_id)
997


In [10]:
%%sql
select count(distinct actor_id) from film_actor

 * sqlite://
Done.


count(distinct actor_id)
200


In [13]:
%%sql
select sum(1) from (select actor_id, film_id from film_actor group by actor_id, film_id)

 * sqlite://
Done.


sum(1)
5462


In [14]:
%%sql
select actor_id, film_id from film_actor group by actor_id, film_id having count(*) > 1

 * sqlite://
Done.


actor_id,film_id


In [15]:
%%sql
select * from film_actor where actor_id in (110, 101)

 * sqlite://
Done.


index,actor_id,film_id,last_update
2686,101,60.0,2006-02-15 10:05:03
2687,101,66.0,2006-02-15 10:05:03
2688,101,85.0,2006-02-15 10:05:03
2689,101,146.0,2006-02-15 10:05:03
2690,101,189.0,2006-02-15 10:05:03
2691,101,250.0,2006-02-15 10:05:03
2692,101,255.0,2006-02-15 10:05:03
2693,101,263.0,2006-02-15 10:05:03
2694,101,275.0,2006-02-15 10:05:03
2695,101,289.0,2006-02-15 10:05:03


In [25]:
%%sql
select 
  min(num_actors) as min_num_actors,
  avg(num_actors) as avg_num_actors,
  max(num_actors) as max_num_actors
from
  (select film_id, count(distinct actor_id) as num_actors from film_actor group by film_id) actor_count

 * sqlite://
Done.


min_num_actors,avg_num_actors,max_num_actors
1,5.4784353059177535,15


### Rental Dataset
* Describes the rental activity of each DVD we have
* last_update field can be the same because we have the intern
* Looking for dirty data where return_date < rental_date

In [26]:
%%sql
select * from rental

 * sqlite://
Done.


index,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
0,2,5/24/05 22:54,1525,459,5/28/05 19:40,1,2/16/06 2:30
1,3,5/24/05 23:03,1711,408,6/1/05 22:12,1,2/16/06 2:30
2,4,5/24/05 23:04,2452,333,6/3/05 1:43,2,2/16/06 2:30
3,5,5/24/05 23:05,2079,222,6/2/05 4:33,1,2/16/06 2:30
4,6,5/24/05 23:08,2792,549,5/27/05 1:32,1,2/16/06 2:30
5,7,5/24/05 23:11,3995,269,5/29/05 20:34,2,2/16/06 2:30
6,8,5/24/05 23:31,2346,239,5/27/05 23:33,2,2/16/06 2:30
7,9,5/25/05 0:00,2580,126,5/28/05 0:22,1,2/16/06 2:30
8,10,5/25/05 0:02,1824,399,5/31/05 22:44,2,2/16/06 2:30
9,11,5/25/05 0:09,4443,142,6/2/05 20:56,2,2/16/06 2:30


In [29]:
%%sql
select * from rental order by rental_id

 * sqlite://
Done.


index,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
16043,1,5/24/05 22:53,367,130,5/26/05 22:04,1,2/15/06 21:30
0,2,5/24/05 22:54,1525,459,5/28/05 19:40,1,2/16/06 2:30
1,3,5/24/05 23:03,1711,408,6/1/05 22:12,1,2/16/06 2:30
2,4,5/24/05 23:04,2452,333,6/3/05 1:43,2,2/16/06 2:30
3,5,5/24/05 23:05,2079,222,6/2/05 4:33,1,2/16/06 2:30
4,6,5/24/05 23:08,2792,549,5/27/05 1:32,1,2/16/06 2:30
5,7,5/24/05 23:11,3995,269,5/29/05 20:34,2,2/16/06 2:30
6,8,5/24/05 23:31,2346,239,5/27/05 23:33,2,2/16/06 2:30
7,9,5/25/05 0:00,2580,126,5/28/05 0:22,1,2/16/06 2:30
8,10,5/25/05 0:02,1824,399,5/31/05 22:44,2,2/16/06 2:30


In [30]:
%%sql
select count(*) from rental

 * sqlite://
Done.


count(*)
16044


In [31]:
%%sql
select count(distinct rental_id) from rental

 * sqlite://
Done.


count(distinct rental_id)
16044


In [32]:
%%sql
-- Investigating distinct values to determine what identifies a unique row 
select 
    count(distinct rental_id) as rental_id_count,
    count(distinct rental_date) as rental_date_count,
    count(distinct inventory_id) as inventory_id_count, 
    count(distinct customer_id) as customer_id_count,
    count(distinct staff_id) as staff_id_count, 
    count(distinct last_update) as last_update_count
from rental

 * sqlite://
Done.


rental_id_count,rental_date_count,inventory_id_count,customer_id_count,staff_id_count,last_update_count
16044,13319,4580,599,2,3


In [51]:
%%sql
select * from rental

 * sqlite://
Done.


index,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
0,2,5/24/05 22:54,1525,459,5/28/05 19:40,1,2/16/06 2:30
1,3,5/24/05 23:03,1711,408,6/1/05 22:12,1,2/16/06 2:30
2,4,5/24/05 23:04,2452,333,6/3/05 1:43,2,2/16/06 2:30
3,5,5/24/05 23:05,2079,222,6/2/05 4:33,1,2/16/06 2:30
4,6,5/24/05 23:08,2792,549,5/27/05 1:32,1,2/16/06 2:30
5,7,5/24/05 23:11,3995,269,5/29/05 20:34,2,2/16/06 2:30
6,8,5/24/05 23:31,2346,239,5/27/05 23:33,2,2/16/06 2:30
7,9,5/25/05 0:00,2580,126,5/28/05 0:22,1,2/16/06 2:30
8,10,5/25/05 0:02,1824,399,5/31/05 22:44,2,2/16/06 2:30
9,11,5/25/05 0:09,4443,142,6/2/05 20:56,2,2/16/06 2:30


In [64]:
%%sql
select * from rental where rental_date > return_date

 * sqlite://
Done.


index,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
30,32,5/25/05 4:06,3832,230,5/25/05 23:55,1,2/16/06 2:30
186,188,5/26/05 5:47,3342,243,5/26/05 23:48,1,2/16/06 2:30
518,521,5/28/05 3:32,2397,374,5/28/05 22:37,1,2/16/06 2:30
1210,1213,6/15/05 3:14,3191,94,6/15/05 21:41,2,2/16/06 2:30
1554,1557,6/16/05 2:28,2617,439,6/16/05 22:11,2,2/16/06 2:30
1562,1565,6/16/05 3:13,3833,506,6/16/05 22:42,2,2/16/06 2:30
1867,1870,6/17/05 2:24,895,191,6/17/05 23:04,2,2/16/06 2:30
1901,1904,6/17/05 4:45,2472,87,6/17/05 23:56,2,2/16/06 2:30
2211,2214,6/18/05 2:44,2430,366,6/18/05 23:37,2,2/16/06 2:30
2215,2218,6/18/05 3:13,3577,176,6/18/05 21:16,1,2/16/06 2:30


In [62]:
%%sql
pragma table_info(rental);

 * sqlite://
Done.


cid,name,type,notnull,dflt_value,pk
0,index,BIGINT,0,,0
1,rental_id,BIGINT,0,,0
2,rental_date,TEXT,0,,0
3,inventory_id,BIGINT,0,,0
4,customer_id,BIGINT,0,,0
5,return_date,TEXT,0,,0
6,staff_id,BIGINT,0,,0
7,last_update,TEXT,0,,0


In [63]:
%%sql
select 
  strftime('%d', date(rental_date)), 
  count(*) as count 
from 
  rental
group by 
  strftime('%d', rental_date)
order by 
  count(*) desc

 * sqlite://
Done.


"strftime('%d', date(rental_date))",count
,16044


In [None]:
%%sql
select 'rental' as table, min(rental_id) as min_id, max(rental_id) as max_id from rental

UNION

select 'payment' as table, min(rental_id) as min_id, max(rental_id) as max_id from payment

ORDER BY min_id

In [None]:
%%sql
select * from payment where rental_id = 1400

In [None]:
%%sql
select * from rental where rental_id = 1400

In [None]:
%%sql
select count(*) from payment;

In [None]:
%%sql
select count(distinct payment_id) from payment

In [None]:
%%sql
select count(distinct rental_id) from payment

In [None]:
%%sql
select p.rental_id, p.payment_id, p.customer_id, p.amount
from payment p
join 
(select rental_id, COUNT(*) as rental_count
 from payment group by rental_id 
 having COUNT(*) > 1) pr ON p.rental_id = pr.rental_id

In [None]:
%%sql
select rental_id, customer_id from rental where rental_id = 4591

In [None]:
%%sql
select rental_id, customer_id from rental where customer_id = 16

In [None]:
%%sql
select count(distinct customer_id) from rental

In [None]:
%%sql
select customer_id, count(*) as count from rental group by customer_id order by count desc

In [None]:
%%sql
select count(distinct customer_id) from customer

In [None]:
%%sql
select customer_id, count(*) as count from customer group by customer_id order by count desc

In [None]:
%%sql
select distinct activebool from customer

In [None]:
%%sql
select active, count(*) from customer group by active

In [None]:
%%sql
select activebool, count(*) from customer group by activebool

In [None]:
 %%sql
select email, count(customer_id) as count from customer group by email order by count desc

In [None]:
%%sql
-- All of the create dates are the same! More likely the date that this was added to the 
select create_date, count(*) from customer group by create_date

In [None]:
%%sql
select distinct create_date from customer

In [None]:
%%sql
select avg(inv_count) 
from 
    (
    select film_id, count(distinct inventory_id) as inv_count 
    from inventory 
    group by film_id 
    order by inv_count desc
    ) a

In [None]:
%%sql
select distinct first_name from customer order by first_name 

In [None]:
%%sql
select * from payment where payment_date IS NULL

In [None]:
%%sql
select date_trunc('month', payment_date) as month, count(*)
from payment 
group by date_trunc('month', payment_date)

In [None]:
%%sql
select customer_id from customer order by customer_id

In [None]:
%%sql
select COUNT(CASE WHEN upper(description) LIKE '%MOOSE%' THEN 1 END) as moose_count, COUNT(*) from film

In [None]:
%%sql
select count(distinct film_id) from film

In [None]:
%%sql
select special_features from film

In [None]:
%%sql
select distinct special_features from film

In [None]:
%%sql
select * from film

In [None]:
%%sql
select special_features, count(distinct film_id) as count 
from 
    (
    select film_id, unnest(special_features) as special_features from film
    ) a
group by special_features
order by count desc

In [None]:
%%sql
select rating, count(distinct film_id) from film group by rating

In [None]:
%%sql
select rental_rate, count(distinct film_id) from film group by rental_rate

In [None]:
%%sql
select fulltext:'airport'] from film

In [None]:
%%sql
ALTER TABLE film
DROP COLUMN fulltext

In [None]:
%%sql
select * from film

In [None]:
%%sql
select * from film

In [None]:
%%sql
select f.film_id, title, MAX(inventory_id)-MIN(inventory_id)+1 as total_inventory
from film f
join inventory i on f.film_id = i.film_id
group by f.film_id, title
order by total_inventory desc

In [None]:
%%sql
select f.film_id, title, inventory_id
from film f
left join inventory i on f.film_id = i.film_id
where inventory_id IS NULL

In [None]:
%%sql
select count(distinct film_id) from inventory

In [None]:
%%sql
select f.film_id, title, inventory_id
from film f
join inventory i on f.film_id = i.film_id

In [None]:
%%sql
select count(distinct case when inventory_id is null then f.film_id end)
from film f
left join inventory i on f.film_id = i.film_id

In [None]:
%%sql
select count(distinct f.film_id)
from film f
left join inventory i on f.film_id = i.film_id

In [None]:
# NULLS ARE PRESENT IN THE INVENTORY TABLE
# THERE ARE FILMS IN THE FILM TABLE THAT ARE NOT IN THE INVENTORY TABLE

In [None]:
%%sql
select * from film

In [None]:
%%sql
select * from rental

In [None]:
%%sql
select * from inventory

In [None]:
%%sql
-- Number of rentals per customer
select customer_id, count(*) as count
from rental
group by customer_id 
order by count desc

In [None]:
%%sql
select inventory_id, film_id from inventory i
join film f on i.film_id = f.film_id