## Data Validation for DVD Rental Dataset

This notebook loads all of the data sets used in this SQL tool and quickly QAs them to make sure the level of data dirtiness is as expected. Any unexpected irregularities should be noted and fixed accordingly.

In [1]:
%%capture
# Setup: import libraries, configure the %sql magic, restore the stored
# variables and persist each of them as a table on the SQL connection.
# %%capture suppresses everything this cell would otherwise print.
import pandas as pd
import sqlalchemy
import psycopg2
import sqlite3

# Load the sql extension, cap displayed result rows at 10, connect to SQLite.
%reload_ext sql
%config SqlMagic.displaylimit = 10
%sql sqlite://
    
# Restore variables that were saved earlier with %store.
# NOTE(review): presumably pandas DataFrames written by another notebook -- confirm.
%store -r actor
%store -r film_actor
%store -r customer
%store -r film_category
%store -r films
%store -r language
%store -r rental

# Persist each restored variable as a table of the same name.
# NOTE(review): later cells query `film`, `payment` and `inventory`, none of
# which is persisted here (only `films` is) -- confirm how those get loaded.
%sql persist actor
%sql persist film_actor
%sql persist customer
%sql persist film_category
%sql persist films
%sql persist language
%sql persist rental

### Actor Dataset
* Looks at overall data schema and performs basic distinct counts
* Update dates can be all the same time or different (assume that actors are manually maintained by an intern heh)
* Expect dirty data (duplicate actors in the system)

In [2]:
%%sql
-- Preview the actor table to eyeball its columns.
SELECT *
FROM actor

 * sqlite://
Done.


index,actor_id,first_name,last_name,last_update
0,1,Penelope,Guiness,2/15/06 10:05
1,2,Nick,Wahlberg,2/15/06 10:05
2,3,Ed,Chase,2/15/06 10:05
3,4,Jennifer,Davis,2/15/06 10:05
4,5,Johnny,Lollobrigida,2/15/06 10:05
5,6,Bette,Nicholson,2/15/06 10:05
6,7,Grace,Mostel,2/15/06 10:05
7,8,Matthew,Johansson,2/15/06 10:05
8,9,Joe,Swank,2/15/06 10:05
9,10,Christian,Gable,2/15/06 10:05


In [3]:
%%sql
-- Total number of rows in actor.
SELECT count(*)
FROM actor

 * sqlite://
Done.


count(*)
200


In [7]:
%%sql
-- Number of distinct (first_name, last_name) pairs in actor.
-- Would be good if we had more than one duplicate.
SELECT sum(1)
FROM (
    SELECT DISTINCT first_name, last_name
    FROM actor
) unique_actors

 * sqlite://
Done.


sum(1)
199


In [6]:
%%sql
-- Names that occur on more than one actor row.
SELECT first_name, last_name
FROM actor
GROUP BY first_name, last_name
HAVING count(*) > 1

 * sqlite://
Done.


first_name,last_name
Susan,Davis


In [9]:
%%sql
-- Inspect the duplicated name: looks like the actor_ids are messed up!
SELECT *
FROM actor
WHERE first_name = 'Susan'
  AND last_name = 'Davis'

 * sqlite://
Done.


index,actor_id,first_name,last_name,last_update
101,101,Susan,Davis,2/15/06 10:05
110,110,Susan,Davis,2/15/06 10:05


In [None]:
%%sql
-- Querying rentals: number of rentals per month, latest month first.
-- rental_date is cast to text and its first 7 characters are used as the month key.
-- substr(x, 1, 7) replaces left(x, 7): the setup cell connects to SQLite, which
-- has no left() function, while substr() exists on both SQLite and PostgreSQL.
-- NOTE(review): the 7-character prefix is only a year-month when rental_date is
-- stored as 'YYYY-MM-DD ...' -- confirm against the loaded data.
SELECT substr(cast(rental_date as varchar), 1, 7) AS month, count(*)
FROM rental
GROUP BY substr(cast(rental_date as varchar), 1, 7)
ORDER BY month DESC

In [None]:
%%sql
-- All rentals, ordered by rental_id.
SELECT *
FROM rental
ORDER BY rental_id

In [None]:
%%sql
-- Total number of rows in rental.
SELECT COUNT(*)
FROM rental

In [None]:
%%sql
-- Number of distinct rental_id values; compare with the total row count.
SELECT COUNT(DISTINCT rental_id)
FROM rental

In [None]:
%%sql
-- Distinct-value count for each rental column, to work out which column(s)
-- identify a unique row.
select
    count(distinct rental_id)    as rental_id_count,
    count(distinct rental_date)  as rental_date_count,
    count(distinct inventory_id) as inventory_id_count,
    count(distinct customer_id)  as customer_id_count,
    count(distinct staff_id)     as staff_id_count,
    count(distinct last_update)  as last_update_count
from rental

In [None]:
%%sql
-- Total payment amount per calendar month, oldest month first.
-- NOTE(review): date_trunc() is a PostgreSQL function, but the setup cell
-- connects to SQLite (`%sql sqlite://`) and does not persist a `payment`
-- table -- confirm the intended backend before running.
select 
    date_trunc('month', payment_date) as month,
    sum(amount) as amount
from payment
group by date_trunc('month', payment_date)
order by month 

In [None]:
%%sql


In [None]:
%%sql
-- Every distinct payment_date value, in ascending order.
SELECT DISTINCT payment_date
FROM payment
ORDER BY payment_date

In [None]:
%%sql
-- Number of rentals per day, busiest day first.
-- NOTE(review): date_trunc() is a PostgreSQL function, but the setup cell
-- connects to SQLite (`%sql sqlite://`) -- confirm the intended backend.
select date_trunc('day', rental_date), count(*) as count 
from rental
group by date_trunc('day', rental_date)
order by count desc

In [None]:
%%sql
-- Compare the rental_id range in rental with the rental_id range referenced
-- from payment.
-- "table" is double-quoted because TABLE is a reserved word: unquoted it is a
-- syntax error on SQLite, and the quoted form is also valid on PostgreSQL.
-- The resulting column is still named table.
SELECT 'rental' AS "table", min(rental_id) AS min_id, max(rental_id) AS max_id
FROM rental

UNION

SELECT 'payment' AS "table", min(rental_id) AS min_id, max(rental_id) AS max_id
FROM payment

ORDER BY min_id

In [None]:
%%sql
-- Payment rows recorded against rental 1400.
SELECT *
FROM payment
WHERE rental_id = 1400

In [None]:
%%sql
-- The rental row with rental_id 1400.
SELECT *
FROM rental
WHERE rental_id = 1400

In [None]:
%%sql
-- Total number of rows in payment.
SELECT count(*)
FROM payment;

In [None]:
%%sql
-- Number of distinct payment_id values; compare with the total row count.
SELECT count(distinct payment_id)
FROM payment

In [None]:
%%sql
-- Number of distinct rental_id values referenced from payment.
SELECT count(distinct rental_id)
FROM payment

In [None]:
%%sql
-- Payment rows whose rental_id appears on more than one payment.
SELECT p.rental_id, p.payment_id, p.customer_id, p.amount
FROM payment p
JOIN (
    SELECT rental_id, count(*) AS rental_count
    FROM payment
    GROUP BY rental_id
    HAVING count(*) > 1
) pr
  ON p.rental_id = pr.rental_id

In [None]:
%%sql
-- Which customer is on rental 4591.
SELECT rental_id, customer_id
FROM rental
WHERE rental_id = 4591

In [None]:
%%sql
-- Every rental made by customer 16.
SELECT rental_id, customer_id
FROM rental
WHERE customer_id = 16

In [None]:
%%sql
-- Number of distinct customers that appear in rental.
SELECT count(distinct customer_id)
FROM rental

In [None]:
%%sql
-- Rental count per customer, heaviest renters first.
SELECT customer_id, count(*) AS count
FROM rental
GROUP BY customer_id
ORDER BY count DESC

In [None]:
%%sql
-- Number of distinct customer_id values in customer.
SELECT count(distinct customer_id)
FROM customer

In [None]:
%%sql
-- Row count per customer_id, largest first; any count above 1 is a duplicate id.
SELECT customer_id, count(*) AS count
FROM customer
GROUP BY customer_id
ORDER BY count DESC

In [None]:
%%sql
-- The distinct values taken by the activebool column.
SELECT DISTINCT activebool
FROM customer

In [None]:
%%sql
-- Number of customers for each value of the active column.
SELECT active, count(*)
FROM customer
GROUP BY active

In [None]:
%%sql
-- Number of customers for each value of the activebool column.
SELECT activebool, count(*)
FROM customer
GROUP BY activebool

In [None]:
 %%sql
-- Number of customers per email address, most-shared addresses first.
SELECT email, count(customer_id) AS count
FROM customer
GROUP BY email
ORDER BY count DESC

In [None]:
%%sql
-- Number of customers per create_date.
-- All of the create dates are the same! More likely this is the date the data
-- was added to the database -- TODO confirm.
SELECT create_date, count(*)
FROM customer
GROUP BY create_date

In [None]:
%%sql
-- The distinct create_date values in customer.
SELECT DISTINCT create_date
FROM customer

In [None]:
%%sql
-- Average number of distinct inventory items per film.
-- The inner query carries no ORDER BY: avg() does not depend on row order, so
-- sorting the per-film counts first was wasted work.
SELECT avg(inv_count)
FROM (
    SELECT film_id, count(distinct inventory_id) AS inv_count
    FROM inventory
    GROUP BY film_id
) a

In [None]:
%%sql
-- Distinct customer first names, alphabetically.
SELECT DISTINCT first_name
FROM customer
ORDER BY first_name

In [None]:
%%sql
-- Payment rows that have no payment_date.
SELECT *
FROM payment
WHERE payment_date IS NULL

In [None]:
%%sql
-- Number of payments per calendar month.
-- NOTE(review): date_trunc() is a PostgreSQL function, but the setup cell
-- connects to SQLite (`%sql sqlite://`) -- confirm the intended backend.
select date_trunc('month', payment_date) as month, count(*)
from payment 
group by date_trunc('month', payment_date)

In [None]:
%%sql
-- Every customer_id, in ascending order.
SELECT customer_id
FROM customer
ORDER BY customer_id

In [None]:
%%sql
-- How many film descriptions mention "moose" (case-insensitive), shown next to
-- the total number of films.
SELECT
    count(CASE WHEN upper(description) LIKE '%MOOSE%' THEN 1 END) AS moose_count,
    COUNT(*)
FROM film

In [None]:
%%sql
-- Number of distinct film_id values in film.
SELECT count(distinct film_id)
FROM film

In [None]:
%%sql
-- The special_features value of every film.
SELECT special_features
FROM film

In [None]:
%%sql
-- The distinct special_features values across all films.
SELECT DISTINCT special_features
FROM film

In [None]:
%%sql
-- Preview the film table to eyeball its columns.
SELECT *
FROM film

In [None]:
%%sql
-- Number of distinct films per individual special feature, most common first.
-- The inner query expands special_features with unnest() so each feature gets its own row.
-- NOTE(review): unnest() is a PostgreSQL array function, but the setup cell
-- connects to SQLite (`%sql sqlite://`) -- confirm the intended backend.
select special_features, count(distinct film_id) as count 
from 
    (
    select film_id, unnest(special_features) as special_features from film
    ) a
group by special_features
order by count desc

In [None]:
%%sql
-- Number of distinct films for each rating.
SELECT rating, count(distinct film_id)
FROM film
GROUP BY rating

In [None]:
%%sql
-- Number of distinct films for each rental_rate.
SELECT rental_rate, count(distinct film_id)
FROM film
GROUP BY rental_rate

In [None]:
%%sql
-- Films whose fulltext column mentions 'airport'.
-- The column is cast to text and matched with LIKE so the statement parses on
-- both SQLite and PostgreSQL.
-- NOTE(review): the intended lookup is a guess -- confirm this is what the
-- cell is supposed to show.
SELECT film_id, title, fulltext
FROM film
WHERE cast(fulltext AS text) LIKE '%airport%'

In [None]:
%%sql
-- Drops the fulltext column from film.
-- NOTE(review): this changes the table seen by every later cell, and running
-- it again once the column is gone will presumably fail -- confirm this cell
-- is meant to stay in the notebook.
ALTER TABLE film
DROP COLUMN fulltext

In [None]:
%%sql
-- Preview film again after the fulltext column was dropped.
SELECT *
FROM film

In [None]:
%%sql
-- Full contents of the film table.
SELECT *
FROM film

In [None]:
%%sql
-- Number of inventory copies held per film, most-stocked films first.
-- The joined inventory rows are counted directly:
-- MAX(inventory_id) - MIN(inventory_id) + 1 measures the span of ids, which
-- only equals the number of copies when a film's inventory_ids are contiguous.
SELECT f.film_id, f.title, count(i.inventory_id) AS total_inventory
FROM film f
JOIN inventory i ON f.film_id = i.film_id
GROUP BY f.film_id, f.title
ORDER BY total_inventory DESC

In [None]:
%%sql
-- Films with no matching inventory row: the left join leaves inventory_id NULL for them.
SELECT f.film_id, title, inventory_id
FROM film f
LEFT JOIN inventory i
  ON f.film_id = i.film_id
WHERE inventory_id IS NULL

In [None]:
%%sql
-- Number of distinct films that have at least one inventory row.
SELECT count(distinct film_id)
FROM inventory

In [None]:
%%sql
-- One row per inventory item together with the film it belongs to.
SELECT f.film_id, title, inventory_id
FROM film f
JOIN inventory i
  ON f.film_id = i.film_id

In [None]:
%%sql
-- Number of distinct films that have no inventory row.
SELECT count(distinct case when inventory_id is null then f.film_id end)
FROM film f
LEFT JOIN inventory i
  ON f.film_id = i.film_id

In [None]:
%%sql
-- Number of distinct films overall; the left join keeps films without inventory.
SELECT count(distinct f.film_id)
FROM film f
LEFT JOIN inventory i
  ON f.film_id = i.film_id

In [None]:
# NULL inventory_id values show up when film is left-joined to inventory:
# there are films in the film table that are not in the inventory table.

In [None]:
%%sql
-- Full contents of the film table.
SELECT *
FROM film

In [None]:
%%sql
-- Full contents of the rental table.
SELECT *
FROM rental

In [None]:
%%sql
-- Full contents of the inventory table.
SELECT *
FROM inventory

In [None]:
%%sql
-- Number of rentals per customer, heaviest renters first.
SELECT customer_id, count(*) AS count
FROM rental
GROUP BY customer_id
ORDER BY count DESC

In [None]:
%%sql
-- Inventory items paired with the film they belong to.
-- film_id exists in both joined tables, so it has to be qualified; left
-- unqualified it is an ambiguous column reference and the query is rejected.
SELECT i.inventory_id, i.film_id
FROM inventory i
JOIN film f ON i.film_id = f.film_id