# User Data Cleaning  
Steps followed to derive a clean list of users from the `staging_events` table.

In [2]:
%load_ext sql
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
HOST              = config.get('CLUSTER','HOST')
DB_NAME           = config.get('CLUSTER','DB_NAME')
DB_USER           = config.get('CLUSTER','DB_USER')
DB_PASSWORD       = config.get('CLUSTER','DB_PASSWORD')
DB_PORT           = config.get('CLUSTER','DB_PORT')
DB_NAME           = config.get('CLUSTER','DB_NAME')

conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT, DB_NAME)

%sql $conn_string

'Connected: dwhuser@dwh'

## User Id is always valid

In [5]:
%%sql

SELECT count(*)
FROM staging_events
-- A comparison against NULL with = or != evaluates to unknown and is never
-- true, so a predicate such as userid != NULL matches no rows regardless of
-- the data. IS NULL is the correct test. A count of 0 here means that no
-- event has a NULL user id.
WHERE userid IS NULL


 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
0


## A user can have different levels across their streaming history

In [13]:
%%sql
WITH multi_level_users AS (
    -- users whose NextSong events carry more than one distinct level
    SELECT userId
    FROM staging_events
    WHERE page = 'NextSong'
    GROUP BY userId
    HAVING COUNT(DISTINCT level) > 1
)
SELECT DISTINCT
    userId,
    firstName,
    lastName,
    level
FROM staging_events
WHERE page = 'NextSong'
  AND userId IN (SELECT userId FROM multi_level_users)
ORDER BY userId
LIMIT 10

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


userid,firstname,lastname,level
15,Lily,Koch,free
15,Lily,Koch,paid
16,Rylan,George,paid
16,Rylan,George,free
29,Jacqueline,Lynch,paid
29,Jacqueline,Lynch,free
36,Matthew,Jones,free
36,Matthew,Jones,paid
49,Chloe,Cuevas,free
49,Chloe,Cuevas,paid


## We will use the values from the latest stream event

In [21]:
%%sql

DROP TABLE IF EXISTS users;

CREATE TABLE IF NOT EXISTS users (
    -- one row per user, keyed and sorted by the user id
    user_id    INTEGER NOT NULL PRIMARY KEY SORTKEY,
    first_name VARCHAR NOT NULL,
    last_name  VARCHAR NOT NULL,
    gender     VARCHAR NOT NULL,
    -- level as of the most recent event that is loaded for the user
    level      VARCHAR NOT NULL
)
DISTSTYLE ALL;

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.


[]

In [23]:
%%sql
INSERT INTO users (user_id, first_name, last_name, gender, level)
WITH ranked_song_plays AS (
    SELECT
        userId::int AS user_id,
        firstName   AS first_name,
        lastName    AS last_name,
        gender,
        level,
        -- number the NextSong events of each user from newest to oldest by ts
        ROW_NUMBER() OVER (PARTITION BY userId ORDER BY ts DESC) AS recency
    FROM staging_events
    WHERE page = 'NextSong'
)
SELECT
    user_id,
    first_name,
    last_name,
    gender,
    level
FROM ranked_song_plays
-- keep only the newest event per user
WHERE recency = 1

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
96 rows affected.


[]