# Song Data Cleaning  
Steps followed to build a clean list of songs from the `staging_songs` table.  

In [40]:
%load_ext sql
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
HOST              = config.get('CLUSTER','HOST')
DB_NAME           = config.get('CLUSTER','DB_NAME')
DB_USER           = config.get('CLUSTER','DB_USER')
DB_PASSWORD       = config.get('CLUSTER','DB_PASSWORD')
DB_PORT           = config.get('CLUSTER','DB_PORT')
DB_NAME           = config.get('CLUSTER','DB_NAME')

conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT, DB_NAME)
print(conn_string)
%sql $conn_string

The sql extension is already loaded. To reload it, use:
  %reload_ext sql
postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

# Understanding Song list

## The num_songs attribute appears to be irrelevant (it is always 1)

In [33]:
%%sql
-- How many staging rows carry each num_songs value.
SELECT num_songs, COUNT(*)
FROM staging_songs
GROUP BY num_songs
LIMIT 10

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


num_songs,count
1,385252


## Song IDs with multiple titles  

In [42]:
%%sql
-- Sample of staging rows whose song_id is attached to more than one distinct title.
SELECT
    stg.song_id,
    stg.num_songs,
    stg.title,
    stg.artist_name,
    stg.year,
    stg.duration
FROM staging_songs AS stg
WHERE stg.song_id IN (
    SELECT dup.song_id
    FROM staging_songs AS dup
    GROUP BY dup.song_id
    HAVING COUNT(DISTINCT dup.title) > 1
)
ORDER BY stg.song_id
LIMIT 10

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


song_id,num_songs,title,artist_name,year,duration
SOAAEFC12AB01852F1,1,De Tongbreker (Tineke Schouten & Linda de Mol),Tineke Schouten,0,219.37587
SOAAEFC12AB01852F1,1,De Tongbreker,Tineke Schouten/Linda De Mol/Franklin Brown,0,219.61098
SOAQUBQ12A67ADE359,1,Smile (Radio Edit),Lily Allen,2006,194.53342
SOAQUBQ12A67ADE359,1,Smile,Lily Allen,2006,194.89914
SOAUFOF12AB0180C65,1,Seven Nation Army,The White Stripes,2003,232.61995
SOAUFOF12AB0180C65,1,Seven Nation Army (Album Version),The White Stripes,2003,231.81016
SOBHHUS12A58A78589,1,Once,Harry Connick_ Jr.,0,364.64281
SOBHHUS12A58A78589,1,The One That Got Away,Bon Jovi,2004,287.79057
SOBKTKO12AB01857FD,1,Jolene [Live],The White Stripes,2010,234.97098
SOBKTKO12AB01857FD,1,Jolene (Live),The White Stripes,2010,235.20608


## Song IDs with multiple artist names

In [43]:
%%sql
-- Sample of staging rows whose song_id is attached to more than one distinct artist name.
SELECT
    stg.song_id,
    stg.num_songs,
    stg.title,
    stg.artist_name,
    stg.year,
    stg.duration
FROM staging_songs AS stg
WHERE stg.song_id IN (
    SELECT dup.song_id
    FROM staging_songs AS dup
    GROUP BY dup.song_id
    HAVING COUNT(DISTINCT dup.artist_name) > 1
)
ORDER BY stg.song_id
LIMIT 10

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


song_id,num_songs,title,artist_name,year,duration
SOAAEFC12AB01852F1,1,De Tongbreker (Tineke Schouten & Linda de Mol),Tineke Schouten,0,219.37587
SOAAEFC12AB01852F1,1,De Tongbreker,Tineke Schouten/Linda De Mol/Franklin Brown,0,219.61098
SOBHHUS12A58A78589,1,Once,Harry Connick_ Jr.,0,364.64281
SOBHHUS12A58A78589,1,The One That Got Away,Bon Jovi,2004,287.79057
SOBYBQI12AB0189F89,1,The Poison,All American Rejects,2010,233.87383
SOBYBQI12AB0189F89,1,The Poison,The All-American Rejects,2010,233.40363
SOCOOQD12A6D4FAC53,1,Naïve,Lily Allen,2007,224.57424
SOCOOQD12A6D4FAC53,1,Naive,Lilly Allen,0,224.44363
SOEHWGF12A6D4F8B2B,1,Hips Don't Lie,Shakira ft. Wyclef Jean,2006,215.90159
SOEHWGF12A6D4F8B2B,1,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean,0,217.36444


## The same title can belong to different artists

In [52]:
%%sql
-- Page through staging rows whose title is used by more than one distinct artist name.
SELECT
    stg.song_id,
    stg.num_songs,
    stg.title,
    stg.artist_name,
    stg.year,
    stg.duration
FROM staging_songs AS stg
WHERE stg.title IN (
    SELECT shared.title
    FROM staging_songs AS shared
    GROUP BY shared.title
    HAVING COUNT(DISTINCT shared.artist_name) > 1
)
ORDER BY stg.title
LIMIT 10 OFFSET 210

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


song_id,num_songs,title,artist_name,year,duration
SOCUDAB12A6D4F8E18,1,1974,O Terco,1974,745.0901599999999
SOOWZCB12A8C136994,1,1974,Wilki,2004,220.57751
SOZUZVY12A6D4F8C98,1,1975,Gene Clark,1971,227.57832
SOYANGW12A8AE45E56,1,1975,Clovis,2003,183.84934
SOQDPRP12A8C1401F6,1,1978,Sébastien Schuller,2005,236.66893
SOHYNOY12AB017FD89,1,1978,Salim Nourallah,2004,173.322
SOECUPQ12AB017B1BB,1,1980,Torsten Goods,0,333.58322
SODUXBU12AB01893C1,1,1980,Frozen Plasma,2006,292.44036
SOIJRZL12A8C13E4D3,1,1980,Rx Bandits,2006,183.03955
SOYUTPK12AB01820F7,1,1980,Rehab / Steaknife,0,199.99302


## The natural key is artist name and title
Streams (songplays) will reference a song by its artist name and title.

In [55]:
%%sql
-- Page through staging rows whose (artist_name, title) pair occurs more than once.
WITH artist_song AS (
    SELECT
        artist_name,
        title,
        COUNT(*) AS count
    FROM staging_songs
    GROUP BY artist_name, title
    HAVING COUNT(*) > 1
)
SELECT
    stg.song_id,
    stg.num_songs,
    stg.title,
    stg.artist_name,
    stg.year,
    stg.duration
FROM staging_songs AS stg
WHERE EXISTS (
    SELECT 1
    FROM artist_song AS pair
    WHERE pair.title = stg.title
      AND pair.artist_name = stg.artist_name
)
ORDER BY stg.title, stg.artist_name
LIMIT 10 OFFSET 2100

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


song_id,num_songs,title,artist_name,year,duration
SOCCWZW12AB01829DB,1,Forever & Always,Taylor Swift,2008,267.49342
SOMPTCI12AB017C416,1,Forever & Always,Taylor Swift,2008,224.9922
SOTNWCI12AAF3B2028,1,Forever & Always,Taylor Swift,2008,246.38649
SOCCWZW12AB01829DB,1,Forever & Always,Taylor Swift,2008,267.49342
SOJQCPX12AB01898C3,1,Forever Blue,Little River Band,1995,286.71955
SOVEBEM12A6D4F7F50,1,Forever Blue,Little River Band,1995,306.07628
SOZSSJS12AF72A1230,1,Forever Blue,Swing Out Sister,1989,258.58567
SOTXKZS12A6D4F67A3,1,Forever Blue,Swing Out Sister,1989,254.14485
SOCWGBF12AB018210B,1,Forever Yellow Skies,The Cranberries,1996,289.41016
SOTJFUK12A6D4F7B5D,1,Forever Yellow Skies,The Cranberries,1996,253.962


## The primary key will be title and artist_name
The dimension table will be named `song_titles`:
* Song id is ignored
* Pick latest year (max)
* Pick largest duration  (max)

In [67]:
%%sql
-- Number of rows the song dimension will hold, one per (title, artist_name) pair.
SELECT COUNT(*)
FROM (
    SELECT
        stg.title,
        stg.artist_name,
        MAX(stg.year) AS year,
        MAX(stg.duration) AS duration
    FROM staging_songs AS stg
    GROUP BY stg.title, stg.artist_name
) AS deduplicated_songs

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
380832


## Dimension Song titles    

In [86]:
%%sql

DROP TABLE IF EXISTS song_titles;
CREATE TABLE IF NOT EXISTS song_titles
(
    -- Natural key of the dimension is the pair (artist_name, title).
    artist_name VARCHAR(1000) NOT NULL,
    title       VARCHAR(1000) NOT NULL,
    year        INT NOT NULL,
    -- Duration is in fractional seconds. A bare DECIMAL means DECIMAL(18,0) in Redshift
    -- and would drop the fraction, so precision and scale are spelled out.
    -- Any load into this column must keep the scale as well, a bare decimal cast would round first.
    duration    DECIMAL(10,5) NOT NULL,
    -- Redshift does not enforce primary keys, the constraint only informs the query planner.
    PRIMARY KEY (artist_name, title)
)
DISTSTYLE KEY
DISTKEY (title)
SORTKEY (artist_name, title)
;

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
Done.
Done.


[]

In [88]:
%%sql
-- Load one row per (artist_name, title) pair, keeping the latest year and the longest duration.
INSERT INTO song_titles (artist_name, title, year, duration)
SELECT
    stg.artist_name,
    stg.title,
    MAX(stg.year)::INT AS year,
    -- Cast with an explicit scale. A bare decimal cast means DECIMAL(18,0) in Redshift
    -- and would round the duration to whole seconds before it is stored.
    MAX(stg.duration)::DECIMAL(10,5) AS duration
FROM staging_songs AS stg
GROUP BY stg.artist_name, stg.title

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
380832 rows affected.


[]