# Artist Data Cleaning  
Steps followed to have a good artist data list from staging_songs   

In [1]:
%load_ext sql
import configparser
config = configparser.ConfigParser()
config.read_file(open('dwh.cfg'))
HOST              = config.get('CLUSTER','HOST')
DB_NAME           = config.get('CLUSTER','DB_NAME')
DB_USER           = config.get('CLUSTER','DB_USER')
DB_PASSWORD       = config.get('CLUSTER','DB_PASSWORD')
DB_PORT           = config.get('CLUSTER','DB_PORT')
DB_NAME           = config.get('CLUSTER','DB_NAME')

conn_string="postgresql://{}:{}@{}:{}/{}".format(DB_USER, DB_PASSWORD, HOST, DB_PORT, DB_NAME)
print(conn_string)
%sql $conn_string

postgresql://dwhuser:Passw0rd@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh


'Connected: dwhuser@dwh'

## Understanding how the artist data is represented - counts

In [47]:
%%sql 
with artists as (
        select distinct 
        artist_id,
        artist_name
        from staging_songs 
    )
select count(1) as artists_count,
count(distinct artist_name) as artist_name_count,
count(distinct artist_id) as artist_id_count
from artists

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


artists_count,artist_name_count,artist_id_count
44317,42061,30542


## There are more Names than IDs 
There must be different names to the same artist entity

In [9]:
%%sql 
select distinct 
a.artist_id,
a.artist_name,
b.artist_name,
b.artist_id
from staging_songs a, staging_songs b
where 
    a.artist_id = b.artist_id
    and a.artist_name <> b.artist_name
order by a.title
limit 10

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


artist_id,artist_name,artist_name_1,artist_id_1
ARYYD9K1187FB44C76,Savoy-Doucet Cajun Band,[re:jazz],ARYYD9K1187FB44C76
ARFBNNS12454A4CE80,Silvestre Dangond & Juancho de La Espriella,Silvestre Dangond & Juancho de La Espriella;Juancho De La Espriella,ARFBNNS12454A4CE80
AR50MHJ1187FB3D015,Lovemakers,The Lovemakers,AR50MHJ1187FB3D015
ARCBD0U1187FB466EF,Nelly / Clipse / Postaboy,Nelly / Anthony Hamilton,ARCBD0U1187FB466EF
ARCBD0U1187FB466EF,Nelly / Clipse / Postaboy,Nelly / St. Lunatics,ARCBD0U1187FB466EF
ARCBD0U1187FB466EF,Nelly / Clipse / Postaboy,Nelly / Avery Storm,ARCBD0U1187FB466EF
ARCBD0U1187FB466EF,Nelly / Clipse / Postaboy,Nelly / Avery Storm / Mase,ARCBD0U1187FB466EF
ARCBD0U1187FB466EF,Nelly / Clipse / Postaboy,Nelly,ARCBD0U1187FB466EF
ARCBD0U1187FB466EF,Nelly / Clipse / Postaboy,Nelly / Cedric The Entertainer,ARCBD0U1187FB466EF
ARCBD0U1187FB466EF,Nelly / Clipse / Postaboy,Nelly / Pharrell Williams,ARCBD0U1187FB466EF


## How many Multiple Names per ID do we have

In [16]:
%%sql 
with artists_multiple_names as (
    with artists as (
        select distinct 
        artist_id,
        artist_name
        from staging_songs 
    )
    select 
        artist_id,
        count(1) as artist_name_count
    from artists
    group by artist_id
    having count(1) > 1
)
select 
count(1) as artist_id_count,
sum(artist_name_count) as artist_name_count
from artists_multiple_names

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


artist_id_count,artist_name_count
6699,20474


## And we also have multiple IDs per name

In [19]:
%%sql 
select distinct 
a.artist_name,
a.artist_id,
b.artist_id
from staging_songs a, staging_songs b
where 
    a.artist_id <> b.artist_id
    and a.artist_name = b.artist_name
order by a.artist_name
limit 10

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
10 rows affected.


artist_name,artist_id,artist_id_1
'68 Comeback,ARY35JW1187B998233,ARPXYNA11F50C4EB6C
'68 Comeback,ARPXYNA11F50C4EB6C,ARY35JW1187B998233
(hed) p.e.,ARAWNNA1241B9C726F,ARDBN3H1187FB4F49F
(hed) p.e.,ARDBN3H1187FB4F49F,ARAWNNA1241B9C726F
-123min.,ARYFWZQ11F4C845C52,AR1NRFQ1187B994284
-123min.,AR1NRFQ1187B994284,ARYFWZQ11F4C845C52
-M-,ARU5K6O1187FB43B06,AR828WL1187FB47E81
-M-,AR828WL1187FB47E81,ARU5K6O1187FB43B06
1000names,ARGJVKV1187B9ACBF1,ARLHLXZ11E2835E51D
1000names,ARLHLXZ11E2835E51D,ARGJVKV1187B9ACBF1


## How many Multiple IDs per Name do we have

In [52]:
%%sql 
with artists_multiple_ids as (
    with artists as (
        select distinct 
        artist_id,
        artist_name
        from staging_songs 
    )
    select 
        artist_name,
        count(1) as artist_id_count
    from artists
    group by artist_name
    having count(1) > 1
)
select 
/*
count(1) as artist_name_count,
sum(artist_id_count) as artist_id_count*/
artist_name, artist_id_count
from artists_multiple_ids
order by artist_id_count desc 
limit 15

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
15 rows affected.


artist_name,artist_id_count
Bill & Gloria Gaither,7
Original Broadway Cast,6
Karaoke,6
Ja-Man All Stars,5
Marc Et Claude,5
King Tubby,5
Charttraxx Karaoke,5
Magnatune Compilation,5
La Cabra Mecanica,5
Grandmaster Flash & The Furious Five,4


## Let's see how many artists there are that have a single name / id reference

In [26]:
%%sql 
with artists_id_single_name as (
    with artists as (
        select distinct 
        artist_id,
        artist_name
        from staging_songs 
    )
    select 
        artist_id        
    from artists
    group by artist_id
    having count(1) = 1
)
select 
count (1)
from artists_id_single_name


 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
23843


In [33]:
%%sql 
with artists_name_single_id as (
    with artists as (
        select distinct 
        artist_id,
        artist_name
        from staging_songs 
    )
    select 
        artist_name,
        count(1) as artist_id_count
    from artists
    group by artist_name
    having count(1) = 1
)
select 
count(1)
from artists_name_single_id


 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
40028


## ID vs Name
Even when artists_id looks like an obvious unique identifier
In the end, streaming events reference songs and artists by song_titles and artist_names respectively


## Multiple Name per Id where no Multiple Id per Name  

In [53]:
%%sql 
with artists as (
        select distinct 
        artist_id,
        artist_name
        from staging_songs 
    ),
artists_multiple_names as (    
    select 
        artist_id,
        count(1) as artist_name_count
    from artists
    group by artist_id
    having count(1) > 1
),
artists_multiple_ids as (
    select 
        artist_name,
        count(1) as artist_id_count
    from artists
    group by artist_name
    having count(1) > 1
)
select count(1)
from artists a
where not exists (select 1 from artists_multiple_names b where b.artist_id = a.artist_id) 
and not exists (select 1 from artists_multiple_ids c where c.artist_name = a.artist_name) 

limit 10

 * postgresql://dwhuser:***@dwhcluster.cmfoxim90hks.us-west-2.redshift.amazonaws.com:5439/dwh
1 rows affected.


count
22265


## Start Undertanding where the duplicates are 
Duplicates exists because the table staging_songs has a song granularity.
The problem is that the artist related columns:
- artist id
- artist name
- artist latitude
- artist longitude 
- artist location

May vary between 