In [1]:
#Dependencies
import pandas as pd
# SQL Alchemy
from sqlalchemy import create_engine

In [2]:
# Create Engine for Pitchfork
pitchfork_database_path = "./Resources/pitchfork.sqlite"
engine = create_engine(f"sqlite:///{pitchfork_database_path}")
conn = engine.connect()

In [3]:
engine.table_names()



['artists', 'content', 'genres', 'labels', 'reviews', 'top10', 'years']

In [4]:
pitchfork_data = pd.read_sql("SELECT * FROM reviews", conn)
pitchfork_data.head()

Unnamed: 0,reviewid,title,artist,url,score,best_new_music,author,author_type,pub_date,pub_weekday,pub_day,pub_month,pub_year
0,22703,mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,6,8,1,2017
1,22721,prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,5,7,1,2017
2,22659,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,5,7,1,2017
3,22661,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,4,6,1,2017
4,22725,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,4,6,1,2017


In [5]:
#Store Spotify CSV into Dataframe
spotify_csv = "Resources/spotify_top10s.csv"
spotify_data_df = pd.read_csv(spotify_csv, encoding = "ISO-8859-1")
spotify_data_df.head()

#Create new data with select columns:
new_spotify_data_df = spotify_data_df[['title','artist','year']]
new_spotify_data_df

Unnamed: 0,title,artist,year
0,"Hey, Soul Sister",Train,2010
1,Love The Way You Lie,Eminem,2010
2,TiK ToK,Kesha,2010
3,Bad Romance,Lady Gaga,2010
4,Just the Way You Are,Bruno Mars,2010
...,...,...,...
598,Find U Again (feat. Camila Cabello),Mark Ronson,2019
599,Cross Me (feat. Chance the Rapper & PnB Rock),Ed Sheeran,2019
600,"No Brainer (feat. Justin Bieber, Chance the Ra...",DJ Khaled,2019
601,Nothing Breaks Like a Heart (feat. Miley Cyrus),Mark Ronson,2019


In [None]:
#spotify_data_df.head()

In [6]:
#Use pandas to load csv converted DataFrame into database
new_spotify_data_df.to_sql(name='top10', con=engine,if_exists='append',index=False)

In [7]:
#Confirm data has been added by querying the table
pd.read_sql_query('select * from top10', con=engine).head()

Unnamed: 0,title,artist,year
0,"Hey, Soul Sister",Train,2010
1,Love The Way You Lie,Eminem,2010
2,TiK ToK,Kesha,2010
3,Bad Romance,Lady Gaga,2010
4,Just the Way You Are,Bruno Mars,2010


In [8]:
pitchfork_data = pd.read_sql("SELECT * FROM years", conn)

#columns in table
print(list(pitchfork_data.columns))

['reviewid', 'year']


In [9]:
#Investigating merge between both dataframes
initial_reviews = pd.read_sql("SELECT * FROM reviews", conn)
initial_reviews
reviews = initial_reviews[['reviewid','artist','score','author','pub_year']]
reviews

Unnamed: 0,reviewid,artist,score,author,pub_year
0,22703,massive attack,9.3,nate patrin,2017
1,22721,krallice,7.9,zoe camp,2017
2,22659,uranium club,7.3,david glickman,2017
3,22661,"kleenex, liliput",9.0,jenn pelly,2017
4,22725,taso,8.1,kevin lozano,2017
...,...,...,...,...,...
18388,1535,coldcut,8.9,james p. wisdom,1999
18389,1341,cassius,4.8,james p. wisdom,1999
18390,5376,mojave 3,6.3,jason josephes,1999
18391,2413,don caballero,7.2,james p. wisdom,1999


In [10]:
content = pd.read_sql("SELECT * FROM content", conn)
content

Unnamed: 0,reviewid,content
0,22703,"“Trip-hop” eventually became a ’90s punchline,..."
1,22721,"Eight years, five albums, and two EPs in, the ..."
2,22659,Minneapolis’ Uranium Club seem to revel in bei...
3,22661,Kleenex began with a crash. It transpired one ...
4,22725,It is impossible to consider a given release b...
...,...,...
18388,1535,The marketing guys of yer average modern megac...
18389,1341,"Well, it's been two weeks now, and I guess it'..."
18390,5376,"Out of Tune is a Steve Martin album. Yes, I'l..."
18391,2413,"Well, kids, I just went back and re-read my re..."


In [11]:
content['reviewid'].nunique()

18389

In [12]:
#merge dataframes on reviewid column
content_reviews = reviews.merge(content, how='inner',on='reviewid')
content_reviews

Unnamed: 0,reviewid,artist,score,author,pub_year,content
0,22703,massive attack,9.3,nate patrin,2017,"“Trip-hop” eventually became a ’90s punchline,..."
1,22721,krallice,7.9,zoe camp,2017,"Eight years, five albums, and two EPs in, the ..."
2,22659,uranium club,7.3,david glickman,2017,Minneapolis’ Uranium Club seem to revel in bei...
3,22661,"kleenex, liliput",9.0,jenn pelly,2017,Kleenex began with a crash. It transpired one ...
4,22725,taso,8.1,kevin lozano,2017,It is impossible to consider a given release b...
...,...,...,...,...,...,...
18396,1535,coldcut,8.9,james p. wisdom,1999,The marketing guys of yer average modern megac...
18397,1341,cassius,4.8,james p. wisdom,1999,"Well, it's been two weeks now, and I guess it'..."
18398,5376,mojave 3,6.3,jason josephes,1999,"Out of Tune is a Steve Martin album. Yes, I'l..."
18399,2413,don caballero,7.2,james p. wisdom,1999,"Well, kids, I just went back and re-read my re..."


In [13]:
#Checking for duplicates in the Pitchfork reviews
duplicate_reviews = content_reviews[content_reviews.duplicated()]
duplicate_reviews

Unnamed: 0,reviewid,artist,score,author,pub_year,content
12117,9417,easy star all-stars,7.0,joe tangari,2006,\r\n A song-for-song reggae cover of Radioh...
12118,9417,easy star all-stars,7.0,joe tangari,2006,\r\n A song-for-song reggae cover of Radioh...
12119,9417,easy star all-stars,7.0,joe tangari,2006,\r\n A song-for-song reggae cover of Radioh...
12121,9505,various artists,8.2,tim finney,2006,\nOn the one hand it is a largely superfluous ...
12122,9505,various artists,8.2,tim finney,2006,\nOn the one hand it is a largely superfluous ...
12123,9505,various artists,8.2,tim finney,2006,\nOn the one hand it is a largely superfluous ...
12125,9499,the blood brothers,6.2,jason crock,2006,"When we last left our heroes, the Blood Brothe..."
12126,9499,the blood brothers,6.2,jason crock,2006,"When we last left our heroes, the Blood Brothe..."
12127,9499,the blood brothers,6.2,jason crock,2006,"When we last left our heroes, the Blood Brothe..."
12129,9460,xasthur,7.8,brandon stosuy,2006,Strange things are a foot in the bowels of hel...


In [14]:
#Drop columns with identical values in all columns
content_reviews = content_reviews.drop_duplicates(subset = ["reviewid", "artist", "score", "author", "content"])

In [15]:
#checking that all the duplicate reviews were dropped - if so, then this command should not return us any rows
duplicate_reviews_2 = content_reviews[content_reviews.duplicated()]
duplicate_reviews_2

Unnamed: 0,reviewid,artist,score,author,pub_year,content


In [16]:
top10 = pd.read_sql("SELECT * FROM top10", conn)
top10

Unnamed: 0,title,artist,year
0,"Hey, Soul Sister",Train,2010
1,Love The Way You Lie,Eminem,2010
2,TiK ToK,Kesha,2010
3,Bad Romance,Lady Gaga,2010
4,Just the Way You Are,Bruno Mars,2010
...,...,...,...
3613,Find U Again (feat. Camila Cabello),Mark Ronson,2019
3614,Cross Me (feat. Chance the Rapper & PnB Rock),Ed Sheeran,2019
3615,"No Brainer (feat. Justin Bieber, Chance the Ra...",DJ Khaled,2019
3616,Nothing Breaks Like a Heart (feat. Miley Cyrus),Mark Ronson,2019


In [17]:
#checking to see if there are any duplicate Spotify chart entries
top10_dupes = top10[top10.duplicated()]
top10_dupes

Unnamed: 0,title,artist,year
322,Sugar,Maroon 5,2015
603,"Hey, Soul Sister",Train,2010
604,Love The Way You Lie,Eminem,2010
605,TiK ToK,Kesha,2010
606,Bad Romance,Lady Gaga,2010
...,...,...,...
3613,Find U Again (feat. Camila Cabello),Mark Ronson,2019
3614,Cross Me (feat. Chance the Rapper & PnB Rock),Ed Sheeran,2019
3615,"No Brainer (feat. Justin Bieber, Chance the Ra...",DJ Khaled,2019
3616,Nothing Breaks Like a Heart (feat. Miley Cyrus),Mark Ronson,2019


In [18]:
#it's hard to get a sense of the dupes with the command above, since they're not sorted. 
#Let's sort them by title so that we can visualize the duplicate entries
top10_dupes.sort_values(by=['title'])


Unnamed: 0,title,artist,year
802,#Beautiful,Mariah Carey,2013
1405,#Beautiful,Mariah Carey,2013
2611,#Beautiful,Mariah Carey,2013
3214,#Beautiful,Mariah Carey,2013
2008,#Beautiful,Mariah Carey,2013
...,...,...,...
3528,no tears left to cry,Ariana Grande,2018
1719,no tears left to cry,Ariana Grande,2018
1116,no tears left to cry,Ariana Grande,2018
2322,no tears left to cry,Ariana Grande,2018


In [19]:
#Drop columns with identical values in all columns, but keep the first entry
top10 = top10.drop_duplicates(subset=['title', 'artist', 'year'], keep='first')

In [20]:
#now if we sort the top10 by title, we shouldn't be seeing any dupes
top10.sort_values(by=['title'])

Unnamed: 0,title,artist,year
199,#Beautiful,Mariah Carey,2013
240,#SELFIE,The Chainsmokers,2014
173,#thatPOWER,will.i.am,2013
569,...Ready For It? - BloodPop® Remix,Taylor Swift,2018
87,1+1,Beyoncé,2011
...,...,...,...
490,Younger Now,Miley Cyrus,2017
19,Your Love Is My Drug,Kesha,2010
538,Youth (feat. Khalid),Shawn Mendes,2018
233,human,Christina Perri,2014


In [21]:
#we know that eminem is in our spotify chart data. Let's make sure that there are pitchfork reviews on him too. 
#this way, we know that we have data to join
content_reviews[content_reviews["artist"].isin(["eminem"])]

Unnamed: 0,reviewid,artist,score,author,pub_year,content
3704,18733,eminem,4.7,craig jenkins,2013,The more the triumphs of Eminem’s world beatin...
7704,14380,eminem,2.8,jayson greene,2010,Watching Eminem attempt to re-situate himself ...
8990,13034,eminem,4.8,ian cohen,2009,"You might think calling Eminem ""divisive"" in ..."
13124,2773,eminem,6.9,sean fennessey,2005,"Frankly, I don't want to hear these songs anym..."
14384,2772,eminem,6.5,scott plagenhoef,2004,"In 2000, Eminem was frequently vilified as a h..."
16951,2771,eminem,9.1,ethan p.,2002,ryan loves it and he likes mmlp too so he's al...


In [22]:
#making the artist column in the Pitchfork Reviews a Title format, so that we can join it to the Spotify data
content_reviews['artist'] = content_reviews['artist'].str.title()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
#confirming that the artist names are now in title format
content_reviews

Unnamed: 0,reviewid,artist,score,author,pub_year,content
0,22703,Massive Attack,9.3,nate patrin,2017,"“Trip-hop” eventually became a ’90s punchline,..."
1,22721,Krallice,7.9,zoe camp,2017,"Eight years, five albums, and two EPs in, the ..."
2,22659,Uranium Club,7.3,david glickman,2017,Minneapolis’ Uranium Club seem to revel in bei...
3,22661,"Kleenex, Liliput",9.0,jenn pelly,2017,Kleenex began with a crash. It transpired one ...
4,22725,Taso,8.1,kevin lozano,2017,It is impossible to consider a given release b...
...,...,...,...,...,...,...
18396,1535,Coldcut,8.9,james p. wisdom,1999,The marketing guys of yer average modern megac...
18397,1341,Cassius,4.8,james p. wisdom,1999,"Well, it's been two weeks now, and I guess it'..."
18398,5376,Mojave 3,6.3,jason josephes,1999,"Out of Tune is a Steve Martin album. Yes, I'l..."
18399,2413,Don Caballero,7.2,james p. wisdom,1999,"Well, kids, I just went back and re-read my re..."


In [25]:
#Merge top10 with content_reviews
top10_content_reviews = content_reviews.merge(top10, how='inner',on='artist')
top10_content_reviews

Unnamed: 0,reviewid,artist,score,author,pub_year,content,title,year
0,22645,John Legend,7.0,marcus j. moore,2016,John Legend doesn’t waste time getting to the ...,All of Me,2014
1,22645,John Legend,7.0,marcus j. moore,2016,John Legend doesn’t waste time getting to the ...,Start,2016
2,22634,Bruno Mars,6.2,katherine st. asaph,2016,It’s good to remember the improbable things in...,Just the Way You Are,2010
3,22634,Bruno Mars,6.2,katherine st. asaph,2016,It’s good to remember the improbable things in...,Marry You,2010
4,22634,Bruno Mars,6.2,katherine st. asaph,2016,It’s good to remember the improbable things in...,Just the Way You Are,2011
...,...,...,...,...,...,...,...,...
609,2756,Missy Elliott,7.2,sam chennault,2002,"Aside from Eminem, Missy ""Misdemeanor"" Elliott...",WTF (Where They From),2016
610,2756,Missy Elliott,7.2,sam chennault,2002,"Aside from Eminem, Missy ""Misdemeanor"" Elliott...",Pep Rally,2016
611,2757,Missy Elliott,8.2,dan kilian,2001,A lot of albums kick off with the killer track...,WTF (Where They From),2016
612,2757,Missy Elliott,8.2,dan kilian,2001,A lot of albums kick off with the killer track...,Pep Rally,2016


In [26]:
#Checking for duplicates - there should be none
duplicate_entries_3 = top10_content_reviews[top10_content_reviews.duplicated()]
duplicate_entries_3

Unnamed: 0,reviewid,artist,score,author,pub_year,content,title,year


In [27]:
top10_content_reviews[top10_content_reviews["artist"].isin(["Eminem"])]

Unnamed: 0,reviewid,artist,score,author,pub_year,content,title,year
532,18733,Eminem,4.7,craig jenkins,2013,The more the triumphs of Eminem’s world beatin...,Love The Way You Lie,2010
533,18733,Eminem,4.7,craig jenkins,2013,The more the triumphs of Eminem’s world beatin...,Walk On Water (feat. Beyoncé),2018
534,14380,Eminem,2.8,jayson greene,2010,Watching Eminem attempt to re-situate himself ...,Love The Way You Lie,2010
535,14380,Eminem,2.8,jayson greene,2010,Watching Eminem attempt to re-situate himself ...,Walk On Water (feat. Beyoncé),2018
536,13034,Eminem,4.8,ian cohen,2009,"You might think calling Eminem ""divisive"" in ...",Love The Way You Lie,2010
537,13034,Eminem,4.8,ian cohen,2009,"You might think calling Eminem ""divisive"" in ...",Walk On Water (feat. Beyoncé),2018
538,2773,Eminem,6.9,sean fennessey,2005,"Frankly, I don't want to hear these songs anym...",Love The Way You Lie,2010
539,2773,Eminem,6.9,sean fennessey,2005,"Frankly, I don't want to hear these songs anym...",Walk On Water (feat. Beyoncé),2018
540,2772,Eminem,6.5,scott plagenhoef,2004,"In 2000, Eminem was frequently vilified as a h...",Love The Way You Lie,2010
541,2772,Eminem,6.5,scott plagenhoef,2004,"In 2000, Eminem was frequently vilified as a h...",Walk On Water (feat. Beyoncé),2018
