# Pitchfork album reviews and song lyrics analysis
# Formatting the data
---

In [3]:
import pandas as pd
import numpy as np
import pickle
import json
import os

## Load the Pitchfork reviews

In [2]:
# Pitchfork album reviews
# `artist_reviews` is consumed by later cells as a nested mapping:
# {artist name: {album: review serialized as a JSON string}}.
filename_album_reviews = "./data/album_reviews.pickle"
if not (os.path.exists(filename_album_reviews) and os.path.getsize(filename_album_reviews) > 0):
    # Fail here with a clear message; otherwise `artist_reviews` would stay
    # undefined and a later cell would stop with an unrelated NameError.
    raise FileNotFoundError(
        "Missing or empty album reviews file: " + filename_album_reviews)

# NOTE(security): pickle.load can execute arbitrary code while loading;
# only load pickle files that you created yourself.
# The context manager guarantees the file handle is closed after loading.
with open(filename_album_reviews, 'rb') as pickle_file:
    artist_reviews = pickle.load(pickle_file)

## Convert the reviews from JSON string to Pitchfork Review format

In [9]:
from pitchfork import pitchfork as pf
from bs4 import BeautifulSoup

In [10]:
def convert_json_to_review(review_as_string):
    """Rebuild a `pf.Review` object from its JSON string form.

    Parameters
    ----------
    review_as_string : str
        JSON document holding the keys 'searched_artist', 'searched_album',
        'matched_artist', 'matched_album', 'query', 'url' and 'soup'.

    Returns
    -------
    pf.Review
        Review built from the stored fields; the 'soup' value is parsed
        with BeautifulSoup using the "lxml" parser before being passed on.

    Raises
    ------
    KeyError
        If one of the expected keys is absent from the JSON document.
    """
    payload = json.loads(review_as_string)

    # Plain fields, listed in the positional order handed to pf.Review.
    plain_keys = ('searched_artist', 'searched_album',
                  'matched_artist', 'matched_album',
                  'query', 'url')
    plain_values = [payload[key] for key in plain_keys]

    # The page markup is stored as text and has to be parsed again.
    parsed_page = BeautifulSoup(payload['soup'], "lxml")
    return pf.Review(*plain_values, parsed_page)

In [None]:
# Convert to Review format: {artist name: [Review, Review, ...]}
all_reviews = {}
for name, albums in artist_reviews.items():
    print(name)  # progress output, one line per artist
    # Only the stored review strings are needed; the album keys are ignored.
    album_reviews = [convert_json_to_review(stored_review)
                     for stored_review in albums.values()]
    all_reviews[name] = album_reviews

# List of all artist names with reviews
artist_names_with_reviews = list(all_reviews)
print('\nDone.')

In [15]:
# Display the artist names for which reviews were converted
all_reviews.keys()

dict_keys(['Kanye West', 'The Beatles', '2Pac', 'Eminem', 'The Notorious B.I.G.', 'Nas', 'Ice Cube', 'JAY-Z', 'Snoop Dogg', 'Dr. Dre', 'Kendrick Lamar', 'A$AP Rocky', '50 Cent', 'Busta Rhymes', 'J. Cole', 'Method Man', 'KRS-One', 'Ludacris', 'Lil Wayne', 'Common', 'Ghostface Killah', 'T.I.', 'Lauryn Hill', 'Scarface', 'Raekwon', 'The Game', "Ol' Dirty Bastard", 'Mobb Deep', 'Logic', 'Chance The Rapper', 'Cypress Hill', 'Lupe Fiasco', 'RZA', 'GZA', 'Warren G', 'Q-Tip', 'Talib Kweli', 'Missy Elliott', 'Immortal Technique', 'Big Sean', 'Kid Cudi', 'Twista', 'Big Boi', 'Drake', 'Wiz Khalifa', 'Eric B. & Rakim', 'ScHoolboy Q', 'Run-D.M.C.', 'Hopsin', 'Nelly', 'Tyler, The Creator', 'Jadakiss', 'Kurupt', 'Grandmaster Flash & The Furious Five', 'N.W.A', 'Gang Starr', 'E-40', 'Pusha T', 'De La Soul', 'Yelawolf', 'Earl Sweatshirt', 'Fabolous', 'Mac Miller', 'Jeezy', 'Kodak Black', '2 Chainz', 'Chief Keef', 'Gucci Mane', 'Salt-N-Pepa', "Lil' Kim", 'Nicki Minaj', 'M.I.A.', 'Azealia Banks', 'Angel 

## Convert Pitchfork Reviews to JSON format

In [200]:
import re

In [None]:
# Flatten every Review object into a plain dict (one dict per review) so the
# whole collection can be written out as JSON.
reviews_json = []
re_year = re.compile(r"\d{4}") # Extracting publication date

# Public names on a review that are not exported as fields.
excluded_names = {'soup', 'to_json', 'full_text', 'methods', 'query'}

for artist, album_reviews in all_reviews.items():
    print(40 * '-')
    print(artist)
    for review in album_reviews:
        review_json_obj = {}
        # Inspect the review being converted, so the cell does not rely on
        # any name defined outside of it. Private names, the excluded names
        # and the 'matched_*' / 'searched_*' attributes are skipped.
        review_method_names = [
            m for m in dir(review)
            if not m.startswith('_')
            and m not in excluded_names
            and 'matched_' not in m
            and 'searched_' not in m
        ]
        for method in review_method_names:
            try:
                if method == 'year':
                    # Keep only the first four-digit number of the year text
                    val = re_year.search(review.year()).group()
                elif method == 'url':
                    # 'url' is read as a plain attribute, not called
                    val = review.url
                elif method == 'cover':
                    # Read the cover image address straight from the page markup
                    val = review.soup.find(class_='single-album-tombstone__art').find('img').attrs['src']
                else:
                    val = getattr(review, method)()
            except Exception:
                # Best effort: a field that cannot be extracted is stored as
                # None. KeyboardInterrupt / SystemExit are no longer swallowed.
                val = None

            # Strip surrounding whitespace from text values only
            review_json_obj[method] = val.strip() if isinstance(val, str) else val
        reviews_json.append(review_json_obj)

In [206]:
# Write the list of review dicts to disk as a single JSON document
reviews_json_path = './data/pitchfork_reviews_json.json'
with open(reviews_json_path, 'w') as json_file:
    json.dump(reviews_json, json_file)

# Remove samples without both Reviews and Songs

In [4]:
import numpy as np

In [4]:
# Load the Pitchfork album reviews
# (the JSON file written by the "Save the JSON object" cell above)
reviews = pd.read_json('./data/pitchfork_reviews_json.json')

In [5]:
# Load the song lyrics, discarding the raw data column and every row
# that has no album
songs = (
    pd.read_json('./data/lyrics/all_songs.json')
    .drop(columns='raw')
    .dropna(subset=['album'])
)

In [6]:
# Row/column counts before filtering: reviews first, then songs
print(reviews.shape, songs.shape, sep='\n')

(585, 10)
(45098, 8)


In [7]:
# Determine masks for excluding songs without reviews and vice versa.
# Every mask is a boolean column vector of shape (n_rows, 1) in which True
# marks a row to exclude; the column shape lets later cells tile it across
# all columns of the corresponding DataFrame.
masks = {'songs': {}, 'reviews': {}}

# Only keep artists who have Songs and Reviews
artists_with_both = np.intersect1d(songs.artist, reviews.artist)

# Only keep albums which have Songs and Reviews
albums_with_both = np.intersect1d(songs.album, reviews.album)

for frame_name, frame in (('songs', songs), ('reviews', reviews)):
    # Vectorised membership test instead of one Python-level lookup per row
    masks[frame_name]['artists'] = np.asarray(~frame.artist.isin(artists_with_both)).reshape(-1, 1)
    masks[frame_name]['albums'] = np.asarray(~frame.album.isin(albums_with_both)).reshape(-1, 1)

# NOTE(review): the cells that drop rows only apply the 'artists' masks; the
# 'albums' masks are computed here but not used by them - confirm whether
# album-level filtering was intended.

In [8]:
# Drop rows from the Songs DataFrame:
# blank out every cell of the rows flagged by the artist mask, then remove
# all rows that contain any missing value.
songs_cell_mask = np.tile(masks['songs']['artists'], songs.shape[1])
songs = songs.mask(songs_cell_mask).dropna()

In [9]:
# Drop rows from the Reviews DataFrame:
# blank out every cell of the rows flagged by the artist mask, then remove
# all rows that contain any missing value.
reviews_cell_mask = np.tile(masks['reviews']['artists'], reviews.shape[1])
reviews = reviews.mask(reviews_cell_mask).dropna()

In [10]:
# Row/column counts after filtering: reviews first, then songs
print(reviews.shape, songs.shape, sep='\n')

(532, 10)
(16896, 8)


## Add gender and genre information to the Pitchfork reviews

In [11]:
# Create a dictionary containing extra info for each artist.
# The first song seen for an artist supplies that artist's gender and genre;
# later songs by the same artist leave the entry untouched.
artist_info = {}
for artist, gender, genre in zip(songs.artist, songs.gender, songs.genre):
    artist_info.setdefault(artist, {'gender': gender, 'genre': genre})

In [12]:
# Create new columns for the Reviews DF by looking up each review's artist
# in `artist_info` (raises KeyError for an artist without an entry)
genders = [artist_info[artist]['gender'] for artist in reviews.artist]
genres = [artist_info[artist]['genre'] for artist in reviews.artist]

# Add the new columns to the Reviews DataFrame
reviews['gender'] = genders
reviews['genre'] = genres

## Finish formatting and save the `songs` and `reviews` DataFrames

In [13]:
# Reorder the DataFrame columns (only the listed columns are kept)
review_column_order = [
    'artist', 'album', 'year', 'genre',
    'gender', 'score', 'best_new_music',
    'abstract', 'editorial', 'label', 'url', 'cover',
]
song_column_order = [
    'title', 'artist', 'album', 'year', 'genre',
    'gender', 'lyrics', 'image',
]

reviews = reviews.loc[:, review_column_order]
songs = songs.loc[:, song_column_order]

In [14]:
# Save the DataFrames as JSON files, one list of row records per file.
# Each frame is serialized by pandas, parsed back into Python objects and
# then written with the json module.
matched_outputs = (
    ('./data/pitchfork_reviews_matched.json', reviews),
    ('./data/song_lyrics_matched.json', songs),
)
for destination, frame in matched_outputs:
    with open(destination, 'w') as outfile:
        records = json.loads(frame.to_json(orient='records'))
        json.dump(records, outfile)

In [15]:
# Preview the first two rows of the matched songs
songs.head(2)

Unnamed: 0,title,artist,album,year,genre,gender,lyrics,image
0,16 on Death Row,2Pac,R U Still Down? (Remember Me),1997-11-25,rap,male,Death Row\nThat's where mothafuckas is endin' ...,https://images.genius.com/4f924e981b74c6d754d2...
4,2 of Amerikaz Most Wanted,2Pac,All Eyez on Me,1996-05-07,rap,male,Up out of there\nAin't nothin' but a gangsta p...,https://images.genius.com/e47c72fcb1f2756daafc...


In [16]:
# Preview the first two rows of the matched reviews
reviews.head(2)

Unnamed: 0,artist,album,year,genre,gender,score,best_new_music,abstract,editorial,label,url,cover
0,Kanye West,Graduation,2007.0,rap,male,8.7,1.0,Though it doesn't quite match\n ...,While rap music famously thrives on the kind o...,Roc-A-Fella\n / \n ...,/reviews/albums/10658-graduation/,https://media.pitchfork.com/photos/5929b270ea9...
1,Kanye West,Late Registration,2005.0,rap,male,9.5,1.0,The producer-rapper follows his Pazz and Jop-w...,"""Can I talk my shit again?""\n ...",Roc-A-Fella\n / \n ...,/reviews/albums/8768-late-registration/,https://media.pitchfork.com/photos/5929aa8db13...
