# Pitchfork album reviews and song lyrics analysis
# Formatting the data
---

In [1]:
import pandas as pd
import pickle
import json
import os

## Load the Pitchfork reviews

In [2]:
# Load the pickled Pitchfork album reviews. The loop further down iterates the
# result as: artist name -> {album name -> review serialized as a JSON string}.
# NOTE(review): pickle.load can execute arbitrary code; only load this file if
# it comes from a trusted source.
# NOTE(review): when the file is missing or empty, artist_reviews is left
# undefined and the conversion cell below will fail with NameError.
filename_album_reviews = "./data/album_reviews.pickle"
if os.path.exists(filename_album_reviews) and os.path.getsize(filename_album_reviews) > 0:
    # The context manager closes the file handle once loading finishes
    # (the bare open() call used previously never closed it).
    with open(filename_album_reviews, 'rb') as pickle_file:
        artist_reviews = pickle.load(pickle_file)

## Convert the reviews from JSON string to Pitchfork Review format

In [9]:
from pitchfork import pitchfork as pf
from bs4 import BeautifulSoup

In [10]:
def convert_json_to_review(review_as_string):
    """Rebuild a ``pf.Review`` from its JSON-string serialization.

    The string is decoded with ``json.loads``. Six stored fields are passed
    to ``pf.Review`` positionally, followed by the stored ``soup`` markup
    re-parsed with BeautifulSoup's ``lxml`` parser.

    Raises ``KeyError`` if one of the expected keys is absent.
    """
    fields = json.loads(review_as_string)
    positional_keys = ('searched_artist', 'searched_album',
                       'matched_artist', 'matched_album',
                       'query', 'url')
    review_args = [fields[key] for key in positional_keys]
    # The soup is the last constructor argument.
    review_args.append(BeautifulSoup(fields['soup'], "lxml"))
    return pf.Review(*review_args)

In [None]:
# Rebuild Review objects for every artist: artist name -> list of Reviews
all_reviews = {}
for name, albums in artist_reviews.items():
    print(name)  # progress output, one line per artist
    # Only the serialized reviews (the dict values) are needed here;
    # the album-name keys are not used.
    all_reviews[name] = [convert_json_to_review(serialized) for serialized in albums.values()]

# List of all artist names with reviews
artist_names_with_reviews = list(all_reviews)
print('\nDone.')

In [15]:
# Display the artist names that are keys of all_reviews
all_reviews.keys()

dict_keys(['Kanye West', 'The Beatles', '2Pac', 'Eminem', 'The Notorious B.I.G.', 'Nas', 'Ice Cube', 'JAY-Z', 'Snoop Dogg', 'Dr. Dre', 'Kendrick Lamar', 'A$AP Rocky', '50 Cent', 'Busta Rhymes', 'J. Cole', 'Method Man', 'KRS-One', 'Ludacris', 'Lil Wayne', 'Common', 'Ghostface Killah', 'T.I.', 'Lauryn Hill', 'Scarface', 'Raekwon', 'The Game', "Ol' Dirty Bastard", 'Mobb Deep', 'Logic', 'Chance The Rapper', 'Cypress Hill', 'Lupe Fiasco', 'RZA', 'GZA', 'Warren G', 'Q-Tip', 'Talib Kweli', 'Missy Elliott', 'Immortal Technique', 'Big Sean', 'Kid Cudi', 'Twista', 'Big Boi', 'Drake', 'Wiz Khalifa', 'Eric B. & Rakim', 'ScHoolboy Q', 'Run-D.M.C.', 'Hopsin', 'Nelly', 'Tyler, The Creator', 'Jadakiss', 'Kurupt', 'Grandmaster Flash & The Furious Five', 'N.W.A', 'Gang Starr', 'E-40', 'Pusha T', 'De La Soul', 'Yelawolf', 'Earl Sweatshirt', 'Fabolous', 'Mac Miller', 'Jeezy', 'Kodak Black', '2 Chainz', 'Chief Keef', 'Gucci Mane', 'Salt-N-Pepa', "Lil' Kim", 'Nicki Minaj', 'M.I.A.', 'Azealia Banks', 'Angel 

## Convert Pitchfork Reviews to JSON format

In [200]:
import re

In [None]:
# Flatten every Review object into a plain dict so the whole collection can be
# dumped as JSON. Each record maps a public Review attribute name to its value
# (or to None when the value could not be extracted).
reviews_json = []
re_year = re.compile(r"\d{4}")  # first run of four digits in the text returned by year()
# Public attribute names that are deliberately not exported.
excluded_names = ('soup', 'to_json', 'full_text', 'methods', 'query')
for artist, album_reviews in all_reviews.items():
    print(40 * '-')
    print(artist)
    for review in album_reviews:
        review_json_obj = {}
        # Introspect the review being processed. This used to be dir(r), but
        # `r` is not defined anywhere in the notebook, so it only worked when
        # a stale variable happened to exist in the kernel.
        review_method_names = [
            m for m in dir(review)
            if not m.startswith('_')
            and m not in excluded_names
            and 'matched_' not in m
            and 'searched_' not in m
        ]
        for method in review_method_names:
            try:
                if method == 'year':
                    # Keep only the four-digit match; a failed search raises
                    # AttributeError on .group() and is handled below.
                    val = re_year.search(getattr(review, method)()).group()
                elif method == 'url':
                    # url is read as a plain attribute, not called.
                    val = getattr(review, method)
                elif method == 'cover':
                    # Read the cover image URL straight from the page markup.
                    val = review.soup.find(class_='single-album-tombstone__art').find('img').attrs['src']
                else:
                    val = getattr(review, method)()

                if isinstance(val, str):
                    val = val.strip()
                review_json_obj[method] = val
            except Exception:
                # Best effort: a field that cannot be extracted is stored as
                # None. `except Exception` (rather than a bare except) lets
                # KeyboardInterrupt / SystemExit still stop the loop.
                review_json_obj[method] = None
        reviews_json.append(review_json_obj)

In [206]:
# Write the list of review records to disk as one JSON document
reviews_json_path = './data/pitchfork_reviews.json'
with open(reviews_json_path, 'w') as json_file:
    json.dump(reviews_json, json_file)

## Only keep songs for artists and albums that have reviews

In [213]:
import numpy as np

In [209]:
# Load the Pitchfork reviews JSON file into a DataFrame.
# Plain assignment rebinds `reviews`, so no prior `del` is needed; the former
# `del(reviews)` raised NameError whenever the name was not already defined
# (e.g. after a kernel restart).
reviews = pd.read_json('./data/pitchfork_reviews.json')

In [207]:
# Load the song lyrics into a DataFrame; the filtering cells below read its
# `artist` and `album` columns
songs = pd.read_json('./data/lyrics/all_songs.json')

In [210]:
# Lower-cased artist and album names taken from the reviews
# (duplicates are removed before lower-casing)
artists_with_reviews = [str.lower(artist_name) for artist_name in set(reviews.artist)]
albums_with_reviews = [str.lower(album_name) for album_name in set(reviews.album)]

In [214]:
# Boolean mask with the same shape as `songs`: True marks a row to blank out,
# False marks a row to keep.
# Start from all-True so every row is removed unless proven to have a review.
# The previous np.ndarray(...) call returned uninitialized memory, and the
# branch for a missing artist/album never assigned anything, so those rows
# ended up with arbitrary mask values.
mask_artists = np.ones(songs.shape, dtype=bool)

# Sets give constant-time membership tests; the source lists are left untouched.
reviewed_artists = set(artists_with_reviews)
reviewed_albums = set(albums_with_reviews)

# NOTE(review): artist and album are matched independently, so a song is kept
# when its album title matches ANY reviewed album, not necessarily one by the
# same artist — confirm this is intended.
for n, (song_artist, song_album) in enumerate(zip(songs['artist'], songs['album'])):
    if song_artist is None or song_album is None:
        continue  # stays True: rows without artist/album are removed
    if song_artist.lower() in reviewed_artists and song_album.lower() in reviewed_albums:
        mask_artists[n, :] = False

In [215]:
# Remove the songs without corresponding Pitchfork reviews:
# mask() replaces every cell flagged True in mask_artists with NaN, and
# dropna() then discards each row that contains a NaN.
# Chained and rebound instead of mutated with inplace=True.
# NOTE(review): dropna() also discards kept rows that have a missing value in
# any column — confirm this is intended.
songs = songs.mask(mask_artists).dropna()

In [216]:
# Save the filtered songs: pandas serializes the frame as a list of records,
# which is decoded and re-encoded by the json module when written to disk.
with open('./data/songs_with_reviews.json', 'w') as songs_file:
    song_records = json.loads(songs.to_json(orient='records'))
    json.dump(song_records, songs_file)