# Comparing Pitchfork reviews and artist lyrics

In [179]:
import pitchfork.pitchfork as pitchfork
from pitchfork.pitchfork.pitchfork import Review # For restoring saved reviews
from bs4 import BeautifulSoup
import json
import pickle
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import time

## Load the lyrics data

In [7]:
# Load the lyrics
artists_rap = json.load(open('lyrics/rap_lyrics.json'))
artists_rock = json.load(open('lyrics/rock_lyrics.json'))
artists_country = json.load(open('lyrics/country_lyrics.json'))
artists_raw = artists_rap + artists_rock + artists_country

In [10]:
# Determine
def getArtistImage(songs):    
    urls = [None]
    for song in songs:
        if song['raw']['album'] is not None:
            try:                
                urls.append(song['raw']['album']['artist']['image_url'])                
            except:
                pass
    
    return Counter(urls).most_common()[0][0] # Most common image

# Use a generator to exclude artists with too few songs
def filterBySongCount(artists, min_count=10, max_count=10000):
    for artist in artists:        
        num_songs = len(artist['songs'])
        if (num_songs >= min_count) and (num_songs < max_count): yield artist

min_songs, max_songs = 10, 2500
artists = list(filterBySongCount(artists_raw, min_songs, max_songs))

# Artist images
for artist in artists:
    artist['image_url'] = getArtistImage(artist['songs'])
    
    # Kludge
    if artist['artist'] == 'Black Thought':
        artist['image_url'] = 'https://images.genius.com/23de511f4ac50d900128ea363b7d81b3.450x320x1.jpg'
    if artist['artist'] == 'André 3000':
        artist['image_url'] = 'http://images.genius.com/64b15c9489c65f5bf8f6577334347404.434x434x1.jpg'            
        
# Gender
labels_gender = np.array([a['gender'] for a in artists])
mask_female = np.array([g=='female' for g in labels_gender])
mask_male   = np.array([g=='male'   for g in labels_gender])
print("You have lyrics from {} artists, with a minimum of {} songs per artist.\nFemale: {}, male: {}"
      .format(len(artists), min_songs, (labels_gender=='female').sum(), (labels_gender=='male').sum()))

# Add all song lyrics to a single list
all_songs = []
for artist, gender in zip(artists, labels_gender):
    for song in artist['songs']:        
        song['gender'] = gender
        all_songs.append(song)
              
song_count = len(all_songs)
print("Database contains {} songs.".format(song_count))

# Store all lyrics in a single string
all_lyrics = " ".join([song['lyrics'] for song in all_songs])

You have lyrics from 303 artists, with a minimum of 10 songs per artist.
Female: 79, male: 222
Database contains 58239 songs.


## Get list of all artist names and their albums

In [189]:
# For each artist, get a list of their albums
artist_albums = {}
for artist in artists:
    album_song_counts = {} # Keep track of how many songs show up for each album
    for song in artist['songs']:        
        album = song['album']        
        if album in album_song_counts:
            album_song_counts[album] += 1
        elif album is not None:
            album_song_counts[album]  = 1

    # Only store albums with a bare minimum of songs (a rough indication of a real album)
    artist_albums[artist['artist']] = [album[0] for album in album_song_counts.items() if album[1] >= 7]

# Filter out artists without any albums matching the criteria    
artist_albums = {artist: albums for artist, albums in artist_albums.items() if len(albums)>0}

In [190]:
artist_albums['The Beatles']

['Live At The BBC. Disk 2',
 'The Beatles Album Art',
 'Abbey Road',
 'Let It Be',
 'Help!',
 "Sgt. Pepper's Lonely Hearts Club Band",
 "A Hard Day's Night",
 'Anthology 1',
 'Live At The BBC. Disk 1',
 'With the Beatles',
 'Anthology 3',
 'Magical Mystery Tour',
 'Please Please Me',
 "The Beatles' Christmas Album",
 'Beatles for Sale',
 'The Beatles (The White Album)',
 'The Savage Young Beatles',
 'Hey Jude',
 'Rubber Soul',
 'Revolver',
 "Sgt. Pepper's Lonely Hearts Club Band 50th Anniversary Bonus Disc"]

## Search Pitchfork for the album names we've collected

In [191]:
artist_albums = {'Kanye West': ['my beautiful', 'Graduation', 'Late Registration'], 'The Beatles': ['Abbey Road','Let It Be',"Sgt. Pepper's Lonely Hearts Club Band"]}

In [194]:
# Check if the pickle file already exists
filename_album_reviews = "album_reviews.pickle"
if os.path.exists(filename_album_reviews) and os.path.getsize(filename_album_reviews) > 0:
    artist_reviews = pickle.load(open(filename_album_reviews, 'rb'))
else:
    artist_reviews = {}
    
# Search for reviews for each artist's albums    
start = time.time()
for artist in artist_albums:
    print("\n{row}\nSearching for reviews by {artist}...".format(row="-"*50, artist=artist))
    try:
        album_reviews = artist_reviews[artist] if artist in artist_reviews else {}
        for album in artist_albums[artist]:
            if album not in album_reviews:
                try:
                    review = pitchfork.search(artist, album)
                    album_reviews[album] = review.to_json()
                    print('Found a review for "{album}" / ({rev_name}).'.format(album=album, rev_name=review.album()))
                except:
                    print("Could not find a review for {album}".format(album=album))
                time.sleep(3) # Wait before pulling a new review
            else:
                print("Already have the review for {}".format(album))

        # If reviews were found, add them to the dictionary
        if len(album_reviews) > 0:
            artist_reviews[artist] = album_reviews

        # Save all reviews for current artist into the pickle
        pickle.dump(artist_reviews, open(filename_album_reviews, 'wb'))
    except Exception as e:
        print("Error:\n{e}".format(e=e))
        time.sleep(5*60) # Wait five minutes
    
end = time.time()
print("\n\n{row}\nDone scraping Pitchfork reviews!\nTime elapsed: {t} hours".format(row="*"*60, t=(start-end)/60/60))


--------------------------------------------------
Searching for reviews by Kanye West...
Already have the review for my beautiful
Already have the review for Graduation
Already have the review for Late Registration

--------------------------------------------------
Searching for reviews by The Beatles...
Already have the review for Abbey Road
Already have the review for Let It Be
Already have the review for Sgt. Pepper's Lonely Hearts Club Band


**************************************************
Done scraping Pitchfork reviews!
Time elapsed: -1.970065964592828e-06 hours


## Code for loading the Pitchfork reviews from the pickle

In [174]:
def convert_json_to_review(json_obj):
    return Review(json_obj['searched_artist'], json_obj['searched_album'],
                  json_obj['matched_artist'], json_obj['matched_album'],
                  json_obj['query'], json_obj['url'], BeautifulSoup(json_obj['soup'], "lxml"))

In [175]:
R = convert_json_to_review(j)