In [65]:
import pitchfork.pitchfork as pitchfork
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import time

## Load the lyrics data

In [7]:
# Load the lyrics (one JSON file per genre)
def _load_lyrics(path):
    """Parse the JSON file at `path` and return its contents.

    The file is opened in a `with` block so the handle is closed as soon as
    parsing finishes, instead of being left open for the life of the kernel.
    """
    with open(path) as lyrics_file:
        return json.load(lyrics_file)

artists_rap = _load_lyrics('lyrics/rap_lyrics.json')
artists_rock = _load_lyrics('lyrics/rock_lyrics.json')
artists_country = _load_lyrics('lyrics/country_lyrics.json')

# Combined collection, in rap -> rock -> country order
artists_raw = artists_rap + artists_rock + artists_country

In [10]:
# Determine the most common album-artist image URL across an artist's songs
def getArtistImage(songs):
    """Return the image URL that occurs most often among the songs' album artists.

    Args:
        songs: iterable of song dicts. Each must provide song['raw']['album'],
            which is either None or a mapping from which
            ['artist']['image_url'] is read.

    Returns:
        The most frequent non-None image URL (the first one encountered wins
        ties), or None when no song yields a URL.
    """
    url_counts = Counter()
    for song in songs:
        album = song['raw']['album']
        if album is None:
            continue
        try:
            url = album['artist']['image_url']
        except (KeyError, TypeError, IndexError):
            # Album entry lacks the expected nesting; skip this song
            continue
        # Only count real URLs: counting None would let a "no image" marker
        # tie with (and beat) a genuine URL that was seen once.
        if url is not None:
            url_counts[url] += 1

    if not url_counts:
        return None
    return url_counts.most_common(1)[0][0] # Most common image

# Use a generator to exclude artists with too few songs
def filterBySongCount(artists, min_count=10, max_count=10000):
    """Lazily yield the artists whose song count is in [min_count, max_count).

    Args:
        artists: iterable of artist dicts, each with a 'songs' sequence.
        min_count: smallest number of songs an artist may have (inclusive).
        max_count: upper bound on the number of songs (exclusive).

    Yields:
        Each qualifying artist dict, in input order.
    """
    for entry in artists:
        if min_count <= len(entry['songs']) < max_count:
            yield entry

# Keep only artists with at least min_songs and fewer than max_songs songs
min_songs, max_songs = 10, 2500
artists = list(filterBySongCount(artists_raw, min_songs, max_songs))

# Kludge: hand-picked image URLs that replace the automatically chosen one
image_overrides = {
    'Black Thought': 'https://images.genius.com/23de511f4ac50d900128ea363b7d81b3.450x320x1.jpg',
    'André 3000': 'http://images.genius.com/64b15c9489c65f5bf8f6577334347404.434x434x1.jpg',
}

# Artist images
for artist in artists:
    artist['image_url'] = getArtistImage(artist['songs'])

    manual_url = image_overrides.get(artist['artist'])
    if manual_url is not None:
        artist['image_url'] = manual_url
        
# Gender
# One label per artist, in the same order as `artists`
labels_gender = np.array([a['gender'] for a in artists])

# Elementwise comparisons give the boolean masks directly; the counts printed
# below reuse them instead of repeating the comparison.
mask_female = labels_gender == 'female'
mask_male   = labels_gender == 'male'
print("You have lyrics from {} artists, with a minimum of {} songs per artist.\nFemale: {}, male: {}"
      .format(len(artists), min_songs, mask_female.sum(), mask_male.sum()))

# Flatten every artist's songs into one list, tagging each song with the
# gender label of its artist (the song dicts are modified in place)
all_songs = []
for artist, gender in zip(artists, labels_gender):
    for song in artist['songs']:
        song['gender'] = gender
    all_songs.extend(artist['songs'])

song_count = len(all_songs)
print("Database contains {} songs.".format(song_count))

# Concatenate the lyrics of every song into one space-separated string
all_lyrics = " ".join(song['lyrics'] for song in all_songs)

You have lyrics from 303 artists, with a minimum of 10 songs per artist.
Female: 79, male: 222
Database contains 58239 songs.


## Get list of all artist names and their albums

In [58]:
# Map each artist name to the albums that appear on enough of their songs
artist_albums = {}
for artist in artists:
    # Tally songs per album, ignoring songs that have no album
    album_song_counts = Counter(
        song['album'] for song in artist['songs'] if song['album'] is not None
    )

    # Keep albums with at least 7 songs (a rough indication of a real album)
    artist_albums[artist['artist']] = [
        title for title, n_songs in album_song_counts.items() if n_songs >= 7
    ]

# Drop artists left with no qualifying albums
artist_albums = {name: albums for name, albums in artist_albums.items() if albums}

## Search Pitchfork for the album names we've collected

In [None]:
# Look up every (artist, album) pair on Pitchfork and keep what comes back.
pitchfork_reviews = {}  # (artist, album) -> object returned by pitchfork.search
pitchfork_misses = []   # (artist, album, error) for lookups that raised

for artist in artist_albums:
    for album in artist_albums[artist]:
        try:
            review = pitchfork.search(artist, album)
        except Exception as err:
            # Best-effort: a failed lookup is recorded and the loop moves on.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # through, so this long-running cell can still be stopped.
            pitchfork_misses.append((artist, album, err))
        else:
            # Store the result; otherwise it is overwritten on the next iteration
            pitchfork_reviews[(artist, album)] = review

        time.sleep(10) # Wait before pulling a new review