In [None]:
# https://github.com/evanm31/p4k-scraper/blob/master/data/scrapefork.py

In [9]:
import urllib
import re
import time
import retry
import backoff
import pandas as pd
import requests
import datetime
import os
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

In [292]:
def gather_info(album_link):
    '''
    This function parses the HTML of the page and attempts to gather attributes like artist name, album, genre,
    date, and the review text itself, instead inputting a null value if the requested element is not found on
    the page. All of the data are put into a Pandas dataframe and returned for use in the gather function.
    VARIABLES
    album_link - A string that refers to the album section of a link to a Pitchfork review.
    e.g. '/reviews/albums/neil-young-promise-of-the-real-visitor/'
    '''
    page = requests.get(album_link) #request URL
    soup = BeautifulSoup(page.content, 'html.parser') #parse with beautifulsoup

    status = True
    while status:
        if page.status_code != 200:
            print("Error: ",page.status_code)
            time.sleep(2)
            page = requests.get(album_link) #request URL
            soup = BeautifulSoup(page.content, 'html.parser') #parse with beautifulsoup
        else:
            status = False
    
    title = str(soup.find('title').string) #album and artist 
    try:
        score = float((soup.find(class_="score").string)) #score
        if not score:
            score = float((soup.find(class_="BaseWrap-sc-gjQpdd BaseText-ewhhUZ Rating-bkjebD iUEiRd bwCcXY imqiqZ").string)) #score
    except AttributeError:
        score = None
    try:
        genre = soup.find(class_="genre-list__link").string #genre
    except AttributeError:
        genre = None
    sents = [element.text for element in soup.find_all('p')] #cleaned text output
    review_text = " ".join(sents)
    try:
        date = str(soup.find(class_="pub-date").string) #date
    except AttributeError:
        date = None

    df = pd.DataFrame({'artist': [get_artist(title)]
                       ,'album': [get_album(title)]
                       ,'score': [score]
                       ,'genre': [genre]
                       ,'review': [review_text]
                       ,'best': [1 if "Best new" in review_text else 0]
                       ,'date': [date]
                       ,'link':[album_link]})
    print(score)
    return df

def get_artist(title):
    '''
    This function retreives the artist name from the scraped title string.
    VARIABLES
    title - A string of a cleaned Pitchfork album review title.
    '''
    str = ''
    for character in title: #for each character in title
        #add to string up until ':' 
        if character != ":":
            str += character
        else:
            break
    return str
        
def get_album(title):
    '''
    This function retreives the album name from the scraped title string.
    VARIABLES
    title - A string of a cleaned Pitchfork album review title.
    ''' 
    str = ''
    #find ':' and index and start there
    index = title.find(":")
    title = title[index+2:]
    #for each character afterwards, add it until '|'
    for character in title:
        if character == "|":
            break
        else:
            str +=character
    return str[:-14] #return just the title

In [242]:
pd.DataFrame({'artist': 'ab'
              ,'album': 'b'
              ,'score': 'c'
              ,'genre': 'd'
              ,'link':'e'})

ValueError: If using all scalar values, you must pass an index

In [193]:
@retry.retry(urllib.error.URLError, tries=4, delay=3, backoff=2)
def urlopen_with_
retry(url):
    return urlopen(url)

In [241]:
# Load list of album review links
links_file_list = os.listdir("/Users/kylezengo/Desktop/DS/Music Ratings/pitchfork_links")

links_list = []
for i in links_file_list:
    links = open("/Users/kylezengo/Desktop/DS/Music Ratings/pitchfork_links/"+i)
    links = links.read().splitlines()
    
    links_list += links

links_list = list(set(links_list))
links_list = [x for x in links_list if x.startswith("https")]
print(f'{len(links_list)} album review links')

23830 album review links


In [262]:
%%time
# Get the link of each album review
# Takes over 14 hours to run!

new_links = []
# for i in range(1,1987): # 1987 final page on 2021-08-18
for i in range(1,5):
    print(str(i)+":",requests.get("https://pitchfork.com/reviews/albums/?page=" + str(i)))
    req = Request("https://pitchfork.com/reviews/albums/?page=" + str(i))
    html_page = urlopen_with_retry(req)

    soup = BeautifulSoup(html_page, "lxml")
    
    for link in soup.findAll('a'):
        link_get_href = link.get('href')
        if link_get_href == '/reviews/albums/':
            pass
        elif link_get_href.startswith( '/reviews/albums/?genre=' ):
            pass
        elif link_get_href.startswith( '/reviews/albums/' ):
            new_links.append("https://pitchfork.com"+link_get_href)

new_links = list(set(new_links))

1: <Response [200]>
2: <Response [200]>
3: <Response [200]>
4: <Response [200]>
CPU times: user 704 ms, sys: 43.3 ms, total: 747 ms
Wall time: 4.12 s


In [263]:
len(new_links)

384

In [200]:
# Save list since takes time to generate
with open(f'/Users/kylezengo/Desktop/DS/Music Ratings/pitchfork_links/pitchfork_links_{datetime.datetime.now().date()}.txt', 'w') as f:
    for item in new_links:
        f.write("%s\n" % item)

In [266]:
%%time
dat = []
for idx, i in enumerate(new_links):
    print(f'{idx}: {i}')
    dat.append(gather_info(i))

reviews = pd.concat(dat)

0: https://pitchfork.com/reviews/albums/james-devane-searching/
1: https://pitchfork.com/reviews/albums/mustafa-dunya/
2: https://pitchfork.com/reviews/albums/taylor-swift-the-tortured-poets-department-the-anthology/
3: https://pitchfork.com/reviews/albums/garbage-version-2/
4: https://pitchfork.com/reviews/albums/tucker-zimmerman-dance-of-love/
5: https://pitchfork.com/reviews/albums/magdalena-bay-imaginal-disk/
6: https://pitchfork.com/reviews/albums/eris-drew-raving-disco-breaks-vol-ii/
7: https://pitchfork.com/reviews/albums/2300-bully/
8: https://pitchfork.com/reviews/albums/kaytranada-timeless/
9: https://pitchfork.com/reviews/albums/xaviersobased-with-2/
10: https://pitchfork.com/reviews/albums/i-jordan-i-am-jordan/
11: https://pitchfork.com/reviews/albums/chief-keef-almighty-so-2/
12: https://pitchfork.com/reviews/albums/why-bonnie-wish-on-the-bone/
13: https://pitchfork.com/reviews/albums/english-teacher-this-could-be-texas/
14: https://pitchfork.com/reviews/albums/horse-jumpe

KeyboardInterrupt: 

In [293]:
# Test if break on bad page
gather_info("https://pitchfork.com/reviews/albums/james-devane-searching/")

None


Unnamed: 0,artist,album,score,genre,review,best,date,link
0,James Devane,Searching,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/james-dev...


In [287]:
reviews = pd.concat(dat)
reviews

Unnamed: 0,artist,album,score,genre,review,best,date,link
0,James Devane,Searching,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/james-dev...
0,Mustafa,Dunya,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/mustafa-d...
0,Taylor Swift,The Tortured Poets Department / The Anthology,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/taylor-sw...
0,Garbage,Version 2.0,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/garbage-v...
0,Tucker Zimmerman,Dance of Love,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/tucker-zi...
0,Magdalena Bay,Imaginal Disk,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/magdalena...
0,Eris Drew,Raving Disco Breaks Vol. II,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/eris-drew...
0,2300,Bully,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/2300-bully/
0,Kaytranada,Timeless,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/kaytranad...
0,Xaviersobased,with 2,,,Find anything you save across the site in your...,0,,https://pitchfork.com/reviews/albums/xaviersob...


In [None]:
# reviews.to_csv("/Users/kylezengo/Desktop/reviews.csv",index=False)