In [None]:
# https://github.com/evanm31/p4k-scraper/blob/master/data/scrapefork.py

In [None]:
# !python -m pip install retry
# !python -m pip install backoff

Import packages and set up environment

In [1]:
import urllib
import re
import time
import retry
import backoff
import pandas as pd
import requests
import datetime
import os
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# If working on the author's Mac, use the desktop project folder; otherwise just work from the git repo/ directory.
# os.name is "posix" on Linux as well as macOS, so also require that the folder exists: without that check,
# os.chdir raises FileNotFoundError on every other POSIX machine instead of falling back to the current directory.
MAC_PROJECT_DIR = "/Users/kylezengo/Desktop/DS/Music Ratings/"
if os.name == "posix" and os.path.isdir(MAC_PROJECT_DIR):
    os.chdir(MAC_PROJECT_DIR)

Define Pitchfork Scraping Functions

In [47]:
def _fetch_page(album_link, max_retries, retry_delay):
    '''
    Requests album_link until it answers with HTTP 200 and returns that response.
    VARIABLES
    album_link - Full URL of the page to download.
    max_retries - Number of additional requests to make after the first non-200 response.
    retry_delay - Seconds to sleep between attempts.
    RAISES
    requests.HTTPError - If every attempt returned a non-200 status.
    '''
    for attempt in range(max_retries + 1):
        page = requests.get(album_link) #request URL
        if page.status_code == 200:
            return page
        print("Error: ",page.status_code)
        if attempt < max_retries:
            time.sleep(retry_delay)
    # A permanent failure (e.g. 404) never turns into a 200, so give up instead of looping forever
    raise requests.HTTPError(
        f"{album_link} returned status {page.status_code} after {max_retries + 1} attempts",
        response=page)

def _class_text(soup, class_name):
    '''
    Returns the text of the first element whose class attribute matches class_name, or None when no such
    element exists or its `.string` is None. The value is converted to a plain str so the result does not
    keep a reference to the parsed document (a NavigableString does).
    '''
    element = soup.find(class_=class_name)
    if element is None or element.string is None:
        return None
    return str(element.string)

def _find_score(soup):
    '''
    Tries each known score class in order and returns the first value that parses as a float,
    or None if none of them is present on the page.
    '''
    score_classes = (
        "score",
        "BaseWrap-sc-gjQpdd BaseText-ewhhUZ Rating-bkjebD iUEiRd bwCcXY imqiqZ",
        # "BaseWrap-sc-gjQpdd BaseText-ewhhUZ Rating-iATjmx iUEiRd bwCcXY dBMsvl",
        "BaseWrap-sc-gjQpdd BaseText-ewhhUZ Rating-bkjebD iUEiRd bwCcXY fuVxVq",
    )
    for class_name in score_classes:
        text = _class_text(soup, class_name)
        if text is None:
            continue
        try:
            return float(text)
        except ValueError:
            continue
    return None

def gather_info(album_link, max_retries=10, retry_delay=2):
    '''
    This function downloads a Pitchfork review page, parses the HTML and attempts to gather attributes like
    artist name, album, score, genre, date, and the review text itself, inputting a null value (None) if the
    requested element is not found on the page. All of the data are put into a one-row Pandas dataframe with
    the columns artist, album, score, genre, review, best, reviewed_date and link.
    VARIABLES
    album_link - A string holding the full URL of a Pitchfork review (it is passed directly to requests.get).
    e.g. 'https://pitchfork.com/reviews/albums/neil-young-promise-of-the-real-visitor/'
    max_retries - How many times to re-request the page after a non-200 response before giving up.
    retry_delay - Seconds to wait between those requests.
    RAISES
    requests.HTTPError - If the page never returns HTTP 200 within the allowed attempts.
    '''
    page = _fetch_page(album_link, max_retries, retry_delay)
    soup = BeautifulSoup(page.content, 'html.parser') #parse only once a good response is in hand

    title_tag = soup.find('title')
    title = str(title_tag.string) if title_tag is not None else '' #album and artist
    sents = [element.text for element in soup.find_all('p')] #cleaned text output
    all_text = " ".join(sents)

    # The text after 'Reviewed: ' is split into three leading tokens (used as the date fallback)
    # and the remainder (used as the review body). Missing pieces become None instead of raising IndexError.
    review_text = None
    date_from_text = None
    after_marker = all_text.split('Reviewed: ',1)
    if len(after_marker) == 2:
        selected_text = after_marker[1].split(" ",3)
        if len(selected_text) >= 3:
            date_from_text = " ".join(selected_text[:3])
        if len(selected_text) == 4:
            review_text = selected_text[3].split(" By signing up you agree to our User Agreement (including the class action waiver and arbitration",1)[0]

    score = _find_score(soup) #score

    genre = _class_text(soup, "genre-list__link") #genre
    if genre is None:
        genre_split = all_text.split("Genre: ",1)
        if len(genre_split) == 2:
            genre = genre_split[1].split("Label: ",1)[0].strip()

    reviewed_date = _class_text(soup, "pub-date") #date
    if reviewed_date is None:
        reviewed_date = date_from_text

    # Prefer the dedicated header elements; fall back to parsing the <title> string
    artist = _class_text(soup, "BaseWrap-sc-gjQpdd BaseText-ewhhUZ SplitScreenContentHeaderArtist-ftloCc iUEiRd Byyns kRtQWW")
    if artist is None:
        artist = get_artist(title)
    album = _class_text(soup, "BaseWrap-sc-gjQpdd BaseText-ewhhUZ SplitScreenContentHeaderHed-lcUSuI iUEiRd ckzqqn fTtZlw")
    if album is None:
        album = get_album(title)

    df = pd.DataFrame({'artist': [artist]
                       ,'album': [album]
                       ,'score': [score]
                       ,'genre': [genre]
                       ,'review': [review_text]
                       ,'best': [1 if "best new" in all_text.lower() else 0]
                       ,'reviewed_date': [reviewed_date]
                       ,'link':[album_link]})
    return df

def get_artist(title):
    '''
    This function retrieves the artist name from the scraped title string: everything before the
    first ':' (or the whole string when it contains no ':').
    VARIABLES
    title - A string of a cleaned Pitchfork album review title.
    '''
    before_colon, _separator, _rest = title.partition(":")
    return before_colon
        
def get_album(title):
    '''
    This function retrieves the album name from the scraped title string.
    VARIABLES
    title - A string of a cleaned Pitchfork album review title.
    '''
    # Skip past the first ':' and the character that follows it
    start = title.find(":") + 2
    # Keep only what comes before the first '|'
    before_pipe = title[start:].split("|", 1)[0]
    # Drop the trailing 14 characters (presumably the ' Album Review ' suffix - confirm against live titles)
    return before_pipe[:-14]

In [9]:
@retry.retry(urllib.error.URLError, tries=4, delay=3, backoff=2)
def urlopen_with_retry(url):
    '''
    Thin wrapper around urlopen. The retry decorator is configured for urllib.error.URLError with
    tries=4, delay=3 and backoff=2, so the retry policy is defined entirely by those arguments.
    VARIABLES
    url - Passed straight through to urlopen; the caller in this notebook passes a urllib Request object.
    '''
    return urlopen(url)

Load historic data

In [171]:
# Load list of album review links from every file saved under new_links/
links_file_list = os.listdir("new_links")

links_list = []
for links_file_name in links_file_list:
    # 'with' closes each file as soon as it has been read instead of leaving the handle open
    with open(os.path.join("new_links", links_file_name)) as links_file:
        links_list += links_file.read().splitlines()

# De-duplicate across files; sorting keeps the order the same from run to run
links_list = sorted(set(links_list))
links_list = [x for x in links_list if x.startswith("https")]
print(f'{len(links_list)} album review links')

9976 album review links


Get the link of each album review and save the list to a text file

In [165]:
%%time
new_links = []
i = 1
while requests.get("https://pitchfork.com/reviews/albums/?page="+str(i)).status_code==200:
    req = Request("https://pitchfork.com/reviews/albums/?page=" + str(i))
    html_page = urlopen_with_retry(req)
    print(f'{i}: html_page status code {html_page.getcode()}')

    soup = BeautifulSoup(html_page, "lxml")
    
    for link in soup.findAll('a'):
        link_get_href = link.get('href')
        if link_get_href == '/reviews/albums/':
            pass
        elif link_get_href.startswith( '/reviews/albums/?genre=' ):
            pass
        elif link_get_href.startswith( '/reviews/albums/' ):
            new_links.append("https://pitchfork.com"+link_get_href)
    i+=1

new_links = list(set(new_links))

1: html_page status code 200
2: html_page status code 200
3: html_page status code 200
4: html_page status code 200
5: html_page status code 200
6: html_page status code 200
7: html_page status code 200
8: html_page status code 200
9: html_page status code 200
10: html_page status code 200
11: html_page status code 200
12: html_page status code 200
13: html_page status code 200
14: html_page status code 200
15: html_page status code 200
16: html_page status code 200
17: html_page status code 200
18: html_page status code 200
19: html_page status code 200
20: html_page status code 200
21: html_page status code 200
22: html_page status code 200
23: html_page status code 200
24: html_page status code 200
25: html_page status code 200
26: html_page status code 200
27: html_page status code 200
28: html_page status code 200
29: html_page status code 200
30: html_page status code 200
31: html_page status code 200
32: html_page status code 200
33: html_page status code 200
34: html_page statu

In [167]:
# Number of review links held in new_links
len(new_links)

9976

In [245]:
# Persist the collected links to a date-stamped text file, one link per line,
# because rebuilding the list means re-crawling the whole review index
links_path = f'new_links/new_links_{datetime.datetime.now().date()}.txt'
with open(links_path, 'w') as out_file:
    out_file.writelines("%s\n" % url for url in new_links)

Scrape reviews for each album and save to csv

In [None]:
# # test links
# new_links = ["https://pitchfork.com/reviews/albums/james-devane-searching/",
#              "https://pitchfork.com/reviews/albums/blood-incantation-absolute-elsewhere/",
#              "https://pitchfork.com/reviews/albums/body-meat-starchris/",
#              "https://pitchfork.com/reviews/albums/fred-again-actual-life-3-january-1-september-9-2022/",
#              "https://pitchfork.com/reviews/albums/100-gecs-10000-gecs/"]

In [59]:
%%time
dat = []
for idx, i in enumerate(new_links):
    print(f'{idx}: {i}')
    dat.append(gather_info(i))

new_reviews = pd.concat(dat).reset_index(drop=True)

0: https://pitchfork.com/reviews/albums/yaya-bey-ten-fold/
1: https://pitchfork.com/reviews/albums/yhapojj-ps-fuck-you/
2: https://pitchfork.com/reviews/albums/chat-pile-cool-world/
3: https://pitchfork.com/reviews/albums/nidia-and-valentina-estradas/
4: https://pitchfork.com/reviews/albums/mercury-rev-born-horses/
5: https://pitchfork.com/reviews/albums/sophie-sophie/
6: https://pitchfork.com/reviews/albums/latto-sugar-honey-iced-tea/
7: https://pitchfork.com/reviews/albums/trent-reznor-atticus-ross-challengers-original-score/
8: https://pitchfork.com/reviews/albums/porches-shirt/
9: https://pitchfork.com/reviews/albums/hovvdy-hovvdy/
10: https://pitchfork.com/reviews/albums/the-belair-lip-bombs-lush-life/
11: https://pitchfork.com/reviews/albums/moses-sumney-sophcore-ep/
12: https://pitchfork.com/reviews/albums/the-hard-quartet-the-hard-quartet/
13: https://pitchfork.com/reviews/albums/skaiwater-gigi/
14: https://pitchfork.com/reviews/albums/2hollis-boy/
15: https://pitchfork.com/rev

In [63]:
# new_reviews

In [55]:
# # Testing errors
# album_link = "https://pitchfork.com/reviews/albums/various-artists-even-the-forest-hums-ukrainian-sonic-archives/" 
# gather_info(album_link)

# page = requests.get(album_link) #request URL
# soup = BeautifulSoup(page.content, 'html.parser') #parse with beautifulsoup
# soup

In [67]:
# Write the scraped reviews to a date-stamped CSV, leaving out the DataFrame index
reviews_path = f'new_reviews/new_reviews_{datetime.datetime.now().date()}.csv'
new_reviews.to_csv(reviews_path, index=False)