In [None]:
# https://github.com/evanm31/p4k-scraper/blob/master/data/scrapefork.py

In [9]:
# !python -m pip install retry
# !python -m pip install backoff

In [7]:
import urllib
import re
import time
import retry
import backoff
import pandas as pd
import requests
import datetime
import os
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

In [139]:
def gather_info(album_link, max_tries=5, delay=2, timeout=30):
    '''
    This function downloads a Pitchfork album review page, parses its HTML, and attempts to gather attributes
    like artist name, album, score, genre, date, and the review text itself, inputting a null value instead
    when the requested element is not found on the page. All of the data are put into a one-row Pandas
    dataframe and returned for use by the scraping loop.
    VARIABLES
    album_link - A string holding the full URL of a Pitchfork review (it is passed directly to requests.get).
    e.g. 'https://pitchfork.com/reviews/albums/neil-young-promise-of-the-real-visitor/'
    max_tries - Maximum number of download attempts before giving up (values below 1 are treated as 1).
    delay - Seconds to sleep between failed attempts.
    timeout - Seconds to wait on the server for each attempt before requests raises a timeout error.
    RETURNS
    A dataframe with the columns artist, album, score, genre, review, best, reviewed_date and link.
    RAISES
    requests.HTTPError - if the page still does not answer with status 200 after max_tries attempts.
    '''
    # Bounded retry: a page that permanently answers 404/410 must not spin forever.
    max_tries = max(1, max_tries)
    for attempt in range(1, max_tries + 1):
        page = requests.get(album_link, timeout=timeout) #request URL
        if page.status_code == 200:
            break
        print("Error: ",page.status_code)
        if attempt < max_tries:
            time.sleep(delay)
    else:
        page.raise_for_status() #raises for 4xx/5xx responses
        raise requests.HTTPError(f"Unexpected status {page.status_code} for {album_link}", response=page)

    soup = BeautifulSoup(page.content, 'html.parser') #parse with beautifulsoup (only once the page is good)

    #album and artist both come from the <title> tag
    title_tag = soup.find('title')
    if title_tag is not None and title_tag.string is not None:
        title = str(title_tag.string)
    else:
        title = ''

    #score: try the older class name first, then the newer generated class names
    score = None
    score_classes = ("score", "BaseWrap-sc-gjQpdd BaseText-ewhhUZ Rating-bkjebD iUEiRd bwCcXY imqiqZ")
    for score_class in score_classes:
        score_tag = soup.find(class_=score_class)
        if score_tag is None:
            continue
        try:
            score = float(score_tag.string)
            break
        except (TypeError, ValueError):
            #element exists but holds no single numeric string; try the next selector
            continue

    #genre
    genre_tag = soup.find(class_="genre-list__link")
    genre = genre_tag.string if genre_tag is not None else None

    #cleaned text output: everything after 'Reviewed: ' is '<three date tokens> <review text>'
    all_text = " ".join(element.text for element in soup.find_all('p'))
    _, marker, after_marker = all_text.partition('Reviewed: ')
    text_date = None
    if marker:
        tokens = after_marker.split(" ", 3)
        if len(tokens) >= 3:
            text_date = " ".join(tokens[:3])
            review_text = tokens[3] if len(tokens) > 3 else ''
        else:
            review_text = after_marker
    else:
        #marker missing: keep the paragraph text rather than failing with an IndexError
        review_text = all_text

    #date: prefer the dedicated element, fall back to the date embedded in the text
    date_tag = soup.find(class_="pub-date")
    if date_tag is not None and date_tag.string is not None:
        reviewed_date = str(date_tag.string)
    else:
        reviewed_date = text_date

    df = pd.DataFrame({'artist': [get_artist(title) if title else None]
                       ,'album': [get_album(title) if title else None]
                       ,'score': [score]
                       ,'genre': [genre]
                       ,'review': [review_text]
                       ,'best': [1 if "Best new" in review_text else 0]
                       ,'reviewed_date': [reviewed_date]
                       ,'link':[album_link]})
    return df

def get_artist(title):
    '''
    This function retrieves the artist name from the scraped title string.
    The artist is everything before the first ':'; when the title holds no ':' the whole
    title is returned unchanged.
    VARIABLES
    title - A string of a cleaned Pitchfork album review title.
    '''
    #partition splits on the first ':' only and avoids shadowing the builtin str
    artist, _, _ = title.partition(":")
    return artist
        
def get_album(title):
    '''
    This function retrieves the album name from the scraped title string.
    The album is the text after the first ':' and before the first '|', with a trailing
    'Album Review' label removed when (and only when) it is actually present.
    VARIABLES
    title - A string of a cleaned Pitchfork album review title.
    e.g. 'Neil Young: The Visitor Album Review | Pitchfork' gives 'The Visitor'
    '''
    suffix = "Album Review"
    #start after the first ':'; with no ':' there is no artist prefix to skip
    _, colon, after_colon = title.partition(":")
    remainder = after_colon if colon else title
    #keep only what comes before the site name separator
    album = remainder.split("|", 1)[0].strip()
    #drop the label only if it is there, instead of blindly cutting 14 characters
    if album.endswith(suffix):
        album = album[:-len(suffix)].rstrip()
    return album

In [None]:
# Minimal one-row frame with the same construction style used in gather_info.
# Each value is wrapped in a list: a dict of bare scalars with no index makes
# pd.DataFrame raise "If using all scalar values, you must pass an index".
example_row = pd.DataFrame({'artist': ['ab']
              ,'album': ['b']
              ,'score': ['c']
              ,'genre': ['d']
              ,'link':['e']})
example_row

In [None]:
@retry.retry(urllib.error.URLError, tries=4, delay=3, backoff=2)
def urlopen_with_retry(url):
    '''
    This function opens a URL with urllib's urlopen and returns the response object.
    The retry decorator re-runs the call when it raises urllib.error.URLError, configured
    with tries=4, delay=3 and backoff=2 (per the retry package: at most 4 attempts, an
    initial 3 second wait, and the wait multiplied by 2 after each failure).
    VARIABLES
    url - A URL string or urllib.request.Request object, passed straight to urlopen.
    '''
    return urlopen(url)

In [None]:
# Load list of album review links
links_dir = "/Users/kylezengo/Desktop/DS/Music Ratings/pitchfork_links"
links_file_list = os.listdir(links_dir)

links_list = []
for i in links_file_list:
    links_path = os.path.join(links_dir, i)
    # Skip sub-directories and hidden files (e.g. macOS .DS_Store), which are not link lists
    if i.startswith(".") or not os.path.isfile(links_path):
        continue
    # The with-block closes each file as soon as it has been read
    with open(links_path) as links_file:
        links_list += links_file.read().splitlines()

# De-duplicate across files, then keep only entries that are full https URLs
links_list = list(set(links_list))
links_list = [x for x in links_list if x.startswith("https")]
print(f'{len(links_list)} album review links')

In [None]:
%%time
# Get the link of each album review
# Takes over 14 hours to run!

new_links = []
# for i in range(1,1987): # 1987 final page on 2021-08-18
for i in range(1,5):
    print(str(i)+":",requests.get("https://pitchfork.com/reviews/albums/?page=" + str(i)))
    req = Request("https://pitchfork.com/reviews/albums/?page=" + str(i))
    html_page = urlopen_with_retry(req)

    soup = BeautifulSoup(html_page, "lxml")
    
    for link in soup.findAll('a'):
        link_get_href = link.get('href')
        if link_get_href == '/reviews/albums/':
            pass
        elif link_get_href.startswith( '/reviews/albums/?genre=' ):
            pass
        elif link_get_href.startswith( '/reviews/albums/' ):
            new_links.append("https://pitchfork.com"+link_get_href)

new_links = list(set(new_links))

In [None]:
# Number of unique review links collected by the cell above
len(new_links)

In [None]:
# Persist the collected links, one URL per line, in a file stamped with today's date,
# so the slow link-collection step does not have to be repeated
links_out_path = f'/Users/kylezengo/Desktop/DS/Music Ratings/pitchfork_links/pitchfork_links_{datetime.datetime.now().date()}.txt'
with open(links_out_path, 'w') as out_file:
    out_file.writelines(f"{url}\n" for url in new_links)

In [None]:
%%time
dat = []
for idx, i in enumerate(new_links):
    print(f'{idx}: {i}')
    dat.append(gather_info(i))

reviews = pd.concat(dat)

In [37]:
# # Test if break on bad page
# gather_info("https://pitchfork.com/reviews/albums/james-devane-searching/")

In [141]:
# Spot-check the scraper on a single review page; the result is a one-row dataframe
gather_info("https://pitchfork.com/reviews/albums/james-devane-searching/")

Unnamed: 0,artist,album,score,genre,review,best,reviewed_date,link
0,James Devane,Searching,7.7,,In the 14 years between his first and second s...,0,"May 28, 2024",https://pitchfork.com/reviews/albums/james-dev...


In [None]:
# Rebuild the combined dataframe from the per-review rows in dat and display it
reviews = pd.concat(dat)
reviews

In [None]:
# reviews.to_csv("/Users/kylezengo/Desktop/reviews.csv",index=False)