In [None]:
# https://github.com/evanm31/p4k-scraper/blob/master/data/scrapefork.py

In [None]:
# !python -m pip install retry
# !python -m pip install backoff

In [None]:
# Import packages and set up environment
import urllib
import re
import time
import retry
import backoff
import pandas as pd
import requests
import datetime
import os
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# Directory that holds the notebook's data (e.g. the "pitchfork_links" folder read below).
# Set the MUSIC_RATINGS_DIR environment variable to use a different location.
PROJECT_DIR = os.environ.get("MUSIC_RATINGS_DIR", "/Users/kylezengo/Desktop/DS/Music Ratings/")

# Only change directory when the target actually exists, so the notebook does not
# fail with FileNotFoundError on machines that lack this path; in that case all
# relative paths are resolved against the current working directory instead.
if os.name == "posix" and os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)

In [None]:
def gather_info(album_link, max_retries=5, retry_delay=2):
    '''
    This function downloads a single Pitchfork album review page, parses its HTML and gathers attributes
    like artist name, album, genre, score, date, and the review text itself. Each attribute is first looked
    up by CSS class; when that lookup fails, a fallback parses the paragraph text or the <title> string.
    All of the data are put into a one-row Pandas dataframe and returned.
    VARIABLES
    album_link - A string with the full URL of a Pitchfork review (it is passed to requests.get unchanged).
    e.g. 'https://pitchfork.com/reviews/albums/neil-young-promise-of-the-real-visitor/'
    max_retries - How many times the request is repeated after a non-200 response before giving up.
    retry_delay - Seconds to sleep before each repeated request.
    RETURNS
    A Pandas dataframe with one row and the columns artist, album, score, genre, review, best,
    reviewed_date and link.
    RAISES
    requests.HTTPError (4xx/5xx status) or RuntimeError (any other non-200 status) once the retries are
    used up. IndexError / AttributeError when the page does not contain the expected text or elements.
    '''
    page = requests.get(album_link) #request URL

    # Retry a bounded number of times; an unbounded loop would hang forever on e.g. a 404.
    failed_attempts = 0
    while page.status_code != 200:
        print("Error: ",page.status_code)
        failed_attempts += 1
        if failed_attempts > max_retries:
            page.raise_for_status() #raises for 4xx/5xx responses
            raise RuntimeError(f'Unexpected status {page.status_code} for {album_link}')
        time.sleep(retry_delay)
        page = requests.get(album_link) #request URL

    # Parse only once, after a successful response
    soup = BeautifulSoup(page.content, 'html.parser') #parse with beautifulsoup

    title = str(soup.find('title').string) #album and artist
    sents = [element.text for element in soup.find_all('p')] #cleaned text output
    all_text = " ".join(sents)

    # Everything after 'Reviewed: ' starts with three space-separated tokens (used below as the
    # fallback review date); the fourth piece is the remaining text, which holds the review.
    selected_text = all_text.split('Reviewed: ',1)[1]
    selected_text = selected_text.split(" ",3)
    review_text = selected_text[3]
    # Cut the trailing sign-up boilerplate off the review text
    review_text = review_text.split(" By signing up you agree to our User Agreement (including the class action waiver and arbitration",1)[0]

    # Score: try the known class names in turn; the last lookup is allowed to raise
    try:
        score = float((soup.find(class_="score").string)) #score
    except AttributeError:
        try:
            score = float((soup.find(class_="BaseWrap-sc-gjQpdd BaseText-ewhhUZ Rating-bkjebD iUEiRd bwCcXY imqiqZ").string))
        except Exception:
            # score = float((soup.find(class_="BaseWrap-sc-gjQpdd BaseText-ewhhUZ Rating-iATjmx iUEiRd bwCcXY dBMsvl").string))
            score = float((soup.find(class_="BaseWrap-sc-gjQpdd BaseText-ewhhUZ Rating-bkjebD iUEiRd bwCcXY fuVxVq").string))
    try:
        genre = soup.find(class_="genre-list__link").string #genre
    except AttributeError:
        # Fall back to the text between 'Genre: ' and 'Label: '
        genre = all_text.split("Genre: ",1)[1]
        genre = genre.split("Label: ",1)[0].strip()
    try:
        reviewed_date = str(soup.find(class_="pub-date").string) #date
    except AttributeError:
        reviewed_date = selected_text[0]+" "+selected_text[1]+" "+selected_text[2]
    # 'except Exception' instead of a bare 'except' so Ctrl-C (KeyboardInterrupt) is not swallowed
    try:
        artist = soup.find(class_="BaseWrap-sc-gjQpdd BaseText-ewhhUZ SplitScreenContentHeaderArtist-ftloCc iUEiRd Byyns kRtQWW").string
    except Exception:
        artist = get_artist(title)
    try:
        album = soup.find(class_="BaseWrap-sc-gjQpdd BaseText-ewhhUZ SplitScreenContentHeaderHed-lcUSuI iUEiRd ckzqqn fTtZlw").string
    except Exception:
        album = get_album(title)

    df = pd.DataFrame({'artist': [artist]
                       ,'album': [album]
                       ,'score': [score]
                       ,'genre': [genre]
                       ,'review': [review_text]
                       ,'best': [1 if "best new" in all_text.lower() else 0] #flag: page text mentions "best new"
                       ,'reviewed_date': [reviewed_date]
                       ,'link':[album_link]})
    return df

def get_artist(title):
    '''
    This function retrieves the artist name from the scraped title string: everything that comes
    before the first ':'. A title without a ':' is returned unchanged.
    VARIABLES
    title - A string of a cleaned Pitchfork album review title.
    '''
    # partition splits on the first ':' only and yields the whole string when there is none
    before_colon, _, _ = title.partition(":")
    return before_colon
        
def get_album(title):
    '''
    This function retrieves the album name from the scraped title string, which is expected to look
    like 'Artist: Album Album Review | Pitchfork'. The album is the text after the first ': ' and
    before the first '|', with the trailing ' Album Review' removed when it is present.
    Titles that lack the ':' or the ' Album Review' suffix are handled without cutting off real text.
    VARIABLES
    title - A string of a cleaned Pitchfork album review title.
    '''
    suffix = " Album Review"

    _, colon, remainder = title.partition(":")
    if colon:
        # drop the single space that separates the artist's ':' from the album name
        if remainder.startswith(" "):
            remainder = remainder[1:]
    else:
        # no artist prefix at all: treat the whole title as the album part
        remainder = title

    # keep only what precedes the '|' site-name separator
    album = remainder.split("|", 1)[0].rstrip()

    # strip the suffix only when it is really there, rather than slicing a fixed 14 characters
    if album.endswith(suffix):
        album = album[:-len(suffix)]
    return album

In [None]:
@retry.retry(urllib.error.URLError, tries=4, delay=3, backoff=2)
def urlopen_with_retry(url):
    '''
    This function opens a URL with urllib's urlopen and is retried by the retry decorator whenever
    urllib.error.URLError is raised. The decorator is configured with tries=4, delay=3 and backoff=2
    (per the retry package: at most 4 attempts, waiting 3 seconds first and doubling the wait each time).
    VARIABLES
    url - Whatever urlopen accepts; the link-scraping loop in this notebook passes a urllib Request object.
    '''
    return urlopen(url)

In [None]:
# Load the album review links saved by earlier runs (one text file per run, one link per line)
links_dir = "pitchfork_links"
links_file_list = os.listdir(links_dir)

links_list = []
for file_name in links_file_list:
    # the context manager closes each file once it has been read
    with open(os.path.join(links_dir, file_name)) as links_file:
        links_list += links_file.read().splitlines()

# De-duplicate, keep only real links (lines starting with "https"), and sort so the
# resulting order is the same on every run
links_list = sorted({x for x in links_list if x.startswith("https")})
print(f'{len(links_list)} album review links')

In [None]:
%%time
# Get the link of each album review
# Takes over 14 hours to run!

new_links = []
# for page_number in range(1,1987): # 1987 final page on 2021-08-18
for page_number in range(1,5):
    req = Request("https://pitchfork.com/reviews/albums/?page=" + str(page_number))

    # Log the status of the response that is actually parsed instead of issuing a
    # second request just for the log line; the context manager closes the response.
    with urlopen_with_retry(req) as html_page:
        print(f'{page_number}: {html_page.status}')
        soup = BeautifulSoup(html_page, "lxml")

    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            # anchors without an href attribute return None
            continue
        if href == '/reviews/albums/' or href.startswith('/reviews/albums/?'):
            # the index page itself and its query-string variants (genre filters,
            # pagination) are not individual reviews
            continue
        if href.startswith('/reviews/albums/'):
            new_links.append("https://pitchfork.com"+href)

# De-duplicate; sorted so the order (and the saved file) is the same on every run
new_links = sorted(set(new_links))

In [None]:
# Sanity check: how many unique review links are currently held in new_links
len(new_links)

In [None]:
# Write the collected links to a text file named after today's date, one link per line,
# so the slow link-collection step does not have to be repeated
links_path = f'pitchfork_links/pitchfork_links_{datetime.datetime.now().date()}.txt'
with open(links_path, 'w') as out_file:
    out_file.writelines("%s\n" % review_link for review_link in new_links)

In [None]:
# # test links
# new_links = ["https://pitchfork.com/reviews/albums/james-devane-searching/",
#              "https://pitchfork.com/reviews/albums/blood-incantation-absolute-elsewhere/",
#              "https://pitchfork.com/reviews/albums/body-meat-starchris/",
#              "https://pitchfork.com/reviews/albums/fred-again-actual-life-3-january-1-september-9-2022/",
#              "https://pitchfork.com/reviews/albums/100-gecs-10000-gecs/"]

In [None]:
%%time
dat = []
failed_links = []  # (link, error) pairs for the pages that could not be scraped
for idx, review_link in enumerate(new_links):
    print(f'{idx}: {review_link}')
    try:
        dat.append(gather_info(review_link))
    except Exception as err:
        # One bad page must not abort a multi-hour run and discard everything gathered
        # so far: record the failure and carry on with the next link.
        print(f'  failed: {err!r}')
        failed_links.append((review_link, repr(err)))

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
reviews = pd.concat(dat, ignore_index=True) if dat else pd.DataFrame()

if failed_links:
    print(f'{len(failed_links)} of {len(new_links)} links failed; see failed_links')

In [None]:
# reviews.to_csv("/Users/kylezengo/Desktop/reviews.csv",index=False)