# Pitchfork review scraper

In [52]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import datetime as dt

In [53]:
def gather_links(pages, startPage, timeout=30):
    """Collect the album-review links listed on a range of index pages.

    Requests ``pages`` consecutive listing pages of
    ``https://pitchfork.com/reviews/albums/`` starting at ``startPage`` and
    returns the ``href`` of every element carrying the ``review__link`` class,
    in page order.

    Args:
        pages: Number of listing pages to read.
        startPage: Number of the first listing page to read.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of href strings (one per review link found).

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    linkList = []
    # One session reuses the underlying connection across all page requests.
    with requests.Session() as session:
        for page_number in range(startPage, startPage + pages):
            response = session.get(
                "https://pitchfork.com/reviews/albums/?page=" + str(page_number),
                timeout=timeout,
            )
            # Parse each page as soon as it arrives so the raw responses are
            # not all held in memory at once.
            soup = BeautifulSoup(response.content, 'html.parser')
            for anchor in soup.find_all(class_="review__link"):
                href = anchor.get('href')
                # Skip elements without an href; a None entry would break the
                # URL concatenation done by the consumer of this list.
                if href:
                    linkList.append(href)
    return linkList

In [54]:
def _class_text(soup, class_name):
    """Return the text of the first element with ``class_name`` as a plain str, or None."""
    element = soup.find(class_=class_name)
    if element is None or element.string is None:
        return None
    # str() detaches the value from the parse tree (``.string`` is a bs4
    # NavigableString, not a plain str).
    return str(element.string)


def gather_info(album_link, timeout=30):
    """Scrape one review page into a single-row DataFrame.

    Args:
        album_link: Site-relative path of the review, appended to
            ``https://pitchfork.com``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``artist``, ``album``,
        ``score``, ``genre``, ``review``, ``best`` and ``date``. ``score``,
        ``genre`` and ``date`` are None when the page does not provide them;
        ``best`` is 1 when the text "Best new" occurs in the review text,
        otherwise 0.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    page = requests.get("https://pitchfork.com" + album_link, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')

    # The <title> text carries both artist and album; fall back to an empty
    # string instead of crashing when the page has no usable title.
    title_tag = soup.find('title')
    if title_tag is not None and title_tag.string is not None:
        title = str(title_tag.string)
    else:
        title = ''

    score_text = _class_text(soup, "score")
    try:
        score = float(score_text) if score_text is not None else None
    except ValueError:
        # Score element present but not numeric.
        score = None

    genre = _class_text(soup, "genre-list__link")
    date = _class_text(soup, "pub-date")

    # Review text: the text of every <p> element, joined with single spaces.
    string = " ".join(element.text for element in soup.find_all('p'))

    d = {
        'artist': [get_artist(title)],
        'album': [get_album(title)],
        'score': [score],
        'genre': [genre],
        'review': [string],
        'best': [1 if "Best new" in string else 0],
        'date': [date],
    }
    return pd.DataFrame(data=d)

In [55]:
def get_artist(title):
    """Return the artist part of a page title.

    The artist is everything before the first ':' in ``title``. When the
    title contains no ':', the whole title is returned.

    Args:
        title: Page title text.

    Returns:
        The text preceding the first colon.
    """
    # partition() compares by value; the previous `is not ':'` test compared
    # object identity, which only works by accident of string interning.
    artist, _, _ = title.partition(':')
    return artist

In [56]:
def get_album(title):
    """Return the album part of a page title.

    Takes the text between the first ':' and the first '|' and removes a
    trailing " Album Review" marker when present, e.g.
    ``"Artist: Album Album Review | Pitchfork"`` gives ``"Album"``.

    Args:
        title: Page title text.

    Returns:
        The album name with surrounding whitespace removed. When the title
        has no ':', the text before the first '|' is used as a whole.
    """
    suffix = " Album Review"
    _, colon, after_colon = title.partition(":")
    # Without a colon there is no artist prefix to skip, so keep the full
    # title rather than dropping its first character.
    remainder = after_colon if colon else title
    album = remainder.partition("|")[0].rstrip()
    # Only cut the marker when it is really there; slicing a fixed number of
    # characters would eat into the album name on differently shaped titles.
    if album.endswith(suffix):
        album = album[:-len(suffix)]
    return album.strip()

In [57]:
def gather(pages, startPage, fileLocation, fileName):
    """Scrape a range of listing pages and export every review to one CSV file.

    Args:
        pages: Number of listing pages to read.
        startPage: Number of the first listing page to read.
        fileLocation: Directory the CSV file is written to.
        fileName: Name of the CSV file, without the ``.csv`` extension.

    Returns:
        True once the CSV file has been written. Errors raised while
        scraping or writing propagate to the caller.
    """
    linkList = gather_links(pages, startPage)
    # Build all single-row frames first and concatenate once:
    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
    # frame on every iteration.
    frames = [gather_info(link) for link in linkList]
    if frames:
        newDF = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat rejects an empty list; keep exporting an empty frame.
        newDF = pd.DataFrame()
    newDF.to_csv(path_or_buf=fileLocation + "/" + fileName + ".csv")
    return True

In [61]:
# Scrape 1016 listing pages starting at page 1 in a single run and print
# the elapsed wall-clock time.
start = dt.datetime.now()
gather(
    pages=1016,
    startPage=1,
    fileLocation="C:\\Users\\sanderdv\\OneDrive - University of Cincinnati\\School\\Second Year\\Spring 2019\\Machine Learning and Data Analytics\\Project",
    fileName="pitchfork1016",
)
print(str(dt.datetime.now() - start))

ConnectionError: HTTPSConnectionPool(host='pitchfork.com', port=443): Max retries exceeded with url: /reviews/albums/22747-helices-ep/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x000001D402EA7C50>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [62]:
# Scrape 254 listing pages starting at page 1 into pitchfork1.csv and print
# the elapsed wall-clock time.
start = dt.datetime.now()
gather(
    pages=254,
    startPage=1,
    fileLocation="C:\\Users\\sanderdv\\OneDrive - University of Cincinnati\\School\\Second Year\\Spring 2019\\Machine Learning and Data Analytics\\Project",
    fileName="pitchfork1",
)
print(str(dt.datetime.now() - start))

1:13:54.559121


In [63]:
# Scrape 254 listing pages starting at page 255 into pitchfork2.csv and print
# the elapsed wall-clock time.
start = dt.datetime.now()
gather(
    pages=254,
    startPage=255,
    fileLocation="C:\\Users\\sanderdv\\OneDrive - University of Cincinnati\\School\\Second Year\\Spring 2019\\Machine Learning and Data Analytics\\Project",
    fileName="pitchfork2",
)
print(str(dt.datetime.now() - start))

1:15:25.205973


In [64]:
# Scrape 254 listing pages starting at page 509 into pitchfork3.csv and print
# the elapsed wall-clock time.
start = dt.datetime.now()
gather(
    pages=254,
    startPage=509,
    fileLocation="C:\\Users\\sanderdv\\OneDrive - University of Cincinnati\\School\\Second Year\\Spring 2019\\Machine Learning and Data Analytics\\Project",
    fileName="pitchfork3",
)
print(str(dt.datetime.now() - start))

1:25:00.634204


In [65]:
# Scrape 254 listing pages starting at page 763 into pitchfork4.csv and print
# the elapsed wall-clock time.
start = dt.datetime.now()
gather(
    pages=254,
    startPage=763,
    fileLocation="C:\\Users\\sanderdv\\OneDrive - University of Cincinnati\\School\\Second Year\\Spring 2019\\Machine Learning and Data Analytics\\Project",
    fileName="pitchfork4",
)
print(str(dt.datetime.now() - start))

1:32:41.913302
