# AOTY scraper notebook

This notebook scrapes album entries from albumoftheyear.org, collecting album information and review data. The number of new entries to scrape can be set for each run.

There are approximately 290,000 entries on the website, so scraping all of the data may take several days or even weeks. If you have already downloaded the dataset from GitHub or Kaggle, you can scrape only the newer entries. You also have the option to only update the entries that already exist in the dataset (by setting the `update_only` variable to `True`).

Steps:
1. Import the necessary libraries
2. Modify the number of new entries and set the update option to True or False (optional)
3. Load the dataset (if scraping for the second time)
4. Execute the cell containing the scraping script and wait until the scraping is completed
5. Repeat the process from step 2 or 3


## 1. Import the necessary libraries

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

## Determine the number of new entries in the dataset

In [None]:
# How many album pages to request in this run when scraping new entries
# (consecutive indexes following the last one stored in the dataset)
number_of_entries = 10_000
# False: scrape new entries and append them to the dataset file
# True: only re-scrape the indexes that already exist in the dataset
update_only = False

## Load data and print the last entry in the index column (for updating existing dataset)

In [None]:
# Load the existing dataset (if any) and work out which index range the next
# scraping run will cover. Defines:
#   data        - the loaded DataFrame (only when a dataset file was read)
#   dataset     - True when a dataset file was read, False otherwise
#   max_index   - highest value in the "Index" column, or 0 when there is none
#   total_index - last index that will be requested by a scraping run
try:
    data = pd.read_csv("database.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or completely empty file means "no dataset yet"; other
    # failures (unreadable or malformed file) are left to propagate so the
    # scraper does not later append a second header to a damaged file.
    print("No dataset file was detected")
    dataset = False
    # Start from the beginning so the scraping cell still has a valid range
    # on the very first run.
    max_index = 0
    total_index = number_of_entries
else:
    last_index = data["Index"].max()
    # max() is NaN when the file holds a header but no usable index values
    max_index = int(last_index) if pd.notna(last_index) else 0
    total_index = max_index + number_of_entries
    print("The last entry in the index column is "+str(max_index))
    print("The last entry in index column after scrapping will be "+str(total_index))
    dataset = True

## Do the scraping and append the newly generated entries to the dataset file

In [None]:
# Scrape album pages and store the results in database.csv.
#
# update_only == True : re-scrape every index already present in the loaded
#                       dataset and overwrite database.csv with the rows that
#                       could be scraped (entries that are skipped are not
#                       written back).
# update_only == False: scrape the next `number_of_entries` indexes after the
#                       last stored one and append them to database.csv.
values = []
base_url = "https://www.albumoftheyear.org/album/"
headers = {'User-Agent': 'Mozilla/5.0'}
request_timeout = 30  # seconds; prevents one stalled connection from hanging the run
columns = ["Index", "Artist", "Title", "Critic Score", "User Score",
           "Critic Reviews", "User Reviews", "Release Month", "Release Day",
           "Release Year", "Format", "Label", "Genre"]


def _fetch_album_soup(url):
    """Download `url` once and return its parsed HTML, or None if the request fails."""
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException:
        print("request error for "+url)
        return None
    return BeautifulSoup(response.text, "html.parser")


def _parse_release_date(date_row):
    """Return (month, day, year) strings read from the first detail row.

    Returns None when a short row has no year link, in which case the entry
    is skipped. The length threshold and slice offsets depend on the site's
    markup and are kept exactly as in the original scraper.
    """
    links = date_row.find_all("a")
    if len(date_row.text) < 22:
        # Short rows are treated as carrying a year only
        if not links:
            return None
        return "", "", links[0].text.strip()
    try:
        month = links[0].text
        year = links[1].text.strip()
    except IndexError:
        # Fewer links than expected: keep the entry with a blank date
        return "", "", ""
    return month, date_row.text[-24:-22].strip(), year


def _parse_label_and_genre(detail_rows):
    """Return (label, genre) from the detail rows that follow the format row."""
    third_row = detail_rows[2]
    if third_row.find("span").text == "/\xa0Genres":
        # No label row is present, so the third row is already the genre row
        label = ""
        genre_row = third_row
    else:
        # presumably the last 7 characters are the row's trailing caption
        label = third_row.text[:-7].strip()
        genre_row = detail_rows[3]
    first_genre = genre_row.find("a")
    genre = first_genre.text if first_genre is not None else ""
    return label, genre


def _parse_album(index, soup):
    """Build one dataset row (a list matching `columns`) from an album page.

    Returns None for pages that are deliberately skipped: no artist found,
    a critic score of "NR", or no release year. Raises AttributeError or
    IndexError when the page does not have the expected structure.
    """
    album = soup.find("div", {"class": "fullWidth"})
    try:
        artist = album.find("div", {"class": "artist"}).find("span").text
    except AttributeError:
        # Not an album page (for example an index with no album behind it)
        return None
    title = album.find("div", {"class": "albumTitle"}).find("span").text

    critic_box = album.find("div", {"class": "albumCriticScore"})
    if critic_box.text == "NR":
        return None
    critic_score = critic_box.find("a").text

    user_box = album.find("div", {"class": "albumUserScore"})
    user_score = "" if user_box.text == "NR" else user_box.find("a").text

    review_counts = album.find_all("div", {"class": "text numReviews"})
    critic_reviews = review_counts[0].find("span").text
    try:
        user_reviews = review_counts[1].find("strong").text
    except (IndexError, AttributeError):
        user_reviews = ""

    info_box = album.find("div", {"class": "albumTopBox info"})
    detail_rows = info_box.find_all("div", {"class": "detailRow"})
    release = _parse_release_date(detail_rows[0])
    if release is None:
        return None
    release_month, release_day, release_year = release

    # presumably the last 8 characters are the row's trailing caption
    album_format = detail_rows[1].text[:-8].strip()
    label, genre = _parse_label_and_genre(detail_rows)
    return [index, artist, title, critic_score, user_score, critic_reviews,
            user_reviews, release_month, release_day, release_year,
            album_format, label, genre]


def _scrape_albums(indexes, last_index):
    """Scrape every album index in `indexes` and return the collected rows.

    `last_index` is only used for the progress display. Pages that cannot be
    fetched or parsed are skipped so one bad page does not discard the rows
    already collected.
    """
    rows = []
    for index in indexes:
        url = base_url+str(index)
        soup = _fetch_album_soup(url)
        if soup is None:
            continue
        try:
            row = _parse_album(index, soup)
        except (AttributeError, IndexError):
            print("parse error for "+url)
            continue
        if row is None:
            continue
        rows.append(row)
        print(str(index)+"/"+str(last_index), end="\r")
    return rows


if update_only:
    if dataset:
        values = _scrape_albums(data["Index"], max_index)
        df = pd.DataFrame(values, columns=columns)
        df.to_csv("database.csv", mode="w", header=True, index=False)
    else:
        print("No dataset file was loaded, so there is nothing to update")
else:
    # Without a loaded dataset (or with one that has no indexes) start at 1
    last_stored = int(max_index) if dataset and pd.notna(max_index) else 0
    last_requested = last_stored + number_of_entries
    values = _scrape_albums(range(last_stored + 1, last_requested + 1), last_requested)
    df = pd.DataFrame(values, columns=columns)
    # Write the header only when the file does not already contain one
    df.to_csv("database.csv", mode="a", header=not dataset, index=False)