# Importing Modules

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
import os

# UtaNet Class

This was my initial attempt at scraping lyrics. I ended up not using this class due to the following reasons:

1. The `search_utanet_lyrics()` method sometimes returns an `AttributeError` when extracting the song information on [Uta-Net](https://www.uta-net.com/). More specifically, the desired text chunk is not found, which returns `None`. Then finding another text chunk within the first chunk means calling `None.find()`, which most likely causes the error (see [Tests](#tests)). This method worked in an earlier version of this class, but the website underwent a change since I last updated the code.

2. The [Uta-Net](https://www.uta-net.com/) database has less songs (at least the ones I wanted) compared to the [Lyrical Nonsense](https://www.lyrical-nonsense.com/) database.

3. The query search is pretty strict because it must have the exact song title and artist to work properly. Unlike the `LyricsExtract` class, the `UtaNet` class does not make use of any search engines (which are more flexible with keywords).

In [2]:
class UtaNet:
    """Scrapes lyrics from Uta-Net (https://www.uta.net.com)."""
    def __init__(self):
        """Initializes the class."""
        pass

    def get_beautiful_soup_html(self, url, params=None):
        """Returns the HTML of a URL as a BeautifulSoup object."""
        # open the webpage of the URL and extract the HTML text
        with requests.Session() as session:
            response = session.get(url, params=params)
            html = response.text
        
        # return the HTML as a BeautifulSoup object
        soup = BeautifulSoup(html, "html.parser")
        return soup
    
    def get_utanet_lyrics(self, url):
        """Takes a Uta-Net URL to a song and returns the lyrics with line breaks intact."""
        # get the HTML text (a BeautifulSoup object) from the URL
        soup = self.get_beautiful_soup_html(url)

        # extract the section containing the lyrics
        lyrics = soup.find("div", class_="row pt-4 pe-2 ps-lg-3 pb-lg-4 kashi").find("div", itemprop="text")
        
        # clean the lyrics - keep line breaks & remove div tags
        lyrics = str(lyrics).replace("<br/>", "\n")
        lyrics = re.sub(r"</?div.*?>", "", lyrics)
        
        return lyrics

    def search_utanet_lyrics(self, artist, song):
        """Given an artist and one of their songs, returns the corrected artist name and lyrics with line breaks intact.
        If no such entry exists on Uta-Net, prints the available song entries for the given artist and returns False.
        """
        # parameters to search on Uta-Net
        params = {
            'Aselect': 1,
            'Keyword': artist,
            'Bselect': 3,
            'x': 0,
            'y': 0
        }
        
        # extract the results, which are contained in tables
        soup = self.get_beautiful_soup_html("https://www.uta-net.com/search/", params=params)
        tables = soup.find_all("tbody")

        # find the HTML section that contains the song name
        entries = []
        for table in tables:
            entries += table.find_all("tr")
        
        # find the match for the song
        song_no = None
        for entry in entries:
            if entry.find("td", class_="side td1").find("a", text=song):
                song_no = entry.find("td", class_="side td1").find("a")["href"]
                break
        
        # case 1: if the song was not found, print a list of available songs from the artist
        if not song_no:
            available_songs = [entry.find("td", class_="side td1").find("a").get_text() for entry in entries]
            print("The artist and song combination was not found on Uta-Net.")
            print("Available songs for {}:".format(artist))
            print(available_songs)
            print("-" * 20)
            return False
        
        # case 2: if the song was found, extract the lyrics and return it
        time.sleep(3)
        song_url = "https://www.uta-net.com" + song_no
        lyrics = self.get_utanet_lyrics(song_url)
        
        return lyrics
    
    def check_lyrics_file(self, artist, track, folder):
        """Checks if the lyrics file exists on the user's computer."""
        file_name = f"{artist}「{track}」.txt"
        path = os.path.join(folder, file_name)
        return os.path.isfile(path)
    
    def get_all_lyrics(self, artist, folder="artists", buffer_time=7):
        """For a given artist, outputs text files of the lyrics to every one of their songs.
        For ethical web-scraping, the default buffer time between each scrape is 7 seconds (like a human visitor).
        """
        # if the directory folder/artist doesn't exist, create a new directory
        artist_folder = f"{folder}/{artist}"
        if not os.path.isdir(artist_folder):
            os.makedirs(artist_folder)

        # parameters to search on Uta-Net
        params = {
            'Aselect': 1,
            'Keyword': artist,
            'Bselect': 4
        }
        
        # extract the results, which are contained in tables
        soup = self.get_beautiful_soup_html("https://www.uta-net.com/search/", params=params)
        tables = soup.find_all("tbody")

        # case 1: if there are no results, then return None
        if not tables:
            print("There were no entries found for this artist.")
            print("Make sure to write the exact artist name you're looking for.")
            return None

        # case 2: if there are results, then proceed

        # find the HTML section that contains the song name
        entries = []
        for table in tables:
            entries += table.find_all("tr")

        # extract the song number on Uta-Net, song title, and artist
        for entry in entries:
            song_no = entry.find("td", class_="side td1").find("a")["href"]
            song_title = entry.find("td", class_="side td1").find("a").get_text()
            song_artist = entry.find("td", class_="td2").find("a").get_text()

            # if the lyrics file doesn't exist in the given folder, create the file if possible
            if not self.check_lyrics_file(song_artist, song_title, artist_folder):
                time.sleep(buffer_time)
                song_url = "https://www.uta-net.com" + song_no
                lyrics = self.get_utanet_lyrics(song_url)

                file_name = f"{song_artist}「{song_title}」.txt"
                path = os.path.join(artist_folder, file_name)

                # case 1: create the lyrics file
                try:
                    with open(path, "w") as out_file:
                        out_file.write(lyrics)
                    print(f"* {song_artist}「{song_title}」: File created")
                
                # case 2: error in creating the lyrics file
                except FileNotFoundError:
                    print(f"! {song_artist}「{song_title}」: Error creating file")

            # case 3: if the lyrics file already exists, then no need to do anything
            else:
                print(f"{song_artist}「{song_title}」: File already exists")
        
        print("Finished.")

# Tests

In [3]:
utanet = UtaNet()
utanet.get_beautiful_soup_html("https://www.uta-net.com/song/317311/")

<!DOCTYPE html>

<html lang="ja">
<head>
<meta charset="utf-8"/>
<title>Official髭男dism ミックスナッツ 歌詞 - 歌ネット</title>
<link as="style" href="https://ures.jp/uta-net.com/css/3rd/bootstrap/bootstrap.css" rel="preload"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0" name="viewport"/>
<meta content="telephone=no" name="format-detection"/>
<meta content="「ミックスナッツ/Official髭男dism」の歌詞 って「イイネ！」" property="og:title"/>
<meta content="「袋に詰められたナッツのような世間では 誰…」勇気をもらったり、泣けたり、癒されたり…、この歌詞をチェックしてみて！人の心を打つ「言葉」がぎっしり！" property="og:description"/>
<meta content="https://www.uta-net.com/song/317311/" property="mixi:device-smartphone"/>
<meta content="notitle, nodescription" name="mixi-check-robots"/>
<meta content="https://www.uta-net.com/reverse/mixi_check/uta_net_logo_m.gif" property="og:image"/>
<meta content="Official髭男dismの「ミックスナッツ」歌詞ページです。作詞:藤原聡,作曲:藤原聡。SPY×FAMILY オープニング (歌いだし)袋に詰められたナッツのような 歌ネットは無料の歌詞検索サービスです。" na

In [4]:
lyrics = utanet.get_utanet_lyrics("https://www.uta-net.com/song/317311/")
print(lyrics)

袋に詰められたナッツのような世間では
誰もがそれぞれ出会った誰かと寄り添い合ってる
そこに紛れ込んだ僕らはピーナッツみたいに
木の実のフリしながら　微笑み浮かべる

幸せのテンプレートの上　文字通り絵に描いたうわべの裏
テーブルを囲み手を合わすその時さえ　ありのままでは居られないまま

隠し事だらけ　継ぎ接ぎだらけのHome, you know？
噛み砕いても無くならない　本音が歯に挟まったまま
不安だらけ　成り行き任せのLife, and I know
仮初めまみれの日常だけど　ここに僕が居て　あなたが居る
この真実だけでもう　胃がもたれてゆく

化けの皮剥がれた一粒のピーナッツみたいに
世間から一瞬で弾かれてしまう　そんな時こそ
曲がりなりで良かったらそばに居させて
共に煎られ　揺られ　踏まれても　割れない殻みたいになるから

生まれた場所が木の上か地面の中か　それだけの違い
許されないほどにドライなこの世界を　等しく雨が湿らせますように

時に冷たくて　騒がしい窓の向こうyou know？
星の一つも見つからない　雷に満ちた日があっても良い
ミスだらけ　アドリブ任せのShow, but I know
所詮ひとかけの日常だから　腹の中にでも　流して寝よう

隠し事だらけ　継ぎ接ぎだらけのHome, you know？
とっておきも出来合いも　残さずに全部食らいながら
普通などない　正解などないLife, and I know
仮初めまみれの日常だけど　ここに僕が居て　あなたが居る
この真実だけでもう　胃がもたれてゆく
この一掴みの奇跡を　噛み締めてゆく


In [5]:
utanet.check_lyrics_file("Official髭男dism", "ミックスナッツ", "artists")

False

In [6]:
utanet.get_all_lyrics("ヨルシカ", "夜行")

AttributeError: 'NoneType' object has no attribute 'find'