In [69]:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import lyricsgenius
import requests
import api_key

import re
import pandas as pd
import os
from pathlib import Path
import time
from datetime import datetime
import xlsxwriter

In [68]:
# Shared Genius API client. The token lives in the local api_key module so it
# is not written into the notebook; section headers are stripped from lyrics
# via the client's remove_section_headers option.
client_access_token = api_key.my_api_key
LyricsGenius = lyricsgenius.Genius(client_access_token, remove_section_headers=True)

# Browser-like request headers passed to every direct requests.get() call below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

In [3]:
# Hyphenated artist name; this is the form interpolated into the album URL
# by get_song_titles().
artist = "-".join("Taylor Swift".split(" "))
artist

'Taylor-Swift'

# Define all functions

## Get the cleaned song titles of an album

In [30]:
def clean_up(song_title):
    """Return a bare song title from the raw text of an album-page entry.

    Parameters
    ----------
    song_title : str
        Raw title text, e.g. "Song (Ft. Someone) Lyrics" or "Song Lyrics".

    Returns
    -------
    str
        The title with any "(Ft ...)" credit and the "Lyrics" label removed,
        surrounding whitespace stripped, and "/" replaced by "-".
    """
    # Key on the "(Ft" marker itself rather than a bare "Ft" substring:
    # a title such as "Ft. Worth Blues" contains "Ft" but has no featuring
    # credit, and must be handled like any other plain title.
    before_ft = re.search(r".*(?=\(Ft)", song_title)
    if before_ft:
        title = before_ft.group(0)
    else:
        title = song_title.replace("Lyrics", "")

    return title.strip().replace("/", "-")

In [6]:
def title_contains_weird_strings(song_title) :
    """Tell whether a title carries one of the known non-song markers.

    The markers are "Reputation Magazine", "[Poem]", "[Prologue]",
    "[Foreword]" and "[Liner Notes]". Returns a bool.
    """
    non_song_markers = re.compile(r"Reputation Magazine|\[Poem\]|\[Prologue\]|\[Foreword\]|\[Liner Notes\]")
    return non_song_markers.search(song_title) is not None

In [31]:
def get_song_titles(album_name, artist) :
    """Scrape the raw song-title strings of one album from its Genius page.

    Parameters
    ----------
    album_name : str
        Album name as displayed, e.g. "Red (Taylor's Version)".
    artist : str
        Hyphenated artist name used in the URL, e.g. "Taylor-Swift".

    Returns
    -------
    list of str
        Text of every "h3.chart_row-content-title" element on the page, minus
        entries flagged by title_contains_weird_strings(). The strings are
        uncleaned; pass them through clean_up() for display titles.
    """
    # Turn the album name into the slug used in the URL. Both the straight and
    # the typographic apostrophe are accepted, since the Genius API returns
    # names written with the latter.
    album_name = re.sub(r"\(Taylor['’]s Version\)", "taylors-version", album_name)
    album_name = re.sub(r"[\[\]\(\)]", "", album_name)
    album_name = re.sub(r" ", "-", album_name)

    url = f"https://genius.com/albums/{artist}/{album_name}"

    # Retry until the page is downloaded. Only network-level failures are
    # retried, so Ctrl-C and genuine programming errors are no longer swallowed
    # by the loop.
    document = None
    while document is None :
        try :
            response = requests.get(url, headers=headers)
            document = BeautifulSoup(response.text, "html.parser")
        except requests.exceptions.RequestException as e :
            print(e)
            print("Sleep now... Try connecting again")
            time.sleep(5)

    # Extract song titles and drop the non-song entries
    song_title_tags = document.find_all("h3", attrs={"class": "chart_row-content-title"})
    return [
        tag.text
        for tag in song_title_tags
        if not title_contains_weird_strings(tag.text)
    ]

In [32]:
def get_clean_song_titles(song_titles) :
    """Run clean_up() over every raw title and return the results in order."""
    return [clean_up(raw_title) for raw_title in song_titles]

## Get the track numbers

In [34]:
def create_track_numbers(clean_songs) :
    """Return the 1-based track numbers [1, 2, ..., len(clean_songs)]."""
    track_count = len(clean_songs)
    return list(range(1, track_count + 1))

## Get featured artists of an album

In [38]:
def get_featured_artists(song_title):
    """Extract the featured-artist credit from a raw song title.

    Parameters
    ----------
    song_title : str
        Raw title text, possibly containing a "(Ft. Name)" credit.

    Returns
    -------
    str
        The text inside the first "(Ft. ...)" group with non-breaking spaces
        converted to regular spaces, or '' when the title has no such group.
    """
    normalized_title = song_title.replace("\xa0", " ")
    # Search for the credit itself instead of testing for a bare "Ft"
    # substring: a title that merely contains "Ft" has no credit to return.
    credit = re.search(r"\(Ft\.\s*(.*?)\)", normalized_title)
    return credit.group(1) if credit else ''

In [39]:
def get_featured_artists_in_one_album(album_name, artist) :
    """List the featured-artist credit of every song on an album.

    Fetches the album's raw titles with get_song_titles() and maps each one
    through get_featured_artists(); songs without a credit yield ''.
    """
    return [
        get_featured_artists(raw_title)
        for raw_title in get_song_titles(album_name, artist)
    ]

## Get additional information

In [55]:
def clean_lyrics(lyrics) :
    """Strip the page furniture that surrounds scraped lyrics.

    Parameters
    ----------
    lyrics : str or None
        Lyrics text as returned by the Genius client.

    Returns
    -------
    str
        The text between the leading "... Lyrics" header and the trailing
        "<digits>Embed" footer, with the "See ... $... You might also like"
        advertisement line replaced by a newline. If the header/footer pair
        is not present the input is kept as the body instead of raising;
        empty or missing lyrics give ''.
    """
    if not lyrics :
        return ''

    # Remove the non-lyric part, when it is there
    body_match = re.search(r'\sLyrics(.*?)\d*Embed', lyrics, re.DOTALL)
    lyrics_body = body_match.group(1) if body_match else lyrics

    # Remove the advertisement
    return re.sub(r"See.*\$.*You might also like", '\n', lyrics_body)

### Get release date, url and cleaned lyrics

In [56]:
def get_song_information(song_title, artist) :
    """Look a song up on Genius and return its release date, URL and lyrics.

    Parameters
    ----------
    song_title : str
        Title to search for.
    artist : str
        Artist name passed to the search.

    Returns
    -------
    list
        [release_date ("YYYY-MM-DD"), release_year, release_month,
        release_day, url, cleaned lyrics].

    Raises
    ------
    ValueError
        If the search returns no song, or if the displayed release date is not
        in the "Month day, year" form expected by strptime.
    """
    while True :
        try :
            song = LyricsGenius.search_song(title=song_title, artist=artist, get_full_info=True)
            break
        # requests' ConnectionError does not derive from the builtin of the
        # same name, so both must be listed for the retry to ever trigger.
        except (ConnectionError, requests.exceptions.ConnectionError) as e :
            print(e)
            print("Sleep now... Try connecting again")
            time.sleep(5)

    if song is None :
        # Fail with a message that names the song instead of an
        # AttributeError on None.to_dict().
        raise ValueError(f"No Genius result for song '{song_title}' by '{artist}'.")
    song_dict = song.to_dict()

    release_components = song_dict["release_date_components"]
    release_year = release_components['year']
    release_month = release_components['month']
    release_day = release_components['day']

    date_string = song_dict["release_date_for_display"]
    parsed_date = datetime.strptime(date_string, "%B %d, %Y")
    release_date = parsed_date.strftime("%Y-%m-%d")

    url = song_dict["url"]
    lyrics = song_dict["lyrics"]
    return [release_date, release_year, release_month, release_day, url, clean_lyrics(lyrics)]

### Get producers, writers and background vocals

In [57]:
def _credit_names(song_info_tags, label) :
    """Return the linked names of the first credit block labelled `label`, or None if no block has that label."""
    for tag in song_info_tags :
        if tag.find_all("div", string=label) :
            name_links = tag.find_all("a", attrs={"class": "StyledLink-sc-3ea0mt-0"})
            return [link.text for link in name_links]
    return None


def get_song_more_information(url_one_song) :
    """Scrape producer, writer and background-vocal credits from a song page.

    Parameters
    ----------
    url_one_song : str
        URL of the song page to download.

    Returns
    -------
    list
        [producer_names, writer_names, bg_vocal_names]. The first two are
        lists of str (empty when the page has no such credit); the third is a
        list of str, or '' when the page has no "Background Vocals" credit.
    """
    document = None
    while document is None :
        try :
            response = requests.get(url_one_song, headers=headers)
            document = BeautifulSoup(response.text, "html.parser")
        # requests' ConnectionError does not derive from the builtin of the
        # same name, so both must be listed for the retry to ever trigger.
        except (ConnectionError, requests.exceptions.ConnectionError) as e :
            print(e)
            print("Sleep now... Try connecting again")
            time.sleep(5)

    # NOTE(review): these class names look auto-generated by the site's build
    # and may change between deployments - confirm they still match the page.
    song_info_tags = document.find_all("div", attrs={"class": "SongInfo__Credit-nekw6x-3"})

    # A missing producer/writer credit yields an empty list instead of an IndexError.
    producer_names = _credit_names(song_info_tags, "Produced By") or []
    writer_names = _credit_names(song_info_tags, "Written By") or []

    # Background vocals keep their historical '' placeholder when absent.
    bg_vocal_names = _credit_names(song_info_tags, "Background Vocals")
    if bg_vocal_names is None :
        bg_vocal_names = ''

    return [producer_names, writer_names, bg_vocal_names]

### Combine both sets of additional information

In [58]:
def get_additional_info(clean_songs) :
    """Collect release, lyrics and credit information for a list of songs.

    Parameters
    ----------
    clean_songs : iterable of str
        Cleaned song titles to look up.

    Returns
    -------
    pandas.DataFrame
        One row per song, in input order, with the columns 'Release Date',
        'Release Year', 'Release Month', 'Release Day', 'URL', 'Lyrics',
        'Producer(s)', 'Writer(s)' and 'Background Vocal(s)'. Empty input
        gives an empty frame with the same columns.

    Notes
    -----
    The artist is read from the notebook-level variable `artist`, not from a
    parameter, so that cell must have been run first.
    """
    columns = ['Release Date', 'Release Year', 'Release Month', 'Release Day', 'URL',
               'Lyrics', 'Producer(s)', 'Writer(s)', 'Background Vocal(s)']

    # Gather plain rows and build the frame once, rather than enlarging a
    # DataFrame one row at a time.
    rows = []
    for song in clean_songs :
        basic_info = get_song_information(song, artist)
        song_url = basic_info[4]  # position of the URL in get_song_information's result
        credit_info = get_song_more_information(song_url)
        rows.append(basic_info + credit_info)

    return pd.DataFrame(rows, columns=columns)

# Loop all albums

## Get all album names

In [53]:
artist_id = "1177" # Artist ID of Taylor Swift

# Page through the artist's albums, 50 per request and at most 9 pages,
# stopping at the first empty page so no requests are sent past the end.
all_album_names = []
for page in range(1, 10) :
    album_list = LyricsGenius.artist_albums(artist_id, per_page=50, page=page)['albums']
    if not album_list :
        break
    all_album_names.extend(album_dict['name'] for album_dict in album_list)

all_album_names

['THE TORTURED POETS DEPARTMENT',
 'THE TORTURED POETS DEPARTMENT + Bonus Track “The Albatross”',
 'THE TORTURED POETS DEPARTMENT + Bonus Track “The Bolter”',
 'THE TORTURED POETS DEPARTMENT + Bonus Track ”The Black Dog”',
 'THE TORTURED POETS DEPARTMENT (Physical Version)',
 '1989 (Taylor’s Version) [Webstore Deluxe]',
 '1989 (Taylor’s Version) [Tangerine Edition]',
 '1989 (Taylor’s Version) [Deluxe]',
 '1989 (Taylor’s Version)',
 'The Cruelest Summer',
 'Speak Now (Taylor’s Version) [Digital Deluxe]',
 'Speak Now (Taylor’s Version)',
 'Midnights (The Til Dawn Edition)',
 'Midnights (The Late Night Edition)',
 'folklore: the long pond studio sessions (Record Store Day Exclusive)',
 'The More Lover Chapter',
 'The More Fearless (Taylor’s Version) Chapter',
 'The More Red (Taylor’s Version) Chapter',
 'Lavender Haze (Remixes)',
 'Lover (Live From Paris) Heart Shaped Vinyl',
 'Anti-Hero (Remixes) ',
 'Midnights (3am Edition)',
 'Midnights (Target Exclusive)',
 'Midnights (Apple Music Exc

In [72]:
# Hand-picked albums relevant to this notebook. Trailing notes name the
# songs for which a supplementary release is included.
album_names_used = [
    "Taylor Swift",
    "Fearless (Taylor's Version)",
    "Speak Now (Taylor's Version)",
    "Red (Taylor's Version)",
    "1989 (Taylor's Version) [Deluxe]",
    "reputation",
    "Lover",
    "folklore (deluxe version)",
    "evermore (deluxe version)",
    "Midnights (The Til Dawn Edition)",
    "Midnights (The Late Night Edition)",  # Get "You're Losing Me"
    "The More Red (Taylor's Version) Chapter",  # Get "Safe & Sound" and "Eyes Open"
    "The More Fearless (Taylor's Version) Chapter",  # Get "If This Was A Movie"
    "The More Lover Chapter",  # Get "All Of The Girls You Loved Before"
    # "THE TORTURED POETS DEPARTMENT",  # When the album is released!
]

In [59]:
def create_df_one_album(album, artist) :
    """Build the complete per-song table for one album.

    Parameters
    ----------
    album : str
        Album name as displayed, e.g. "Red (Taylor's Version)".
    artist : str
        Hyphenated artist name used to locate the album page.

    Returns
    -------
    pandas.DataFrame
        Columns 'Album', 'Song Title', 'Track Number', 'Featured Artist(s)'
        followed by the columns produced by get_additional_info().

    Raises
    ------
    ValueError
        If the basic and the enriched tables have different lengths.
    """
    # Get all songs from one album
    song_titles = get_song_titles(album, artist)
    clean_songs = get_clean_song_titles(song_titles)
    # Derive the featured artists from the titles already downloaded, so the
    # album page is fetched once and both lists describe the same snapshot.
    featured_artists = [get_featured_artists(song_title) for song_title in song_titles]
    track_numbers = create_track_numbers(clean_songs)

    df = pd.DataFrame(columns=['Album', 'Song Title', 'Track Number', 'Featured Artist(s)'])
    df['Song Title'] = clean_songs
    df['Track Number'] = track_numbers
    df['Featured Artist(s)'] = featured_artists
    df['Album'] = album

    # Enrich the information of the songs
    df_info = get_additional_info(clean_songs)

    if len(df) != len(df_info) :
        # Raise explicitly; falling through used to end in an
        # UnboundLocalError on the return statement.
        raise ValueError(f"The information for the album {album} can't be created. Different lengths of data.")

    return pd.concat([df, df_info], axis=1)

## Create Excel files with the complete data for each album

In [74]:
def create_excel_full_data(album, artist) :
    """Scrape one album and write its full table to an Excel workbook.

    The workbook is saved as
    "Taylor-Swift_<album>/<album>_full_album_info.xlsx", where <album> is the
    album name with spaces replaced by hyphens; the directory is created when
    missing. Note that the directory prefix is fixed and does not use `artist`.

    Parameters
    ----------
    album : str
        Album name as displayed, e.g. "Red (Taylor's Version)".
    artist : str
        Hyphenated artist name, forwarded to create_df_one_album().
    """
    album_name = re.sub(r" ", "-", album)
    directory_path = f"Taylor-Swift_{album_name}"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory_path, exist_ok=True)

    df_output = create_df_one_album(album, artist)
    file_name = f"{directory_path}/{album_name}_full_album_info.xlsx"

    # Excel worksheet names may not contain [ ] : * ? / \, may not start or
    # end with an apostrophe, and are limited to 31 characters. Names that
    # already satisfy these rules are passed through unchanged.
    sheet_name = re.sub(r"[\[\]:*?/\\]", "", album_name)[:31].strip("'") or "Sheet1"

    df_output.to_excel(file_name, sheet_name=sheet_name)

In [77]:
# Create an Excel file for each album. Run the calls one at a time to avoid a ConnectionError!
# create_excel_full_data("Taylor Swift", artist)
# create_excel_full_data("Fearless (Taylor's Version)", artist)
# create_excel_full_data("Speak Now (Taylor's Version)", artist)
# create_excel_full_data("Red (Taylor's Version)", artist)
# create_excel_full_data("1989 (Taylor's Version) [Deluxe]", artist)
# create_excel_full_data("reputation", artist)
# create_excel_full_data("Lover", artist)
# create_excel_full_data("folklore (deluxe version)", artist)
# create_excel_full_data("evermore (deluxe version)", artist)
# create_excel_full_data("Midnights (The Til Dawn Edition)", artist)
# create_excel_full_data("Midnights (The Late Night Edition)", artist)
# create_excel_full_data("The More Red (Taylor's Version) Chapter", artist)
# create_excel_full_data("The More Fearless (Taylor's Version) Chapter", artist)
# create_excel_full_data("The More Lover Chapter", artist)