In [None]:
# 1. -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import lyricsgenius
import requests
import api_key

import re
import json
from math import ceil, floor
import pandas as pd
import os


from pathlib import Path
import time
from datetime import datetime
import xlsxwriter

In [None]:
# The Genius API token is read from the local `api_key` module rather than written in the notebook.
client_access_token = api_key.my_api_key
# Shared lyricsgenius client used by every search/album lookup in this notebook.
LyricsGenius = lyricsgenius.Genius(client_access_token)
# NOTE(review): presumably makes the client drop "[Chorus]"-style section headers
# from fetched lyrics — confirm against the lyricsgenius documentation.
LyricsGenius.remove_section_headers = True
# Browser-like User-Agent passed to the direct `requests.get` calls made by the scraping helpers.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

In [None]:
# Artist name with spaces replaced by hyphens; this form is interpolated into the
# Genius album URL and into the output directory / file names further down.
artist = "Taylor Swift"
artist = artist.replace(" ", "-")
artist

# Define all functions

## Get the cleaned song titles of an album

In [None]:
def clean_up(song_title):
    """Return a cleaned song title from a raw scraped title string.

    If the title carries a "(Ft" credit, only the text before that credit is
    kept. Otherwise every occurrence of the word "Lyrics" is removed. In both
    cases surrounding whitespace is stripped and "/" is replaced by "-" (the
    result is later used inside file names).

    Parameters
    ----------
    song_title : str
        Raw title text.

    Returns
    -------
    str
        The cleaned title.
    """
    # Test for "(Ft" (not just "Ft"): the pattern below needs the parenthesis,
    # and a title that merely contains the letters "Ft" would otherwise make
    # `.search()` return None and crash on `.group(0)`.
    if "(Ft" in song_title:
        before_ft_pattern = re.compile(r".*(?=\(Ft)")
        clean_song_title = before_ft_pattern.search(song_title).group(0)
    else:
        clean_song_title = song_title.replace("Lyrics", "")

    return clean_song_title.strip().replace("/", "-")

In [None]:
def title_contains_weird_strings(song_title) :
    """Tell whether a title matches one of the excluded markers.

    The markers are "Reputation Magazine", "[Poem]", "[Prologue]",
    "[Foreword]" and "[Liner Notes]"; a match anywhere in the title counts.

    Returns
    -------
    bool
        True if any marker occurs in ``song_title``, otherwise False.
    """
    excluded_markers = r"Reputation Magazine|\[Poem\]|\[Prologue\]|\[Foreword\]|\[Liner Notes\]"
    return re.search(excluded_markers, song_title) is not None

In [None]:
def get_song_titles(album_name, artist) :
    """Scrape the raw song titles listed on a Genius album page.

    The album name is turned into a URL slug ("(Taylor's Version)" becomes
    "taylors-version", brackets/parentheses/colons are dropped, spaces become
    hyphens) and the page ``https://genius.com/albums/{artist}/{slug}`` is
    downloaded. The request is retried every 5 seconds for as long as it
    fails with a ``requests`` error.

    Parameters
    ----------
    album_name : str
        Album name as displayed on Genius.
    artist : str
        Artist name in the form used in the Genius URL.

    Returns
    -------
    list of str
        Text of every ``<h3 class="chart_row-content-title">`` element, minus
        the titles rejected by ``title_contains_weird_strings``.
    """
    # Clean album name and artist
    album_name = re.sub(r"\(Taylor's Version\)", "taylors-version", album_name)
    album_name = re.sub(r"[\[\]\(\)\:]", "", album_name)
    album_name = re.sub(r" ", "-", album_name)

    # Get the HTML page
    URL = f"https://genius.com/albums/{artist}/{album_name}"
    document = None
    while document is None :
        try :
            response = requests.get(URL, headers=headers)
            document = BeautifulSoup(response.text, "html.parser")
        # Retry only on network-level failures; a bare `except` here would also
        # swallow KeyboardInterrupt and turn coding errors into an endless loop.
        except requests.exceptions.RequestException as e :
            print(e)
            print("Sleep now... Try connecting again")
            time.sleep(5)

    # Extract song titles and clean them
    song_title_tags = document.find_all("h3", attrs={"class": "chart_row-content-title"})
    song_titles = [song_title.text for song_title in song_title_tags]
    song_titles_legit = [song_title for song_title in song_titles if not title_contains_weird_strings(song_title)]
    return song_titles_legit

In [None]:
def get_clean_song_titles(song_titles) :
    """Apply ``clean_up`` to every raw title and return the results in order.

    Parameters
    ----------
    song_titles : iterable of str
        Raw titles.

    Returns
    -------
    list of str
        One cleaned title per input title.
    """
    return [clean_up(raw_title) for raw_title in song_titles]

## Get the track numbers

In [None]:
def create_track_numbers(clean_songs) :
    """Return the 1-based positions ``[1, 2, ..., len(clean_songs)]``."""
    song_count = len(clean_songs)
    return list(range(1, song_count + 1))

## Get featured artists of an album

In [None]:
def get_featured_artists(song_title):
    """Extract the featured-artist text from a raw song title.

    Non-breaking spaces are first replaced by regular spaces; the text between
    "(Ft." and the next ")" of the first such credit is then returned.

    Parameters
    ----------
    song_title : str
        Raw title text.

    Returns
    -------
    str
        The featured-artist text, or '' when the title has no "(Ft. ...)"
        credit (including titles that merely contain the letters "Ft").
    """
    normalized_title = song_title.replace("\xa0", " ")
    # `search` yields None instead of an empty list to index into, so a title
    # containing "Ft" without a credit no longer raises IndexError.
    credit = re.search(r"\(Ft\.\s*(.*?)\)", normalized_title)
    return credit.group(1) if credit else ''

In [None]:
def get_featured_artists_in_one_album(album_name, artist) :
    """Return the featured-artist string for each song on an album.

    The titles come from ``get_song_titles(album_name, artist)`` and each is
    passed through ``get_featured_artists``; the output list keeps that order.
    """
    return [
        get_featured_artists(raw_title)
        for raw_title in get_song_titles(album_name, artist)
    ]

## Get additional information

In [None]:
def clean_lyrics(lyrics) :
    """Strip the non-lyric wrapper and advertisement text from a lyrics string.

    Steps:
    1. Keep only the text between the first whitespace-preceded "Lyrics" and
       the following (optionally digit-prefixed) "Embed". If that wrapper is
       not present, the input is used unchanged instead of raising.
    2. Replace any single-line "See ... $ ... You might also like" span with a
       newline.
    3. Drop leading newlines.

    Parameters
    ----------
    lyrics : str
        Lyrics text to clean.

    Returns
    -------
    str
        The cleaned lyrics.
    """
    # Remove non-lyric part (fall back to the raw text when the wrapper is missing)
    wrapped = re.search(r'\sLyrics(.*?)\d*Embed', lyrics, re.DOTALL)
    lyrics_body = wrapped.group(1) if wrapped else lyrics

    # Remove the advertisement
    lyrics_clean = re.sub(r"See.*\$.*You might also like", '\n', lyrics_body)

    # `lstrip` is a no-op when there is no leading newline, so no branch is needed.
    return lyrics_clean.lstrip("\n")

### Get release date, url and cleaned lyrics

In [None]:
def get_song_information(song_title, artist) :
    """Look up a song on Genius and return its release date, URL, id and lyrics.

    The search is retried every 5 seconds while it fails with a timeout or
    connection error.

    Parameters
    ----------
    song_title : str
        Title passed to ``LyricsGenius.search_song``.
    artist : str
        Artist passed to ``LyricsGenius.search_song``.

    Returns
    -------
    list
        ``[release_date, release_year, release_month, release_day, url,
        song_id, cleaned_lyrics]`` where ``release_date`` is formatted
        "YYYY-MM-DD" and ``cleaned_lyrics`` is ``clean_lyrics(lyrics)``.

    Raises
    ------
    ValueError
        If the search returns no song, or (from ``strptime``) if
        ``release_date_for_display`` is not in "%B %d, %Y" form.
    """
    while True :
        try :
            song_object = LyricsGenius.search_song(title=song_title, artist=artist, get_full_info=True)
            break
        # requests' Timeout/ConnectionError are not subclasses of the builtin
        # TimeoutError, so catching only TimeoutError never triggered a retry.
        except (TimeoutError, requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e :
            print(e)
            print("Sleep now... Try connecting again")
            time.sleep(5)

    if song_object is None :
        # Fail with a clear message instead of "'NoneType' has no attribute 'to_dict'".
        raise ValueError(f"No Genius search result for song '{song_title}' by '{artist}'.")
    song_dict = song_object.to_dict()

    release_year = song_dict["release_date_components"]['year']
    release_month = song_dict["release_date_components"]['month']
    release_day = song_dict["release_date_components"]['day']

    # Convert the display date (e.g. "October 24, 2006") to ISO "YYYY-MM-DD".
    date_string = song_dict["release_date_for_display"]
    parsed_date = datetime.strptime(date_string, "%B %d, %Y")
    release_date = parsed_date.strftime("%Y-%m-%d")

    url = song_dict["url"]
    song_id = song_dict["id"]
    lyrics = song_dict["lyrics"]
    return [release_date, release_year, release_month, release_day, url, song_id, clean_lyrics(lyrics)]

### Get producers, writers and background vocals

In [None]:
def _get_credit_names(song_info_tags, credit_label) :
    """Return the linked names under the credit titled ``credit_label``, or '' if it is absent."""
    credit_tags = [tag for tag in song_info_tags if tag.find_all("div", string=credit_label)]
    if not credit_tags :
        return ''
    name_links = credit_tags[0].find_all("a", attrs={"class": "StyledLink-sc-3ea0mt-0"})
    return [name_link.text for name_link in name_links]


def get_song_more_information(url_one_song) :
    """Scrape producers, writers and background vocalists from a song page.

    The page at ``url_one_song`` is downloaded (retrying every 5 seconds on
    ``requests`` errors) and its ``div.SongInfo__Credit-nekw6x-3`` elements are
    searched for the "Produced By", "Written By" and "Background Vocals"
    credits.

    Parameters
    ----------
    url_one_song : str
        URL of the song page.

    Returns
    -------
    list
        ``[producer_names, writer_names, bg_vocal_names]``. Each entry is a
        list of names, or '' when the page has no such credit.
    """
    document = None
    while document is None :
        try :
            response = requests.get(url_one_song, headers=headers)
            document = BeautifulSoup(response.text, "html.parser")
        # requests raises its own exception hierarchy; the builtin TimeoutError
        # alone never matched, so failed requests were not retried.
        except (TimeoutError, requests.exceptions.RequestException) as e :
            print(e)
            print("Sleep now... Try connecting again")
            time.sleep(5)

    song_info_tags = document.find_all("div", attrs={"class": "SongInfo__Credit-nekw6x-3"})

    # A missing credit yields '' for all three fields (previously only background
    # vocals did; missing producers/writers raised IndexError).
    producer_names = _get_credit_names(song_info_tags, "Produced By")
    writer_names = _get_credit_names(song_info_tags, "Written By")
    bg_vocal_names = _get_credit_names(song_info_tags, "Background Vocals")
    return [producer_names, writer_names, bg_vocal_names]

### Get song length

In [None]:
def get_song_length(song_id) :
    """Return a song's duration as read from Genius's Apple Music player page.

    Downloads ``https://genius.com/songs/{song_id}/apple_music_player?react=1``
    (retrying every 5 seconds on ``requests`` errors), parses the JSON stored
    in the ``preview_track`` attribute of the ``<apple-music-player>`` tag and
    floors its ``duration`` value.

    Parameters
    ----------
    song_id : int or str
        Genius song id interpolated into the URL.

    Returns
    -------
    list
        ``[duration_seconds, duration_min_sec]`` where ``duration_min_sec`` is
        formatted as "<m> minutes <s> seconds".

    Raises
    ------
    ValueError
        If the page contains no ``<apple-music-player>`` tag.
    """
    url = f"https://genius.com/songs/{song_id}/apple_music_player?react=1"
    document = None
    while document is None :
        try :
            response = requests.get(url, headers=headers)
            document = BeautifulSoup(response.text, "html.parser")
        # requests errors do not derive from the builtin TimeoutError, so they
        # must be named explicitly for the retry to happen.
        except (TimeoutError, requests.exceptions.RequestException) as e :
            print(e)
            print("Sleep now... Try connecting again")
            time.sleep(5)

    apple_music_player_tag = document.find("apple-music-player")
    if apple_music_player_tag is None :
        # Explicit error instead of "'NoneType' object has no attribute 'attrs'".
        raise ValueError(f"No <apple-music-player> tag found for song id {song_id}.")
    track_preview = json.loads(apple_music_player_tag.attrs["preview_track"])

    # Get the duration
    duration_seconds = floor(track_preview["duration"])
    duration_min_sec = f"{duration_seconds // 60} minutes {duration_seconds % 60} seconds"

    return [duration_seconds, duration_min_sec]

### Combine the new information

In [None]:
def get_additional_info(clean_songs) :
    """Collect the per-song details for a list of cleaned song titles.

    For each title, in order, this calls ``get_song_information`` (with the
    module-level ``artist``), then ``get_song_length`` with the returned song
    id, then ``get_song_more_information`` with the returned URL, and appends
    the concatenated results as one row.

    Parameters
    ----------
    clean_songs : iterable of str
        Cleaned song titles.

    Returns
    -------
    pandas.DataFrame
        One row per song with the twelve columns listed below.
    """
    info_columns = ['Release Date', 'Release Year', 'Release Month', 'Release Day', 'URL',
                    'Song_ID', 'Lyrics', 'Song Length (sec)', 'Song Length (min)',
                    'Producer(s)', 'Writer(s)', 'Background Vocal(s)']
    df_info = pd.DataFrame(columns=info_columns)

    for song in clean_songs :
        # Release Date .. Lyrics; positions 4 and 5 hold the URL and the song id.
        core_fields = get_song_information(song, artist)
        song_url, genius_song_id = core_fields[4], core_fields[5]

        # Song Length (sec), Song Length (min)
        length_fields = get_song_length(genius_song_id)
        # Producer(s), Writer(s), Background Vocal(s)
        credit_fields = get_song_more_information(song_url)

        df_info.loc[len(df_info)] = core_fields + length_fields + credit_fields

    return df_info

# Loop all albums

## Get all album names

In [None]:
artist_id = "1177" # Artist ID of Taylor Swift

# Collect album names page by page (50 per page, at most 9 pages).
all_album_names = []
for page in range(1, 10) :
    # Use the `artist_id` variable rather than repeating the literal id.
    album_list = LyricsGenius.artist_albums(artist_id, per_page=50, page=page)['albums']
    if not album_list :
        # An empty page means the listing is exhausted; skip the remaining requests.
        break
    all_album_names.extend(album_dict['name'] for album_dict in album_list)

all_album_names

In [None]:
# Only select relevant albums
# Albums processed by this notebook. The names are the strings passed to the
# create_* helpers further down; the extra editions / "Chapter" releases are
# included only for the songs noted next to them.
album_names_used = ["Taylor Swift",
                    "Fearless (Taylor's Version)",
                    "Speak Now (Taylor's Version)",
                    "Red (Taylor's Version)",
                    "1989 (Taylor's Version) [Deluxe]",
                    "reputation",
                    "Lover",
                    "folklore (deluxe version)",
                    "evermore (deluxe version)",
                    "Midnights (The Til Dawn Edition)",
                    "Midnights (The Late Night Edition)", # Get "You're Losing Me"
                    "The More Red (Taylor's Version) Chapter", # Get "Safe & Sound" and "Eyes Open"
                    "The More Fearless (Taylor's Version) Chapter", # Get "If This Was A Movie"
                    "The More Lover Chapter", # Get "All Of The Girls You Loved Before"
                    "THE TORTURED POETS DEPARTMENT: THE ANTHOLOGY"
                   ]

In [None]:
def clean_album_name(album) :
    """Turn an album name into the slug used for directory and file names.

    Parentheses, square brackets, apostrophes and colons are removed, then
    every space is replaced by a hyphen.
    """
    without_punctuation = re.sub(r"[\(\)'\[\]\:]", "", album)
    return without_punctuation.replace(" ", "-")

## Create directory paths

In [None]:
# Make one output folder per selected album, named "<artist>_<cleaned album name>".
# `album` and `directory_path` are module-level names and stay bound to the
# last album's values once the loop has finished.
for album_name in album_names_used :
    album = clean_album_name(album_name)
    directory_path = f"{artist}_{album}"

    if os.path.exists(directory_path) :
        continue
    os.makedirs(directory_path)

## Create .txt file

In [None]:
def create_txt_file(album, artist) :
    """Write one ``.txt`` file of cleaned lyrics per song of an album.

    Files are written to ``"{artist}_{cleaned album name}/"`` and named
    ``"{track number}. {song}.txt"``, where double quotes in the song title
    are replaced by single quotes and question marks are removed.

    Parameters
    ----------
    album : str
        Album name as displayed on Genius.
    artist : str
        Artist name in the hyphenated form used for URLs and directories.

    Returns
    -------
    None
    """
    # Get all songs from one album
    song_titles = get_song_titles(album, artist)
    clean_songs = get_clean_song_titles(song_titles)
    album_name = clean_album_name(album)

    # Build the target folder from this call's arguments. The previous code
    # wrote to the module-level `directory_path`, which holds whatever album
    # the directory-creation loop processed last.
    album_directory = f"{artist}_{album_name}"
    os.makedirs(album_directory, exist_ok=True)

    for index, song in enumerate(clean_songs) :
        song_object = LyricsGenius.search_song(song, artist)
        if song_object is None :
            # Nothing to write for this track; keep going with the rest of the album.
            print(f"No Genius search result for {song}, skipped\n")
            continue
        cleaned_lyrics = clean_lyrics(song_object.to_dict()["lyrics"])

        # Make the title usable as a file name.
        song = song.replace("\"", "\'").replace("?", "")

        # Explicit UTF-8 so non-ASCII lyrics are written the same way on every platform.
        with open(f"{album_directory}/{index+1}. {song}.txt", "w", encoding="utf-8") as f :
            f.write(cleaned_lyrics)
        print(f"Finished writing {song}.txt\n")
    print(f"Finished writing all songs in the album {album} into .txt files")

In [None]:
# Create .txt files. Please do it separately to avoid Timeout Error!

# create_txt_file("Taylor Swift", artist)
# create_txt_file("Fearless (Taylor's Version)", artist)
# create_txt_file("Speak Now (Taylor's Version)", artist)
# create_txt_file("Red (Taylor's Version)", artist)
# create_txt_file("1989 (Taylor's Version) [Deluxe]", artist)
# create_txt_file("reputation", artist)
# create_txt_file("Lover", artist)
# create_txt_file("folklore (deluxe version)", artist)
# create_txt_file("evermore (deluxe version)", artist)
# create_txt_file("Midnights (The Til Dawn Edition)", artist)
# create_txt_file("Midnights (The Late Night Edition)", artist)
# create_txt_file("The More Red (Taylor's Version) Chapter", artist)
# create_txt_file("The More Fearless (Taylor's Version) Chapter", artist)
# create_txt_file("The More Lover Chapter", artist)
# create_txt_file("THE TORTURED POETS DEPARTMENT: THE ANTHOLOGY", artist)

## Create excel files for complete data for each album

In [None]:
def create_df_one_album(album, artist) :
    """Build the complete per-song DataFrame for one album.

    The basic columns ('Album', 'Song Title', 'Track Number',
    'Featured Artist(s)') are derived from the scraped album page and joined
    column-wise with the frame returned by ``get_additional_info``.

    Parameters
    ----------
    album : str
        Album name as displayed on Genius.
    artist : str
        Artist name in the form used in the Genius URL.

    Returns
    -------
    pandas.DataFrame
        One row per song.

    Raises
    ------
    ValueError
        If the basic and the additional data have different row counts.
    """
    # Get all songs from one album
    song_titles = get_song_titles(album, artist)
    clean_songs = get_clean_song_titles(song_titles)
    # Derive the featured artists from the titles already fetched instead of
    # downloading the album page a second time.
    featured_artists = [get_featured_artists(song_title) for song_title in song_titles]
    track_numbers = create_track_numbers(clean_songs)

    df = pd.DataFrame(columns=['Album', 'Song Title', 'Track Number', 'Featured Artist(s)'])
    df['Song Title'] = clean_songs
    df['Track Number'] = track_numbers
    df['Featured Artist(s)'] = featured_artists
    df['Album'] = album

    # Enrich the information of the songs
    df_info = get_additional_info(clean_songs)

    if len(df) != len(df_info) :
        # Previously this branch only printed and then hit an unbound
        # `df_complete` at the return statement.
        raise ValueError(f"The information for the album {album} can't be created. Different lengths of data.")

    return pd.concat([df, df_info], axis=1)

In [None]:
def create_excel_full_data(album, artist) :
    """Write the complete per-song data of one album to an Excel file.

    The file is saved as
    ``"{artist}_{cleaned album name}/{cleaned album name}_full_album_info.xlsx"``
    with the sheet named after the first 30 characters of the cleaned album
    name. The target directory is expected to exist already.

    Parameters
    ----------
    album : str
        Album name as displayed on Genius.
    artist : str
        Artist name in the hyphenated form used for URLs and directories.

    Returns
    -------
    None
    """
    album_name = clean_album_name(album)
    df_output = create_df_one_album(album, artist)
    file_name = f"{artist}_{album_name}/{album_name}_full_album_info.xlsx"

    # `to_excel` no longer accepts an `encoding` argument (removed in pandas 2.0),
    # so passing it raises TypeError on current pandas.
    df_output.to_excel(file_name, sheet_name=f"{album_name[:30]}")
    print(f"Finished writing dataframe for the album {album}\n")

In [None]:
# Create excel for each album. Please do it separately to avoid Timeout Error!

# create_excel_full_data("Taylor Swift", artist)
# create_excel_full_data("Fearless (Taylor's Version)", artist)
# create_excel_full_data("Speak Now (Taylor's Version)", artist)
# create_excel_full_data("Red (Taylor's Version)", artist)
# create_excel_full_data("1989 (Taylor's Version) [Deluxe]", artist)
# create_excel_full_data("reputation", artist)
# create_excel_full_data("Lover", artist)
# create_excel_full_data("folklore (deluxe version)", artist)
# create_excel_full_data("evermore (deluxe version)", artist)
# create_excel_full_data("Midnights (The Til Dawn Edition)", artist)
# create_excel_full_data("Midnights (The Late Night Edition)", artist)
# create_excel_full_data("The More Red (Taylor's Version) Chapter", artist)
# create_excel_full_data("The More Fearless (Taylor's Version) Chapter", artist)
# create_excel_full_data("The More Lover Chapter", artist)
# create_excel_full_data("THE TORTURED POETS DEPARTMENT: THE ANTHOLOGY", artist)

## Create the extended dataframe for each album

### Separating each line of the lyric into a row

In [None]:
def create_expanded_dataframe_lyric(album, artist) :
    """Return a DataFrame with one row per non-empty lyric line of an album.

    The album frame from ``create_df_one_album`` is walked row by row; each
    song's 'Lyrics' text is split on newlines, empty lines are dropped and the
    remaining lines are numbered from 1 within the song.

    Parameters
    ----------
    album : str
        Album name as displayed on Genius.
    artist : str
        Artist name in the form used in the Genius URL.

    Returns
    -------
    pandas.DataFrame
        Columns 'Album', 'Song Title', 'Line Number', 'Line'.
    """
    df_original = create_df_one_album(album, artist)

    # Collect plain rows and build the frame once; enlarging a DataFrame with
    # `.loc` for every lyric line copies the data on each append.
    expanded_rows = []
    for _, row in df_original.iterrows() :
        non_empty_lines = [line for line in row['Lyrics'].split("\n") if len(line) > 0]
        for line_number, line in enumerate(non_empty_lines, start=1) :
            expanded_rows.append([row['Album'], row['Song Title'], line_number, line])

    return pd.DataFrame(expanded_rows, columns=['Album', 'Song Title', 'Line Number', 'Line'])

In [None]:
def create_excel_expanded_full_data(album, artist) :
    """Write the line-by-line lyric data of one album to an Excel file.

    The file is saved as
    ``"{artist}_{cleaned album name}/{cleaned album name}_expanded_full_album_info.xlsx"``
    with the sheet named after the first 30 characters of the cleaned album
    name. The target directory is expected to exist already.

    Parameters
    ----------
    album : str
        Album name as displayed on Genius.
    artist : str
        Artist name in the hyphenated form used for URLs and directories.

    Returns
    -------
    None
    """
    album_name = clean_album_name(album)
    df_output = create_expanded_dataframe_lyric(album, artist)
    file_name = f"{artist}_{album_name}/{album_name}_expanded_full_album_info.xlsx"

    # `to_excel` no longer accepts an `encoding` argument (removed in pandas 2.0),
    # so passing it raises TypeError on current pandas.
    df_output.to_excel(file_name, sheet_name=f"{album_name[:30]}")
    print(f"Finished writing expanded dataframe for the album {album}.")

In [None]:
# Create extended dataframe for each album. Please do it separately to avoid Timeout Error!

# create_excel_expanded_full_data("Taylor Swift", artist)
# create_excel_expanded_full_data("Fearless (Taylor's Version)", artist)
# create_excel_expanded_full_data("Speak Now (Taylor's Version)", artist)
# create_excel_expanded_full_data("Red (Taylor's Version)", artist)
# create_excel_expanded_full_data("1989 (Taylor's Version) [Deluxe]", artist)
# create_excel_expanded_full_data("reputation", artist)
# create_excel_expanded_full_data("Lover", artist)
# create_excel_expanded_full_data("folklore (deluxe version)", artist)
# create_excel_expanded_full_data("evermore (deluxe version)", artist)
# create_excel_expanded_full_data("Midnights (The Til Dawn Edition)", artist)
# create_excel_expanded_full_data("Midnights (The Late Night Edition)", artist)
# create_excel_expanded_full_data("The More Red (Taylor's Version) Chapter", artist)
# create_excel_expanded_full_data("The More Fearless (Taylor's Version) Chapter", artist)
# create_excel_expanded_full_data("The More Lover Chapter", artist)
# create_excel_expanded_full_data("THE TORTURED POETS DEPARTMENT: THE ANTHOLOGY", artist)