In [14]:
# add libraries
import requests
import json
import pandas as pd
import os
import pprint
import time
import re

# spotipy
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# lyrics genius
from lyricsgenius import Genius

# Data Acquisition
### Grab data from different API sources (Spotify, Genius, Billboard 100)

In [None]:
# helper used to read an API credential from a local file
def get_file_contents(filename):
    """Return the text stored in `filename` with surrounding whitespace removed.

    The file is expected to hold a single line containing an API key.
    When the file does not exist, a message is printed and None is returned.
    """
    try:
        handle = open(filename, 'r')
    except FileNotFoundError:
        print("'%s' file not found" % filename)
        return None

    with handle:
        contents = handle.read()
    return contents.strip()

#### Spotify

In [None]:
# Paths of the local files that each hold one Spotify credential
spotify_client = "../spotify_client"
spotify_client_secret = "../spotify_client_secret"
spotify_redirect = "../spotify_redirect"

# Export the credentials under the SPOTIPY_* environment variable names.
# get_file_contents() returns None when a file is missing, and assigning None
# to os.environ fails with an unhelpful TypeError, so stop with a clear
# message that names the missing file instead.
for env_name, credential_path in (
    ('SPOTIPY_CLIENT_ID', spotify_client),
    ('SPOTIPY_CLIENT_SECRET', spotify_client_secret),
    ('SPOTIPY_REDIRECT_URI', spotify_redirect),
):
    credential = get_file_contents(credential_path)
    if credential is None:
        raise FileNotFoundError(
            "Cannot set %s: credential file '%s' was not found" % (env_name, credential_path)
        )
    os.environ[env_name] = credential


# Spotify id of the artist whose albums are fetched in the cells below
spotify_artist_id = "06HL4z0CvFAxyc27GXpf02"

Endpoint usage: get the artist's albums -> for each album, get its tracks -> for each track, get the track's audio features

In [None]:
# # artist albums
# scope = "user-library-read"

# sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

# results = sp.current_user_saved_tracks()
# for idx, item in enumerate(results['items']):
#     track = item['track']
#     print(idx, track['artists'][0]['name'], " – ", track['name'])

In [None]:
# permission scope requested for the Spotify OAuth token
scope = "user-library-read"

# Spotify API client used by all later cells.
# NOTE(review): SpotifyOAuth is given only the scope here; presumably it picks up
# the SPOTIPY_* environment variables set in the earlier cell — confirm
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))

In [None]:
# Spotify URI built from the artist id
taylor_url = 'spotify:artist:' + spotify_artist_id

# first page of results, restricted to album_type='album' and country='CA'
results = sp.artist_albums(taylor_url, album_type='album', country='CA')
albums = results['items']

# keep requesting pages until the response no longer links to a next one
while True:
    if not results['next']:
        break
    results = sp.next(results)
    albums += results['items']

# for album in albums:
#     print(album['id'])

# for album in albums:
#     print(album['name'])

In [None]:
# The album list contains duplicates, because an explicit version of a song
# results in a separate album entry, so the albums to analyse are picked by hand.
# Singles could potentially be added after this step.
# Preference: Taylor's Version and Deluxe albums (a deluxe version is usually a
# re-release with more songs and some acoustic versions).

# Album name and the position noted for it in `albums`:
# Midnights (3am Edition)        0
# Red (Taylor's Version)         4
# Fearless (Taylor's Version)    6
# evermore (deluxe version)      7
# folklore (deluxe version)     13
# Lover                         17
# reputation                    19
# 1989 (deluxe version)         26
# Taylor Swift                  45
# Speak Now (Deluxe Edition)    36
#
# NOTE(review): the positions in the table above do not match the positions
# used below (every entry except "Taylor Swift" is offset by 2) — confirm which
# set is current; positional picks break whenever the album listing changes.
picked_albums = [
    albums[position]
    for position in (2, 6, 8, 9, 15, 19, 21, 28, 45, 38)
]

In [None]:
# sanity check: print the name of every picked album, one per line
for album in picked_albums:
    print(album['name'])

In [None]:
# Collect every track of the picked albums. `album_names` and
# `album_release_date` are kept aligned one-to-one with `tracks` so they can
# later be attached as DataFrame columns.
tracks = []
album_names = []
album_release_date = []
for album in picked_albums:
    # album_tracks returns one page at a time, so follow the `next` links
    # (same paging pattern as the artist-albums request) to get all tracks
    results = sp.album_tracks(album['id'])
    album_tracks = list(results['items'])
    while results['next']:
        results = sp.next(results)
        album_tracks.extend(results['items'])

    tracks.extend(album_tracks)
    # repeat the album info once per track actually returned, rather than
    # album['total_tracks'] times, so the three lists can never drift apart
    album_names.extend([album['name']] * len(album_tracks))
    album_release_date.extend([album['release_date']] * len(album_tracks))

# track names in the same order as `tracks`, printed as a quick check
names_tracks = []
for track in tracks:
    names_tracks.append(track['name'])
    print(track['name'])

print(len(tracks))
# there were 197 tracks when this was originally run

In [None]:
# audio features and audio analysis are looked up by track id
# the audio-features request takes a comma separated list of Spotify ids, with a max of 100 ids

track_ids = [track['id'] for track in tracks]

In [None]:
# now we make the api calls to get the audio features
# at most 100 ids may be sent per request, so walk through track_ids in batches
# of 100; this works for any number of tracks and never sends an empty batch
audio_features = []

for start in range(0, len(track_ids), 100):
    results = sp.audio_features(track_ids[start:start + 100])
    audio_features.extend(results)

In [None]:
# preview the first few entries instead of dumping the whole list into the output
audio_features[:5]

In [None]:
# one row per track, one column per audio-feature field
first_df = pd.DataFrame(audio_features)

In [None]:
first_df.head()

In [None]:
# attach the descriptive columns collected alongside the audio features:
# the track name, the album the track belongs to, and that album's release date
first_df = first_df.assign(**{
    'Name': names_tracks,
    'Album Name': album_names,
    'Release Date': album_release_date,
})

In [None]:
first_df.head()

In [None]:
# the url / href / uri / type columns are not needed, so leave them out

first_df = first_df.drop(columns=['analysis_url', 'track_href', 'uri', 'type'])

In [None]:
first_df.head()

In [None]:
# per column: True when the column contains at least one missing value

first_df.isna().any()

In [None]:
# save the track table as a csv
# NOTE: to_csv is called with its defaults, so the DataFrame index is written
# as an extra, unnamed first column of the file
first_df.to_csv('../../data/picked_tracks.csv')

#### Genius

In [None]:
# lyrics are fetched with the lyricsgenius library by John Miller
# https://lyricsgenius.readthedocs.io/en/master/

# note: despite its name, this holds the path of the token file, not the token
GENIUS_API_TOKEN = "../genius_api"

# export the token, then build the Genius client from the exported value
os.environ['GENIUS_API_TOKEN'] = get_file_contents(GENIUS_API_TOKEN)
genius = Genius(os.environ['GENIUS_API_TOKEN'])


In [None]:
# artist = genius.search_artist("Taylor Swift", max_songs=3, sort="title")
# print(artist.songs)

In [None]:
# The cell that defined `artist` is commented out, so `artist.name` raised a
# NameError on a fresh kernel; pass the artist name directly instead.
song = genius.search_song("Lavender Haze", "Taylor Swift")

In [None]:
print(song.lyrics)

In [None]:
# make a giant corpus of the picked tracks lyrics to analyze using NLP
# or make a corpus out of the album track lyrics

In [None]:
# distinct album names, in order of first appearance in the track table
unique_albums = first_df['Album Name'].drop_duplicates().tolist()

In [None]:
unique_albums

In [None]:
# The Spotify titles of 1989 and Speak Now are wrong for the lyrics lookup, so
# swap them for the corrected titles.
# NOTE(review): presumably the replacements are the titles Genius lists — confirm
corrected_titles = {
    "1989 (Deluxe Edition)": "1989 (Deluxe)",
    "Speak Now (Deluxe Edition)": "Speak Now (Deluxe)",
}

# Rebuild the list rather than remove()/append() in place: the first run gives
# the same result (corrected titles moved to the end), and re-running the cell
# is a no-op instead of raising ValueError because the old title is already gone.
unique_albums = (
    [name for name in unique_albums if name not in corrected_titles]
    + [
        fixed
        for spotify_title, fixed in corrected_titles.items()
        if spotify_title in unique_albums
    ]
)

In [None]:
unique_albums

In [None]:
# options set on the shared Genius client before any album lookups
genius.remove_section_headers = True # Remove section headers (e.g. [Chorus]) from lyrics when searching
genius.skip_non_songs = False # Include hits thought to be non-songs (e.g. track lists)
genius.excluded_terms = ["(Remix)", "(Live)", "Memo", "Pop"] # Exclude songs with these words in their title
# NOTE(review): short terms such as "Pop" may also match inside longer titles —
# confirm how lyricsgenius matches excluded_terms

# disabled bulk download: one search + save per album, pausing 30s between albums
# for album in unique_albums:
#     album = genius.search_album(album, "Taylor Swift")
#     album.save_lyrics()
#     time.sleep(30)

In [None]:
# search Genius for an album named "Taylor Swift" (no artist is passed here)
album = genius.search_album("Taylor Swift")
# album.save_lyrics()
# display the hit to check which album was matched
album

# NOTE(review): the hit above was apparently not the English release, so the
# English version is identified by its Genius album id instead — confirm
# albumID
# 12682

In [None]:
# look the album up by its Genius album id (12682, noted for the English version)
album = genius.search_album(album_id=12682)
# save the album to disk
# NOTE(review): later cells read '../../data/lyrics/Lyrics_TaylorSwift.json' —
# confirm that the file written by save_lyrics() ends up at that path
album.save_lyrics()

In [2]:
# now time to parse the lyrics
# each song's lyrics are one string in the json, with the lines separated by \n

# JSON is UTF-8 text; pass the encoding explicitly so loading does not depend on
# the platform's default locale encoding
with open('../../data/lyrics/Lyrics_TaylorSwift.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

{'_type': 'album',
 'api_path': '/albums/12682',
 'artist': {'_type': 'artist',
            'api_path': '/artists/1177',
            'header_image_url': 'https://images.genius.com/a74ab9ea8f7314b6e015133c5bd0314a.1000x333x1.jpg',
            'id': 1177,
            'image_url': 'https://images.genius.com/866d31e6a0fb376d0117018b5913369f.1000x1000x1.png',
            'index_character': 't',
            'iq': 1544,
            'is_meme_verified': True,
            'is_verified': True,
            'name': 'Taylor Swift',
            'slug': 'Taylor-swift',
            'url': 'https://genius.com/artists/Taylor-swift'},
 'comment_count': 1,
 'cover_art_thumbnail_url': 'https://images.genius.com/be8b30abcf286f1bf996e82e7e96dc14.300x300x1.jpg',
 'cover_art_url': 'https://images.genius.com/be8b30abcf286f1bf996e82e7e96dc14.1000x1000x1.jpg',
 'cover_arts': [{'_type': 'cover_art',
                 'annotated': True,
                 'api_path': '/albums/12682',
                 'current_user_meta

In [51]:
def album_lyrics_to_rows(album_data):
    """Flatten one saved album into per-line lyric columns.

    Parameters
    ----------
    album_data : dict
        Album JSON as loaded from the lyrics file. Reads `name`,
        `release_date_components['year']` and, for each entry of `tracks`,
        `song['title']` and `song['lyrics']`.

    Returns
    -------
    dict
        Lists of equal length under the keys 'title', 'lyrics', 'line',
        'album' and 'year', with one entry per kept lyric line. 'line' is the
        0-based position of the kept line within its song.

    Skipped content
    ---------------
    * songs whose title contains '[Liner Notes]'
    * the header line (contains 'ContributorsTranslations')
    * the ad line inside the lyrics (contains 'Get tickets'), e.g.
      "See Taylor Swift LiveGet tickets as low as $1,111You might also like"
    * empty lines
    The last line of a song carries an embed footer ("<number>Embed"); it is
    stripped for any number, not only 25.
    """
    # trailing embed footer, e.g. "25Embed", "112Embed" or plain "Embed"
    embed_footer = re.compile(r'\d*Embed$')

    rows = {'title': [], 'lyrics': [], 'line': [], 'album': [], 'year': []}
    album = album_data['name']
    year = album_data['release_date_components']['year']

    for track in album_data['tracks']:
        song = track['song']
        title = song['title']

        # decide whether to skip before touching the lyrics, so an entry
        # without lyrics cannot fail on .split()
        if '[Liner Notes]' in title:
            continue

        lyrics = song.get('lyrics') or ''
        line_number = 0

        for line in lyrics.split("\n"):
            lower_line = line.lower()

            if 'contributorstranslations' in lower_line:
                continue

            if 'get tickets' in lower_line:
                continue

            # strip the footer first so a line that held only the footer is
            # dropped by the empty-line check instead of becoming a blank row
            line = embed_footer.sub('', line)

            if line == "":
                continue

            rows['year'].append(year)
            rows['album'].append(album)
            rows['title'].append(title)
            rows['lyrics'].append(line)
            rows['line'].append(line_number)
            line_number += 1

    return rows


# NOTE: the name `dict` shadows the builtin; it is kept because the following
# cells read the parsed columns from this name.
dict = album_lyrics_to_rows(data)


In [52]:
dict

{'title': ['Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Tim McGraw',
  'Picture to Burn',
  'Picture to Burn',
  'Picture to Burn',
  'Picture to Burn',
  'Picture to Burn',
  'Picture to Burn',
 

In [53]:
# turn the parsed columns into a table with one row per lyric line
taylor_swift_df = pd.DataFrame(dict)

# look at ten random rows
taylor_swift_df.sample(10)

In [56]:
# JSON is UTF-8 text; pass the encoding explicitly so loading does not depend on
# the platform's default locale encoding
with open('../../data/lyrics/Lyrics_RedTaylorsVersion.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [58]:
# Flatten the album currently loaded in `data` into per-line lyric columns.
# NOTE(review): this repeats the parsing loop of the earlier Taylor Swift cell;
# consider moving both into one shared function.
# NOTE: the name `dict` shadows the builtin; it is kept because the following
# cell reads the parsed columns from this name.
dict = {'title': [], 'lyrics': [], 'line': [], 'album': [], 'year': []}

# What is filtered out of the raw lyrics:
# * the first line is a header, not lyrics (contains 'ContributorsTranslations')
# * there is an ad inside the lyrics, e.g.
#   "See Taylor Swift LiveGet tickets as low as $1,111You might also like"
# * the last line ends with an embed footer ("<number>Embed"); it is stripped
#   for any number, not only 25
embed_footer = re.compile(r'\d*Embed$')

album = data['name']
year = data['release_date_components']['year']

for track in data['tracks']:
    song = track['song']
    title = song['title']

    # decide whether to skip before touching the lyrics, so an entry without
    # lyrics cannot fail on .split()
    if '[Liner Notes]' in title:
        continue

    lyrics = song.get('lyrics') or ''
    lines = lyrics.split("\n")
    line_number = 0  # 0-based position of the kept line within this song

    for line in lines:
        lower_line = line.lower()

        if 'contributorstranslations' in lower_line:
            continue

        if 'get tickets' in lower_line:
            continue

        # strip the footer first so a line that held only the footer is dropped
        # by the empty-line check instead of becoming a blank row
        line = embed_footer.sub('', line)

        if line == "":
            continue

        dict['year'].append(year)
        dict['album'].append(album)
        dict['title'].append(title)
        dict['lyrics'].append(line)
        dict['line'].append(line_number)
        line_number += 1


In [59]:
# turn the parsed columns into a table with one row per lyric line
red_df = pd.DataFrame(dict)

# look at ten random rows
red_df.sample(10)

Unnamed: 0,title,lyrics,line,album,year
832,The Moment I Knew (Taylor’s Version),"And they're all standing around me, singing",52,Red (Taylor’s Version),2021
434,Stay Stay Stay (Taylor’s Version),And I love you because you have given me,15,Red (Taylor’s Version),2021
1185,Message In A Bottle (Taylor’s Version) [From t...,A message in a bottle is all I can do,9,Red (Taylor’s Version),2021
605,The Lucky One (Taylor’s Version),"Oh, oh, oh",24,Red (Taylor’s Version),2021
197,All Too Well (Taylor’s Version),"And you've still got it in your drawer, even now",3,Red (Taylor’s Version),2021
266,22 (Taylor’s Version),But I'm feelin' twenty-two,14,Red (Taylor’s Version),2021
1274,Forever Winter (Taylor’s Version) [From the Va...,He spends most of his nights wishing it was ho...,2,Red (Taylor’s Version),2021
267,22 (Taylor’s Version),Everything will be alright if,15,Red (Taylor’s Version),2021
737,Starlight (Taylor’s Version),Like we dream impossible dreams,47,Red (Taylor’s Version),2021
1110,Nothing New (Taylor’s Version) [From the Vault],And I wake up (Wake up) in the middle of the n...,27,Red (Taylor’s Version),2021
