In [2]:
import pandas as pd
import numpy as np

# Get kaggle df

In [6]:
df = pd.read_csv("spotify_songs.csv")
len(df)

32833

# Cut columns

In [7]:
print(df.columns)
drop_columns_arr = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'playlist_id', 'track_album_id', 'key', 'mode', 'playlist_name', 'playlist_subgenre', 'track_id']


# could track_popularity be used to rank the results ? 
df = df.drop(columns=drop_columns_arr)

Index(['track_id', 'track_name', 'track_artist', 'track_popularity',
       'track_album_id', 'track_album_name', 'track_album_release_date',
       'playlist_name', 'playlist_id', 'playlist_genre', 'playlist_subgenre',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms'],
      dtype='object')


# Sorting & RM Duplicates

In [9]:
df = df.sort_values("track_popularity", ascending=False)

# don't like this because tracks could be the same name from different artists
df = df.drop_duplicates(subset=['track_name'], keep='first') 
df = df.reset_index(drop=True)
df

Unnamed: 0,track_name,track_artist,track_popularity,track_album_name,track_album_release_date,playlist_genre,duration_ms
0,Dance Monkey,Tones and I,100,Dance Monkey (Stripped Back) / Dance Monkey,2019-10-17,latin,209438
1,ROXANNE,Arizona Zervas,99,ROXANNE,2019-10-10,r&b,163636
2,Blinding Lights,The Weeknd,98,Blinding Lights,2019-11-29,latin,201573
3,Circles,Post Malone,98,Hollywood's Bleeding,2019-09-06,pop,215280
4,Tusa,KAROL G,98,Tusa,2019-11-07,rap,200960
...,...,...,...,...,...,...,...
23445,Drogadicto en Serie,Crack Family,0,La Familia (Capitulo 1),2010-10-12,rap,206106
23446,Levantando Bandera,Remik Gonzalez,0,Rolando Hits,2016,rap,246857
23447,Tomb (feat. Rasha Kamal),Adrinaline,0,Tomb (feat. Rasha Kamal),2018-10-11,rap,177293
23448,Somnia,Jay Hardway,0,Somnia,2016-09-19,edm,168000


In [10]:
df = df[:10000]

# Get lyrics

In [11]:
import requests
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv

load_dotenv()
API_TOKEN = os.getenv('api_key')
BASE_URL = 'https://api.genius.com'

# Function to search for a song
def search_song(artist, title):
    search_url = f"{BASE_URL}/search"
    headers = {'Authorization': f'Bearer {API_TOKEN}'}
    params = {'q': f"{title} {artist}"}
    response = requests.get(search_url, headers=headers, params=params)
    return response.json()

# Function to get song lyrics
def get_lyrics(song_api_path):
    song_url = f"{BASE_URL}{song_api_path}"
    headers = {'Authorization': f'Bearer {API_TOKEN}'}
    response = requests.get(song_url, headers=headers)
    json_response = response.json()
    path = json_response['response']['song']['path']
    
    # Fetch the song lyrics from the Genius website (not directly available via API)
    page_url = f"https://genius.com{path}"
    page = requests.get(page_url)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Find the lyrics container with the data-lyrics-container="true" attribute
    lyrics_div = soup.find('div', {'data-lyrics-container': 'true'})
    lyrics = lyrics_div.get_text(separator='\n') if lyrics_div else "Lyrics not found."
    return lyrics

def get_lyrics_for_song(artist, title, iteration):
    search_result = search_song(artist, title)
    if search_result['response']['hits']:
        song_api_path = search_result['response']['hits'][0]['result']['api_path']
        print(f"{iteration} - FOUND - Title: {title} - Artist: {artist}")
        return get_lyrics(song_api_path)            
    print(f"{iteration} - FAIL - Title: {title} - Artist: {artist}")
    return None

def get_lyrics_for_df(df: pd.DataFrame):
    df['lyrics'] = None
    
    for index, row in df.iterrows():
        try: 
            lyrics: str = get_lyrics_for_song(row['track_name'], row['track_artist'], index)   
        except:
            lyrics = None
        df.at[index, 'lyrics'] = lyrics

        try:
            df.to_csv('song_data_raw_with_new_lyrics.csv', index=True)
        except:
            print("when saving something when wrong!")

get_lyrics_for_df(df)
    





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lyrics'] = None


0 - FOUND - Title: Tones and I - Artist: Dance Monkey
1 - FOUND - Title: Arizona Zervas - Artist: ROXANNE
2 - FOUND - Title: The Weeknd - Artist: Blinding Lights
3 - FOUND - Title: Post Malone - Artist: Circles
4 - FOUND - Title: KAROL G - Artist: Tusa
5 - FOUND - Title: Roddy Ricch - Artist: The Box
6 - FOUND - Title: Maroon 5 - Artist: Memories
7 - FOUND - Title: Dua Lipa - Artist: Don't Start Now
8 - FOUND - Title: Trevor Daniel - Artist: Falling
9 - FOUND - Title: Billie Eilish - Artist: everything i wanted
10 - FOUND - Title: The Black Eyed Peas - Artist: RITMO (Bad Boys For Life)
11 - FOUND - Title: Justin Bieber - Artist: Yummy
12 - FOUND - Title: Billie Eilish - Artist: bad guy
13 - FOUND - Title: blackbear - Artist: hot girl bummer
14 - FOUND - Title: Regard - Artist: Ride It
15 - FOUND - Title: Lewis Capaldi - Artist: Someone You Loved
16 - FOUND - Title: Camila Cabello - Artist: My Oh My (feat. DaBaby)
17 - FOUND - Title: Travis Scott - Artist: HIGHEST IN THE ROOM
18 - FOUND

# Clean data

In [3]:

df = pd.read_csv("song_data_raw_with_lyrics.csv")

df = df.dropna(subset=['lyrics'])

df.reset_index(drop=True)

df.to_csv('song_data.csv', index=True)

# 3914 songs with lyrics - for testing ok


# Remove extra index

In [10]:
df = pd.read_csv("song_data.csv")

df.columns


Index(['Unnamed: 0', 'track_name', 'track_artist', 'track_popularity',
       'track_album_name', 'track_album_release_date', 'playlist_genre',
       'duration_ms', 'lyrics'],
      dtype='object')

In [11]:
df = df.drop(columns=['Unnamed: 0'])
df.to_csv("song_data.csv", index=False)