# goal strukture 

## phase one: 
>track_name, track_artist, id, lyrics (empty)

## phase two: 
>track_name, track_artist, lyrics, id

## phase three: 
>track_name, track_artist, lyrics, embedding, id


# Phase one preprocessing

In [2]:
import pandas as pd 
import numpy as np

# importing the raw data sets 

df_MSSS = pd.read_csv("../raw_data/Most_Streamed_Spotify_Songs_2024.csv", encoding='latin1')
df_SMSD = pd.read_csv("../raw_data/spotify_millsongdata.csv")
df_SS = pd.read_csv("../raw_data/spotify_songs.csv")

# cleaning them to only the two rows we care about 

df_MSSS_reduced = df_MSSS[["Artist", "Track"]].rename(columns={"Artist": "track_artist", "Track": "track_name"})
df_MSSS_reduced["track_lyrics"] = None

df_SMSD_reduced = df_SMSD[["artist", "song", "text"]].rename(columns={"artist": "track_artist", "song": "track_name", "text": "track_lyrics"}) # also has the text so here we are goingt to put the text from the stuff here already
df_SS_reduced = df_SS[["track_name", "track_artist"]]
df_SS_reduced["track_lyrics"] = None


# combine dfs 

df_base = df_MSSS_reduced
df_base = pd.concat([df_base, df_SMSD_reduced], ignore_index=True)
df_base = pd.concat([df_base, df_SS_reduced], ignore_index=True)

# del duplicates 
df_clean = df_base.drop_duplicates(subset=['track_artist', 'track_name'])

# adding ids

import uuid

def generate_uuid():
    return str(uuid.uuid4())

df_clean['track_id'] = [generate_uuid() for _ in range(len(df_clean))]

df = df_clean


# about 86000 rows 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_SS_reduced["track_lyrics"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['track_id'] = [generate_uuid() for _ in range(len(df_clean))]


# Phase two preprocessing

In [11]:
from dotenv import load_dotenv
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd

load_dotenv()

df = pd.read_csv("big_song_data.csv")

try: 
    df = df.drop('Unnamed: 0', axis=1)
except:
    pass


API_TOKEN = os.getenv('client_access_token')
BASE_URL = 'https://api.genius.com'


def search_song(artist: str, title: str):
    search_url = f"{BASE_URL}/search"
    headers = {'Authorization': f'Bearer {API_TOKEN}'}
    params = {'q': f"{title} {artist}"}
    response = requests.get(search_url, headers=headers, params=params)
    return response.json()

# Function to get song lyrics
def get_lyrics(song_api_path: str):
    song_url = f"{BASE_URL}{song_api_path}"
    headers = {'Authorization': f'Bearer {API_TOKEN}'}
    response = requests.get(song_url, headers=headers)
    json_response = response.json()
    path = json_response['response']['song']['path']
    
    # Fetch the song lyrics from the Genius website (not directly available via API)
    page_url = f"https://genius.com{path}"
    page = requests.get(page_url)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Find the lyrics container with the data-lyrics-container="true" attribute
    lyrics_div = soup.find('div', {'data-lyrics-container': 'true'})
    lyrics = lyrics_div.get_text(separator='\n') if lyrics_div else "Lyrics not found."
    return lyrics

def get_lyrics_for_song(artist: str, title: str, iteration: int) -> str:
    search_result = search_song(artist, title)
    if search_result['response']['hits']:
        song_api_path = search_result['response']['hits'][0]['result']['api_path']
        print(f"{iteration} - FOUND - Title: {title} - Artist: {artist}")
        return get_lyrics(song_api_path)            
    print(f"{iteration} - FAIL - Title: {title} - Artist: {artist}")
    print(search_result)
    return None

# iterating through the df in batches - something like 500 songs 
# throwing it at threads because its a task that can be perfectly threaded
# saving the batches to save progress 
"""
for idx, row in df.iterrows():
    if not pd.isna(row['track_lyrics']):
        continue

    artist = row["track_artist"]
    track_name = row["track_name"]

    lyrics = get_lyrics_for_song(artist, track_name, idx)
    df.at[idx, 'track_lyrics'] = lyrics
    df.to_csv("big_song_data.csv", index=False)
"""


'\nfor idx, row in df.iterrows():\n    if not pd.isna(row[\'track_lyrics\']):\n        continue\n\n    artist = row["track_artist"]\n    track_name = row["track_name"]\n\n    lyrics = get_lyrics_for_song(artist, track_name, idx)\n    df.at[idx, \'track_lyrics\'] = lyrics\n    df.to_csv("big_song_data.csv", index=False)\n'

# Threading

In [12]:
# now the multithread approach
import threading
import pandas as pd

df = pd.read_csv("big_song_data.csv")

# going here and slicing the original df into equal parts
part_count = 10
len_df = df.shape[0]
slice_size = len_df // part_count

df_name = []

def scrape_lyrics_df(df_path: str):
    print("thread started")
    part_df = pd.read_csv(df_path)

    for idx, row in part_df.iterrows():
        try:
            if not pd.isna(row['track_lyrics']):
                continue

            artist = row["track_artist"]
            track_name = row["track_name"]

            lyrics = get_lyrics_for_song(artist, track_name, idx)
            part_df.at[idx, 'track_lyrics'] = lyrics
            part_df.to_csv(df_path, index=False)
        except:
            print(f"Something went wrong with: {artist}, {track_name}")
    


for i in range(part_count):
    if i != part_count-1:
        df_part = df.iloc[i*slice_size:((i+1)*slice_size-1)]
    else: 
        df_part = df.iloc[i*slice_size:]
    part_name = f"tmp/big_song_data_part_{i}.csv"
    df_part.to_csv(part_name, index=False)
    df_name.append(part_name)

for df_path in df_name:
    print(df_path)
    a = threading.Thread(target=scrape_lyrics_df, args=(df_path,))
    a.start()




# give everyt thread one part of the fun 
# do the lyrics search 

# recombine the dfs 
# and save



tmp/big_song_data_part_0.csv
thread started
tmp/big_song_data_part_1.csv
thread started
tmp/big_song_data_part_2.csv
thread started
tmp/big_song_data_part_3.csv
thread started
tmp/big_song_data_part_4.csv
thread started
tmp/big_song_data_part_5.csv
thread started
tmp/big_song_data_part_6.csv
thread started
tmp/big_song_data_part_7.csv
thread started
tmp/big_song_data_part_8.csv
thread started
tmp/big_song_data_part_9.csv
thread started


8 - FAIL - Title: Danza Kuduro - Cover - Artist: MUSIC LAB JPN
{'meta': {'status': 200}, 'response': {'hits': []}}
0 - FOUND - Title: Zenci - Artist: LOKO BEN
0 - FOUND - Title: Leaving with Me - Artist: Next
1341 - FOUND - Title: I Don't Care (with Justin Bieber) - Loud Luxury Remix - Artist: Ed Sheeran
24 - FAIL - Title: Beat Automotivo Tan Tan Tan Viral - Artist: WZ Beat
{'meta': {'status': 200}, 'response': {'hits': []}}


Baller los Bazuka, bring' dein Business heut' auf Schufa
[?]
Rapper sind auf Aggro, auf puta fico schon klarem Zustand
Ja, ich will den ganzen Kuchen, bin hier nicht aus Zufall
Zenci, jebemti, ich komm vorbei und mach dich muerto
Comprender? Loco loco, Bruder bleibt Legende
Krimineller Südländer, Streetmember, Großhändler
Vatos Locos, mach' Alarm wie der 11. September
Deutsche Rapper alles Blender, ich bin Einzelgänger
Xatar, wie ein Käfigkämpfer demolieren, [was 'n Penner?]
Ich dreh' heut' mein Mafia-Film, fliegen so wie Aladin
Erzähl mir nichts von Kokain, Loco handelt doch mit Benzin
H&M Verein bekommt heute mal seinen Preis
Stiftung Warentest, Hurensohn Nummer 1
Irgendwann mal stürm'n wir rein, machen den Scheißladen klein
(Irgendwann mal stürm'n wir rein, machen den Scheißladen klein)
[Hook]
Meine Jungs sind Täter
Nachts auf der Street, verbrennen das Paper
Kunden wollen Beyda, ich krieg zu viel, scheiß' auf dein Zehner
Meine Jungs sind Täter
Nachts auf der Street, verbrennen das 

29 - FAIL - Title: Danza Kuduro - Cover - Artist: LOVE BGM JPN
{'meta': {'status': 200}, 'response': {'hits': []}}
40 - FAIL - Title: Cupid ï¿½ï¿½ï¿½ Twin Ver. (FIFTY FIFTY) ï¿½ï¿½ï¿½ Spe - Artist: sped up 8282
{'meta': {'status': 200}, 'response': {'hits': []}}
1 - FOUND - Title: Fuck Tha Police - Artist: N.W.A.


Are you leaving with me, me, me, me?
Now you leaving with me, me, me, me
Are you leaving with me, me, me, me?
Are you?
Baby tell me what it's gonna take, (gonna take)
To keep you with me right now, right now
In the room that I wanna brake, I wanna brake
Too keep you around, around
Cuz when they turn down the lights
I wanna feel you feeling on me
Know it's wrong but so right
Girl how I want you so bad
By the end of the night
I know that you'll be leaving with me
Know it's wrong but so right
That I try to take you home tonight
Soon as I walk in the club
You've been showing me love
You've be showing love
Baby what it's gonna be
Tell me that you gonna leave with me
Are you leaving with me, me, me, me?
Leaving with me, me, me, me?
Are you leaving with me, me, me, me?
Are you?
Got, got 3 bottles with wine rose
So tell me what you tryna do
Yeah
Bring a friend that wanna taste
Cuz baby she can get it too
Cuz when they turn down the lights
I wanna feel you feeling on me
Know it's wrong but so r

58 - FAIL - Title: Very Cute Melody by Marimba Tone (39813) - Artist: mitsu sound
{'meta': {'status': 200}, 'response': {'hits': []}}
1342 - FOUND - Title: Memories - Dillon Francis Remix - Artist: Maroon 5
73 - FAIL - Title: Laxed ï¿½ï¿½ï¿½ Sire - Artist: Jawsh 685
{'meta': {'status': 200}, 'response': {'hits': []}}
1 - FOUND - Title: Trapicana - Artist: Nguvo
85 - FAIL - Title: Titï¿½ï¿½ Me Pregu - Artist: Bad Bunny
{'meta': {'status': 200}, 'response': {'hits': []}}
114 - FAIL - Title: Chegou a Hora de Ir para Cama (Playback) - Artist: 3 Palavrinhas
{'meta': {'status': 200}, 'response': {'hits': []}}
132 - FAIL - Title: PHONK BRASILEIRO FRESCO (Slowed + Reverb) - Artist: DJ MOIGUS
{'meta': {'status': 200}, 'response': {'hits': []}}
133 - FAIL - Title: Sooseki (From "Pushpa 2 The Rule") [TELUGU] - Artist: Shreya Ghoshal
{'meta': {'status': 200}, 'response': {'hits': []}}
135 - FAIL - Title: Casca de Bala - Artist: Thullio Milionï¿½ï¿
{'meta': {'status': 200}, 'response': {'hits': []}

In [11]:
# combining the dfs again
# something is still messed up with the slicing or something idk but there are to many entrys now 
# somehow doubled the amount of line idk


df_base_combine = pd.read_csv(df_name[0])

for df_path in df_name[1:]:
    df_base_combine = pd.concat([df_base_combine, pd.read_csv(df_path)], ignore_index=True)


df_base_combine.to_csv("big_song_data_test.csv", index=False)

In [15]:
check_df = pd.read_csv("big_song_data_test.csv")

(173428, 4)