# goal strukture 

## phase one: 
>track_name, track_artist, id, lyrics (empty)

## phase two: 
>track_name, track_artist, lyrics, id

## phase three: 
>track_name, track_artist, lyrics, embedding, id


# Phase one preprocessing

In [2]:
import pandas as pd 
import numpy as np

# importing the raw data sets 

df_MSSS = pd.read_csv("../raw_data/Most_Streamed_Spotify_Songs_2024.csv", encoding='latin1')
df_SMSD = pd.read_csv("../raw_data/spotify_millsongdata.csv")
df_SS = pd.read_csv("../raw_data/spotify_songs.csv")

# cleaning them to only the two rows we care about 

df_MSSS_reduced = df_MSSS[["Artist", "Track"]].rename(columns={"Artist": "track_artist", "Track": "track_name"})
df_MSSS_reduced["track_lyrics"] = None

df_SMSD_reduced = df_SMSD[["artist", "song", "text"]].rename(columns={"artist": "track_artist", "song": "track_name", "text": "track_lyrics"}) # also has the text so here we are goingt to put the text from the stuff here already
df_SS_reduced = df_SS[["track_name", "track_artist"]]
df_SS_reduced["track_lyrics"] = None


# combine dfs 

df_base = df_MSSS_reduced
df_base = pd.concat([df_base, df_SMSD_reduced], ignore_index=True)
df_base = pd.concat([df_base, df_SS_reduced], ignore_index=True)

# del duplicates 
df_clean = df_base.drop_duplicates(subset=['track_artist', 'track_name'])

# adding ids

import uuid

def generate_uuid():
    return str(uuid.uuid4())

df_clean['track_id'] = [generate_uuid() for _ in range(len(df_clean))]

df = df_clean


# about 86000 rows 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_SS_reduced["track_lyrics"] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['track_id'] = [generate_uuid() for _ in range(len(df_clean))]


# Phase two preprocessing

In [2]:
from dotenv import load_dotenv
import requests
import os
from bs4 import BeautifulSoup
import pandas as pd

load_dotenv()

df = pd.read_csv("big_song_data.csv")

try: 
    df = df.drop('Unnamed: 0', axis=1)
except:
    pass


API_TOKEN = os.getenv('client_access_token')
BASE_URL = 'https://api.genius.com'


def search_song(artist: str, title: str):
    search_url = f"{BASE_URL}/search"
    headers = {'Authorization': f'Bearer {API_TOKEN}'}
    params = {'q': f"{title} {artist}"}
    response = requests.get(search_url, headers=headers, params=params)
    return response.json()

# Function to get song lyrics
def get_lyrics(song_api_path: str):
    song_url = f"{BASE_URL}{song_api_path}"
    headers = {'Authorization': f'Bearer {API_TOKEN}'}
    response = requests.get(song_url, headers=headers)
    json_response = response.json()
    path = json_response['response']['song']['path']
    
    # Fetch the song lyrics from the Genius website (not directly available via API)
    page_url = f"https://genius.com{path}"
    page = requests.get(page_url)
    soup = BeautifulSoup(page.text, 'html.parser')

    # Find the lyrics container with the data-lyrics-container="true" attribute
    lyrics_div = soup.find('div', {'data-lyrics-container': 'true'})
    lyrics = lyrics_div.get_text(separator='\n') if lyrics_div else "Lyrics not found."
    return lyrics

def get_lyrics_for_song(artist: str, title: str, iteration: int) -> str:
    search_result = search_song(artist, title)
    if search_result['response']['hits']:
        song_api_path = search_result['response']['hits'][0]['result']['api_path']
        print(f"{iteration} - FOUND - Title: {title} - Artist: {artist}")
        return get_lyrics(song_api_path)            
    print(f"{iteration} - FAIL - Title: {title} - Artist: {artist}")
    return None

# iterating through the df in batches - something like 500 songs 
# throwing it at threads because its a task that can be perfectly threaded
# saving the batches to save progress 

for idx, row in df.iterrows():
    if not pd.isna(row['track_lyrics']):
        continue

    artist = row["track_artist"]
    track_name = row["track_name"]

    lyrics = get_lyrics_for_song(artist, track_name, idx)
    df.at[idx, 'track_lyrics'] = lyrics
    df.to_csv("big_song_data.csv", index=False)



8 - FAIL - Title: Danza Kuduro - Cover - Artist: MUSIC LAB JPN
24 - FAIL - Title: Beat Automotivo Tan Tan Tan Viral - Artist: WZ Beat
29 - FAIL - Title: Danza Kuduro - Cover - Artist: LOVE BGM JPN
40 - FAIL - Title: Cupid ï¿½ï¿½ï¿½ Twin Ver. (FIFTY FIFTY) ï¿½ï¿½ï¿½ Spe - Artist: sped up 8282
58 - FAIL - Title: Very Cute Melody by Marimba Tone (39813) - Artist: mitsu sound
73 - FAIL - Title: Laxed ï¿½ï¿½ï¿½ Sire - Artist: Jawsh 685
85 - FAIL - Title: Titï¿½ï¿½ Me Pregu - Artist: Bad Bunny
114 - FAIL - Title: Chegou a Hora de Ir para Cama (Playback) - Artist: 3 Palavrinhas
132 - FAIL - Title: PHONK BRASILEIRO FRESCO (Slowed + Reverb) - Artist: DJ MOIGUS
133 - FAIL - Title: Sooseki (From "Pushpa 2 The Rule") [TELUGU] - Artist: Shreya Ghoshal
135 - FAIL - Title: Casca de Bala - Artist: Thullio Milionï¿½ï¿
138 - FAIL - Title: Jedag Jedug Capcut 2024 - Artist: Afrian Af
143 - FAIL - Title: ýýýýýýýýýýýýýýýýýýýýý - Artist: FLI:P
151 - FAIL - Title: Again - Artist: Kurochuu
157 - FAIL - Title: 

KeyboardInterrupt: 

# Threading

In [4]:
# now the multithread approach
import threading

df = pd.read_csv("big_song_data.csv")

# going here and slicing the original df into equal parts
part_count = 10
len_df = df.shape[0]
slice_size = len_df // len_df

df_name = []


def scrape_lyrics_df(df_path: str):
    part_df = pd.read_csv(df_path)

    for idx, row in part_df.iterrows():
        if not pd.isna(row['track_lyrics']):
            continue

        artist = row["track_artist"]
        track_name = row["track_name"]

        lyrics = get_lyrics_for_song(artist, track_name, idx)
        part_df.at[idx, 'track_lyrics'] = lyrics
        part_df.to_csv(df_path, index=False)

for i in range(part_count):
    if i != part_count-1:
        df_part = df.iloc[i*slice_size:(i*slice_size-1)]
    else: 
        df_part = df.iloc[i*slice_size:]
    part_name = f"tmp/big_song_data_part_{i}.csv"
    df_part.to_csv(part_name, index=False)
    df_name.append(part_name)

for df_path in df_name:
    threading.Thread(target=scrape_lyrics_df, args=(df_path))

    



# give everyt thread one part of the fun 
# do the lyrics search 

# recombine the dfs 
# and save

df.to_csv("big_song_data.csv", index=False)
