<a href="https://colab.research.google.com/github/jerrytsai961117/7022/blob/main/Untitled10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U google-generativeai pandas numpy scikit-learn nltk opendatasets

In [None]:
import google.generativeai as genai
import pandas as pd
import numpy as np
import re
import os
import opendatasets as od
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity # Scikit-learn's optimized cosine similarity

# Download the NLTK resources used by the preprocessing step below:
#   - 'punkt' / 'punkt_tab': tokenizer data for word_tokenize. Recent NLTK
#     releases load 'punkt_tab'; 'punkt' is kept for older installations.
#   - 'stopwords': English stop-word list.
#   - 'wordnet': data for WordNetLemmatizer.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Configure the Gemini client.
# The key is looked up in Colab Secrets first; if that is unavailable (not
# running on Colab, secret missing, or access not granted) the
# GOOGLE_API_KEY environment variable is used instead. The key is never
# hard-coded in the notebook.
api_key = None
try:
    from google.colab import userdata
    api_key = userdata.get('GOOGLE_API_KEY')
    if api_key:
        print("API key loaded from Colab Secrets.")
except Exception as colab_error:
    print(f"Could not read GOOGLE_API_KEY from Colab Secrets ({type(colab_error).__name__}).")

if not api_key:
    api_key = os.environ.get('GOOGLE_API_KEY')
    if api_key:
        print("API key loaded from the GOOGLE_API_KEY environment variable.")

if api_key:
    genai.configure(api_key=api_key)
else:
    print("No API key found. Please set GOOGLE_API_KEY in Colab Secrets or as an environment variable before generating embeddings.")

# Name of the embedding model passed to the embedding call later on
embedding_model = "models/text-embedding-004"
print(f"Embedding model '{embedding_model}' initialized.")

In [None]:
# Load the songs dataset into `df`, trying in order:
#   1. a download from Kaggle,
#   2. a local 'songs_with_lyrics.csv',
#   3. a synthetic 150-row sample so the rest of the notebook can still run.
try:
    od.download("https://www.kaggle.com/datasets/tharunprabu/songs-data-with-full-lyrics")
    df = pd.read_csv("songs-data-with-full-lyrics/songs_with_lyrics.csv")
    print("Dataset downloaded and loaded successfully from Kaggle!")
except Exception as e:
    print(f"Could not download dataset from Kaggle. Error: {e}")
    print("Attempting to load from a local file path (assuming you've uploaded 'songs_with_lyrics.csv'):")
    try:
        df = pd.read_csv("songs_with_lyrics.csv")
        print("Loaded from local file successfully!")
    except FileNotFoundError:
        print("Could not find 'songs_with_lyrics.csv'. Creating a sample DataFrame for testing.")
        # Number of rows in the synthetic sample (every column must have this length)
        sample_size = 150
        # Hand-written lyrics: short and long variants covering a few distinct themes
        handwritten_lyrics = [
            "This is a happy song about sunshine and flowers. Feeling good today, everything is bright.",
            "A sad melody, about lost love and rainy days. Tears fall like rain.",
            "Rock and roll all night, party every day. Electric guitar and loud drums.",
            "Smooth jazz rhythms, chill vibes, late night city lights. Relax and unwind.",
            "Hip hop beats, rhymes, and urban tales. Breaking down barriers, raising our voices.",
            "Country roads, take me home, to the place I belong. Simple life, open fields.",
            "Upbeat pop, dancing, feeling free. Summer nights and endless fun.",
            "Another happy tune, full of joy and laughter. Life is a wonderful journey.",
            "Deep thoughts, philosophical lyrics, questioning existence. The universe within.",
            "Romantic ballad, hearts entwined, forever together. Love is eternal.",
            "This is a happy song about sunshine and flowers. Feeling good today, everything is bright. More lyrics for a longer entry to test max_tokens. This song is truly uplifting and brings joy to my heart. The sun is shining, birds are singing, and life is beautiful. There's nothing to worry about, just pure bliss. Let's sing along and celebrate this moment. The rhythm is catchy and the melody is sweet. It's a perfect day for happiness.",
            "A sad melody, about lost love and rainy days. Tears fall like rain. This is a very melancholic song, reflecting on sorrow and despair. The rain mirrors the tears falling from my eyes. There's a deep sense of longing and a heavy heart. Every note echoes the pain, a symphony of sadness. I miss you more than words can say. This feeling of emptiness consumes me. It's a dark and lonely road ahead, without you by my side. The world seems to have lost its color, fading into gray.",
            "Rock and roll all night, party every day. Electric guitar and loud drums. Let's turn up the volume and let the music take control. The energy is electrifying, the crowd is roaring. We're here to rock, to shout, to let loose. No inhibitions, just pure raw power. The bass is thumping, the guitar riffs are insane. This is what living feels like, wild and free. Every beat pulses through my veins. We'll dance till dawn, till the sun comes up. This is our anthem, our rebellion. Yeah!",
            "Smooth jazz rhythms, chill vibes, late night city lights. Relax and unwind. The saxophone sings a soulful tune, the piano softly plays. A gentle breeze through the open window, the city sleeps. This music soothes my soul, washes away the day's worries. A glass of wine, a comfortable chair, and perfect harmony. The night is young, and the music flows effortlessly. It's a moment of peace, of pure tranquility. Let the smooth sounds envelop you, drift away.",
            "Hip hop beats, rhymes, and urban tales. Breaking down barriers, raising our voices. The streets are alive with the sound of the future. Every word is a statement, every beat a revolution. We speak our truth, we tell our stories. From the block to the top, we never stop. The rhythm takes hold, the flow is unstoppable. This is our culture, our passion. We're here to inspire, to uplift, to empower. Stand tall, stand proud, let your voice be heard.",
            "Country roads, take me home, to the place I belong. Simple life, open fields. The smell of fresh cut hay, the sound of crickets at night. This is where my heart is, in the quiet embrace of nature. No hustle, no bustle, just peaceful serenity. The stars shine brighter here, a blanket of diamonds. I miss the simple things, the warmth of home. Every memory is a comfort, a gentle reminder. This land holds my roots, my history. It's a place of healing, of solace, of belonging.",
            "Upbeat pop, dancing, feeling free. Summer nights and endless fun. The music makes me want to move, to jump, to shout. Every beat is a burst of energy, a surge of happiness. We're living in the moment, no regrets, just pure bliss. The lights are bright, the crowd is alive. This is the soundtrack to our perfect summer. Let's dance until we can't anymore, until the sun rises. The good times are here to stay, forever young. This feeling is electrifying, exhilarating. Yeah!",
            "Another happy tune, full of joy and laughter. Life is a wonderful journey. Every step is an adventure, every day a new beginning. I'm grateful for every moment, every smile. The world is full of beauty, if you just open your eyes. Let's spread kindness, spread love, spread happiness. The positive vibes are contagious, let them flow. This song is a celebration of life, of all its wonders. Embrace the journey, enjoy the ride. It's a beautiful world, let's make it even better.",
            "Deep thoughts, philosophical lyrics, questioning existence. The universe within. What is our purpose, our meaning? The stars gaze back, silent witnesses. We are but dust, yet we hold infinitude. The mysteries of life unfold with every breath. Search for truth, search for understanding. The path is long, the questions endless. This song invites contemplation, introspection. Look within, find your answers. The journey of self-discovery is profound.",
            "Romantic ballad, hearts entwined, forever together. Love is eternal. Your hand in mine, walking through life's garden. Every moment with you is precious, a treasure. Our love story, written in the stars. Two souls, one heart, forever bound. The melody whispers promises, the lyrics sing of devotion. This is more than a song, it's our anthem. Through thick and thin, we'll always be together. Our love is a flame, burning bright, never fading."
        ]
        # Pad with generic lyrics up to exactly `sample_size` rows. The start of
        # the range is derived from the hand-written list so all three columns
        # always have the same length (pd.DataFrame raises ValueError otherwise).
        generic_lyrics = [
            f"This is a generic song about various topics {i}. It has some random words and phrases to fill up the content for testing purposes. More text to ensure sufficient length for the dataset, reaching over 100 entries. Blah blah blah. Keywords: generic, random, text, test, long, fill, content."
            for i in range(len(handwritten_lyrics) + 1, sample_size + 1)
        ]
        data = {
            'Song Name': [f'Song {i}' for i in range(1, sample_size + 1)],
            'Artist': [f'Artist {i}' for i in range(1, sample_size + 1)],
            'Lyrics': handwritten_lyrics + generic_lyrics
        }
        df = pd.DataFrame(data)
        print("Created a sample DataFrame for testing.")

# Stop here if there are no lyrics to work with: the column is missing, has no
# rows, or contains only missing values. An exception is raised (instead of
# calling exit(), which shuts down the notebook kernel) so that execution of
# this cell and of "Run All" halts with a clear message.
if 'Lyrics' not in df.columns or df['Lyrics'].dropna().empty:
    raise ValueError("Error: 'Lyrics' column is missing or empty. Cannot proceed with embedding generation.")

# Shared resources for text cleaning (built once, reused for every lyric)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one lyric string for embedding.

    Steps: lowercase, remove bracketed sections such as "[Chorus]", remove
    every character that is not a-z or whitespace, tokenize, drop English
    stop words and lemmatize the remaining tokens.

    Args:
        text: The raw lyric. Non-string values are converted with ``str``;
            missing values (None / NaN / pd.NA) are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    # A missing lyric must not be stringified: str(float('nan')) is "nan",
    # which would pass through the cleaning steps as a real word.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)   # bracketed annotations
    text = re.sub(r'[^a-z\s]', '', text)  # digits, punctuation, non-ASCII letters
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean every lyric and store the result next to the raw text
df['Cleaned_Lyrics'] = df['Lyrics'].apply(preprocess_text)

# Keep only the rows whose cleaned lyrics still contain some text,
# and renumber the remaining rows from 0
has_cleaned_text = df['Cleaned_Lyrics'].str.strip() != ''
df_processed = df.loc[has_cleaned_text].reset_index(drop=True)
valid_entry_count = len(df_processed)
print(f"\nNumber of valid entries after preprocessing: {valid_entry_count}")

# Flag very small corpora
if valid_entry_count < 50:
    print("Warning: Less than 50 valid entries after preprocessing. Embeddings might not be representative.")

In [None]:
# Choose which cleaned lyrics are sent to the embedding API.
# Only the first 200 entries (or all of them, if there are fewer) are used,
# to keep the number of API calls and the runtime manageable.
texts_to_embed = df_processed['Cleaned_Lyrics'].to_list()
available_entries = len(texts_to_embed)
num_entries_to_embed = available_entries if available_entries < 200 else 200
texts_to_embed_subset = texts_to_embed[0:num_entries_to_embed]

print(f"Generating embeddings for {len(texts_to_embed_subset)} text entries.")

all_embeddings = []
# Request one embedding per selected lyric.
# With the `google.generativeai` SDK imported by this notebook, the input is
# passed as `content=` (a list of strings here) and the result is a dict whose
# 'embedding' entry holds one vector per input, in input order.
# NOTE(review): for very large datasets consider chunking requests explicitly
# to stay within API limits.
try:
    response = genai.embed_content(
        model=embedding_model,
        content=texts_to_embed_subset,
        task_type="SEMANTIC_SIMILARITY"
    )
    all_embeddings = [np.array(vector) for vector in response['embedding']]
    print(f"Successfully generated {len(all_embeddings)} embeddings.")

except Exception as e:
    # Leave the list empty; the next step checks for this and stops.
    print(f"Error generating embeddings: {e}")
    all_embeddings = []

# Stack the per-song vectors into one 2D array and attach them to the DataFrame.
if all_embeddings:
    embeddings_array = np.array(all_embeddings)
    print(f"\nEmbeddings stored as a NumPy array with shape: {embeddings_array.shape}")

    # Keep only as many rows as there are embeddings so the two stay aligned
    # (embeddings were generated for the leading rows of df_processed).
    df_processed = df_processed.head(len(all_embeddings)).copy()
    df_processed['embedding'] = list(all_embeddings)
    print("\nEmbeddings attached to the DataFrame.")
else:
    # Raise instead of calling exit(): exit() shuts down the notebook kernel,
    # whereas an exception halts execution and keeps the session inspectable.
    raise RuntimeError("No embeddings were generated. Cannot proceed with similarity search.")

In [None]:
def find_similar_songs(target_song_index, embeddings_df, top_n=5):
    """
    Finds the top N most similar songs based on cosine similarity of their embeddings.

    The target song is addressed by its row position (0-based), so the function
    behaves the same whatever index labels `embeddings_df` carries.

    Args:
        target_song_index (int): Row position of the song in the DataFrame for which
            to find recommendations (0 <= target_song_index < len(embeddings_df)).
        embeddings_df (pd.DataFrame): DataFrame containing 'Song Name', 'Artist', and
            'embedding' columns; every 'embedding' entry is a 1-D vector of the same length.
        top_n (int): The number of top similar songs to recommend.

    Returns:
        pd.DataFrame: A DataFrame of the top_n most similar songs (excluding the target
                      song itself), most similar first, with columns 'Song Name',
                      'Artist', and 'Similarity_Score'. An empty DataFrame is returned
                      (and a message printed) when target_song_index is out of bounds.
    """
    if target_song_index >= len(embeddings_df) or target_song_index < 0:
        print(f"Error: Target song index {target_song_index} is out of bounds.")
        return pd.DataFrame()

    # One row per song: shape (n_songs, embedding_dim)
    all_embeddings_matrix = np.array(embeddings_df['embedding'].tolist())

    # cosine_similarity expects 2D inputs, so the target becomes (1, embedding_dim)
    target_embedding = all_embeddings_matrix[target_song_index].reshape(1, -1)
    similarities = cosine_similarity(target_embedding, all_embeddings_matrix).flatten()

    # Scores keyed by row position (default RangeIndex), not by the frame's labels
    similarity_scores = pd.Series(similarities)

    # Exclude the target song itself from recommendations
    similarity_scores = similarity_scores.drop(index=target_song_index)

    # Positions of the top N scores, highest first
    top_positions = similarity_scores.nlargest(top_n).index.to_numpy()

    # Retrieve the details of the recommended songs
    recommendations = embeddings_df.iloc[top_positions][['Song Name', 'Artist']].copy()
    # Assign raw values: the two objects are keyed differently (labels vs. positions)
    recommendations['Similarity_Score'] = similarity_scores.loc[top_positions].to_numpy()

    return recommendations.reset_index(drop=True)

print("`find_similar_songs` function defined.")

In [None]:
# Demo: recommend songs similar to the 5th song (index 4) of df_processed.
# The lookup is skipped when the processed dataset has fewer than 5 songs.
if len(df_processed) <= 4:
    print("\nDataset too small to pick a test song at index 4. Please ensure at least 5 songs are loaded.")
else:
    test_song_index = 4
    test_song_name = df_processed.at[test_song_index, 'Song Name']
    test_artist = df_processed.at[test_song_index, 'Artist']

    print(f"\nFinding top 5 similar songs to: '{test_song_name}' by {test_artist}\n")

    # Ask for the five closest songs by embedding similarity
    top_recommendations = find_similar_songs(test_song_index, df_processed, top_n=5)

    if top_recommendations.empty:
        print("Could not retrieve recommendations. Check logs above for errors.")
    else:
        print("Top 5 Recommended Songs:")
        # to_string(index=False) prints the whole table without the row index
        print(top_recommendations.to_string(index=False))

In [None]:
# Second demo: the 51st song (index 50). Runs only when df_processed has
# more than 50 rows; otherwise this cell does nothing.
if len(df_processed) > 50:
    test_song_index_2 = 50
    test_song_name_2 = df_processed.at[test_song_index_2, 'Song Name']
    test_artist_2 = df_processed.at[test_song_index_2, 'Artist']

    print(f"\n--- Finding top 5 similar songs to: '{test_song_name_2}' by {test_artist_2} ---\n")

    top_recommendations_2 = find_similar_songs(test_song_index_2, df_processed, top_n=5)

    if top_recommendations_2.empty:
        print("Could not retrieve recommendations for the second test song.")
    else:
        print("Top 5 Recommended Songs:")
        print(top_recommendations_2.to_string(index=False))