<a href="https://colab.research.google.com/github/jerrytsai961117/7021/blob/main/Untitled13.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U google-generativeai pandas numpy scikit-learn nltk opendatasets

In [None]:
import google.generativeai as genai
import pandas as pd
import numpy as np
import re
import os
import opendatasets as od
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import time # For timing operations

In [None]:
# Configure the Gemini client. Colab Secrets is tried first; when that fails
# (not running in Colab, secret missing, notebook access denied) the key is read
# from the GOOGLE_API_KEY environment variable so it never has to be pasted
# into the notebook itself.
try:
    from google.colab import userdata
    genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))
    print("API key loaded from Colab Secrets.")
except Exception as colab_error:
    env_api_key = os.environ.get('GOOGLE_API_KEY')
    if env_api_key:
        genai.configure(api_key=env_api_key)
        print("API key loaded from the GOOGLE_API_KEY environment variable.")
    else:
        # Report the underlying reason instead of hiding it, so a missing
        # secret can be told apart from "not running inside Colab".
        print(f"Could not load the API key from Colab Secrets ({type(colab_error).__name__}: {colab_error}).")
        print("Please set GOOGLE_API_KEY in Colab Secrets or as an environment variable before running the embedding cells.")

# Name of the embedding model passed to every embed_content call in this notebook.
embedding_model_name = "models/text-embedding-004"
print(f"Embedding model '{embedding_model_name}' initialized.")

In [None]:
# Load the lyrics dataset into `df`, trying three sources in order:
#   1. a download from Kaggle (you'll be prompted for Kaggle API credentials),
#   2. a local 'songs_with_lyrics.csv' in the working directory,
#   3. a synthetic sample DataFrame so the rest of the notebook can still run.
try:
    od.download("https://www.kaggle.com/datasets/tharunprabu/songs-data-with-full-lyrics")
    df = pd.read_csv("songs-data-with-full-lyrics/songs_with_lyrics.csv")
    print("Dataset downloaded and loaded successfully from Kaggle!")
except Exception as e:
    print(f"Could not download dataset from Kaggle. Error: {e}")
    print("Attempting to load from a local file path (assuming you've uploaded 'songs_with_lyrics.csv'):")
    try:
        df = pd.read_csv("songs_with_lyrics.csv")
        print("Loaded from local file successfully!")
    except FileNotFoundError:
        print("Could not find 'songs_with_lyrics.csv'. Creating a sample DataFrame for testing.")
        sample_size = 150  # total number of rows in the synthetic dataset

        # Hand-written lyrics with distinct moods/genres so that similarity
        # search has something meaningful to separate.
        handwritten_lyrics = [
            "This is a happy song about sunshine and flowers. Feeling good today, everything is bright.",
            "A sad melody, about lost love and rainy days. Tears fall like rain.",
            "Rock and roll all night, party every day. Electric guitar and loud drums.",
            "Smooth jazz rhythms, chill vibes, late night city lights. Relax and unwind.",
            "Hip hop beats, rhymes, and urban tales. Breaking down barriers, raising our voices.",
            "Country roads, take me home, to the place I belong. Simple life, open fields.",
            "Upbeat pop, dancing, feeling free. Summer nights and endless fun.",
            "Another happy tune, full of joy and laughter. Life is a wonderful journey.",
            "Deep thoughts, philosophical lyrics, questioning existence. The universe within.",
            "Romantic ballad, hearts entwined, forever together. Love is eternal.",
            "A powerful anthem of rebellion and freedom. Break the chains, fight for justice, never surrender.",
            "A calm, soothing lullaby for quiet nights. Sleep softly, dream sweet dreams, wake to a new dawn.",
            "Energetic dance track with pulsating bass and shimmering synths. Feel the rhythm, let go, just move.",
            "Thought-provoking folk song with intricate storytelling and acoustic guitar. Life's journey, lessons learned.",
            "Heavy metal roar, screaming vocals, and shredding guitars. Unleash the beast, rage against the machine.",
            "Soulful R&B groove, smooth vocals, and undeniable charm. Love's embrace, tender moments, deep connection.",
            "Experimental electronic music, glitchy sounds, and ambient textures. Explore new soundscapes, abstract art.",
            "A classical symphony, grand and timeless, echoing through the ages. Majestic, powerful, truly inspiring.",
            "Upbeat reggae beats, positive vibes, and messages of peace. One love, unity, good vibrations for all.",
            "Bluesy lament, raw emotion, and wailing harmonica. Heartbreak, struggle, finding solace in the sound.",
        ]

        # Generic filler for the remaining rows. The start index is derived from
        # the number of hand-written lyrics so all three columns always have
        # exactly `sample_size` entries; pandas raises ValueError when the
        # column lists differ in length.
        filler_lyrics = [
            f"This is a generic song about various topics {i}. It has some random words and phrases to fill up the content for testing purposes. More text to ensure sufficient length for the dataset, reaching over 100 entries. Blah blah blah. Keywords: generic, random, text, test, long, fill, content."
            for i in range(len(handwritten_lyrics) + 1, sample_size + 1)
        ]

        data = {
            'Song Name': [f'Song {i}' for i in range(1, sample_size + 1)],
            'Artist': [f'Artist {i}' for i in range(1, sample_size + 1)],
            'Lyrics': handwritten_lyrics + filler_lyrics,
        }
        df = pd.DataFrame(data)
        print("Created a sample DataFrame for testing.")

# Ensure the 'Lyrics' column exists and holds at least one non-missing value
# before proceeding. An exception is raised instead of calling exit(): inside
# Jupyter/Colab exit() shuts the kernel down, whereas an exception only stops
# the current cell (and "Run All") while keeping the loaded data inspectable.
if 'Lyrics' not in df.columns or df['Lyrics'].dropna().empty:
    raise ValueError("Error: 'Lyrics' column is missing or empty. Cannot proceed with embedding generation.")

# Download NLTK resources if not already present.
# nltk.data.find signals a missing resource with LookupError, so that is the
# exception to catch here. 'punkt_tab' is included because recent NLTK releases
# load it (rather than 'punkt') inside word_tokenize.
for nltk_resource_path, nltk_package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
    ('corpora/wordnet', 'wordnet'),
):
    try:
        nltk.data.find(nltk_resource_path)
    except LookupError:
        nltk.download(nltk_package_name)

# Define text cleaning function.
# The lemmatizer and stop-word set are built once at module level and reused
# by every preprocess_text call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize raw lyrics (or a user query) into a space-joined token string.

    Steps: lowercase, drop bracketed annotations such as "[Chorus]", strip every
    character that is not a-z or whitespace, tokenize, remove English stop
    words, and lemmatize the remaining tokens.

    Args:
        text: The raw text. Non-string values are converted with str();
            missing values (None, NaN, pd.NA) are treated as empty text.

    Returns:
        str: The cleaned tokens joined by single spaces, or '' when the input
        is missing or nothing survives cleaning.
    """
    # Missing lyrics come out of pandas as NaN; str(NaN) would otherwise turn
    # into the token "nan" and slip past the empty-lyrics filter downstream.
    if text is None or (
        not isinstance(text, str)
        and pd.api.types.is_scalar(text)
        and pd.isna(text)
    ):
        return ''

    text = str(text).lower() # Convert to string and lowercase
    text = re.sub(r'\[.*?\]', '', text) # Remove text in square brackets (e.g., [Chorus])
    text = re.sub(r'[^a-z\s]', '', text) # Keep only letters and spaces
    tokens = word_tokenize(text) # Tokenize
    # Lemmatize, dropping stop words and any whitespace-only tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word.strip()]
    return ' '.join(tokens)

# Apply preprocessing to the 'Lyrics' column; the result is stored alongside
# the raw text in a new 'Cleaned_Lyrics' column.
df['Cleaned_Lyrics'] = df['Lyrics'].apply(preprocess_text)

# Keep only rows whose cleaned lyrics are non-empty, and renumber them from 0
# so positions in df_processed line up with the embedding list built later.
has_cleaned_text = df['Cleaned_Lyrics'].str.strip() != ''
df_processed = df[has_cleaned_text].reset_index(drop=True)
print(f"Data preprocessing complete. Number of valid entries: {len(df_processed)}")

if len(df_processed) < 50:
    print("Warning: Less than 50 valid entries after preprocessing. Recommendation quality might be limited.")

# Ensure there are enough songs for recommendation. An exception is raised
# instead of calling exit(), which would shut down the Jupyter kernel rather
# than just stopping this cell.
if len(df_processed) < 2:
    raise ValueError("Error: Not enough songs in the dataset for recommendations. Please ensure at least two songs with valid lyrics.")

In [None]:
print("\n--- Generating embeddings for songs in the dataset (this may take a moment)... ---")
start_time = time.time()

# To manage API calls and runtime, we'll embed up to the first 500 songs.
# In a full-scale application, you'd embed your entire dataset.
texts_for_embedding = df_processed['Cleaned_Lyrics'].tolist()
num_to_embed = min(len(texts_for_embedding), 500)
texts_to_embed_subset = texts_for_embedding[:num_to_embed]

all_song_embeddings = []
try:
    # google.generativeai's embed_content takes the text(s) through the
    # `content` parameter and returns a dict; for a list of texts the
    # 'embedding' entry is a list with one vector per input, in input order.
    response = genai.embed_content(
        model=embedding_model_name,
        content=texts_to_embed_subset,
        task_type="SEMANTIC_SIMILARITY" # Specify task type for better embeddings
    )
    all_song_embeddings = [np.array(vector) for vector in response['embedding']]
    print(f"Successfully generated {len(all_song_embeddings)} song embeddings.")

except Exception as e:
    print(f"Error generating song embeddings: {e}")
    all_song_embeddings = []

end_time = time.time()
print(f"Embedding generation took: {end_time - start_time:.2f} seconds")

# Stop here when nothing was embedded. Raising keeps the kernel (and the
# variables built so far) alive, unlike exit(), which shuts the kernel down.
if not all_song_embeddings:
    raise RuntimeError("Error: Failed to generate song embeddings. Please check API key and network connection. Cannot proceed with recommendations.")

# Attach embeddings to the DataFrame for easy access. Only the first
# len(all_song_embeddings) rows are kept so rows and vectors pair up by position.
df_processed_with_embeddings = df_processed.head(len(all_song_embeddings)).copy()
df_processed_with_embeddings['embedding'] = list(all_song_embeddings)

# Convert all song embeddings to a single NumPy array for efficient cosine similarity calculation
song_embeddings_matrix = np.array(df_processed_with_embeddings['embedding'].tolist())

print(f"All song embeddings stored as a NumPy array with shape: {song_embeddings_matrix.shape}")

In [None]:
def get_recommendations_from_text(user_input_text, dataframe_with_embeddings, embeddings_matrix, top_n=5):
    """
    Recommends the most similar songs based on user's text input.

    The description is cleaned with preprocess_text, embedded with the same
    model and task type used for the songs, and compared against every song
    embedding by cosine similarity.

    Args:
        user_input_text (str): The user's text description of a song/style.
        dataframe_with_embeddings (pd.DataFrame): DataFrame containing 'Song Name', 'Artist', and 'embedding'.
            Its rows must be in the same order as the rows of embeddings_matrix.
        embeddings_matrix (np.array): A 2D NumPy array of all song embeddings (one row per song).
        top_n (int): The number of top similar songs to recommend.

    Returns:
        pd.DataFrame: A DataFrame of the top_n recommended songs, including their 'Song Name', 'Artist', and 'Similarity_Score',
                      ordered from most to least similar. Returns an empty DataFrame if the input is
                      missing/blank, is empty after preprocessing, or the embedding request fails.
    """
    # Reject non-string input as well as blank strings so .strip() cannot fail.
    if not isinstance(user_input_text, str) or not user_input_text.strip():
        print("User input cannot be empty.")
        return pd.DataFrame()

    print(f"\nProcessing your input: '{user_input_text}'...")

    # 1. Preprocess user input the same way the lyrics were preprocessed
    cleaned_user_input = preprocess_text(user_input_text)
    if not cleaned_user_input:
        print("Preprocessed user input is empty. Cannot generate embedding.")
        return pd.DataFrame()

    # 2. Generate embedding for user input
    try:
        # google.generativeai's embed_content takes the text through `content`
        # and returns a dict; for a single string the 'embedding' entry is one
        # flat vector.
        user_embedding_response = genai.embed_content(
            model=embedding_model_name,
            content=cleaned_user_input,
            task_type="SEMANTIC_SIMILARITY"
        )
        # Reshape to (1, n_features): cosine_similarity expects 2D inputs
        user_embedding = np.asarray(user_embedding_response['embedding'], dtype=float).reshape(1, -1)

    except Exception as e:
        print(f"Error generating embedding for user input: {e}")
        return pd.DataFrame()

    # 3. Calculate cosine similarity between user input and all song embeddings
    similarities = cosine_similarity(user_embedding, embeddings_matrix).flatten()

    # 4. Create a Series of similarity scores, indexed by song index
    similarity_scores = pd.Series(similarities, index=dataframe_with_embeddings.index)

    # 5. Get the top N most similar songs (nlargest returns them sorted, best first)
    top_similar_indices = similarity_scores.nlargest(top_n).index

    # 6. Retrieve details of recommended songs
    recommendations = dataframe_with_embeddings.loc[top_similar_indices, ['Song Name', 'Artist']].copy()
    recommendations['Similarity_Score'] = similarity_scores.loc[top_similar_indices]

    return recommendations.reset_index(drop=True)

print("`get_recommendations_from_text` function defined.")

In [None]:
def run_recommender_prototype():
    """Run the interactive recommendation loop until the user asks to stop.

    Each round reads a free-text description from input(), passes it to
    get_recommendations_from_text together with the notebook-level
    df_processed_with_embeddings and song_embeddings_matrix, and prints the
    top 5 matches. Typing 'exit' or 'quit' (any letter case, surrounding
    whitespace ignored) ends the loop, as does end-of-input or an interrupt
    while waiting at the prompt.

    Returns:
        None
    """
    print("\n--- Welcome to the AI Song Recommender! ---")
    print("Describe the type of song, theme, or mood you're looking for, and I'll recommend similar songs.")
    print("Type 'exit' or 'quit' to stop.")

    while True:
        try:
            user_input = input("\nPlease describe the song you'd like (e.g., 'a sad song about lost love', 'energetic party music for dancing'): \n> ")
        except (EOFError, KeyboardInterrupt):
            # No more input available or the user interrupted the prompt:
            # leave the loop cleanly instead of surfacing a traceback.
            print("\nThanks for using the recommender! Goodbye!")
            break

        # Normalize once so that inputs like "Exit " or " quit" are recognized
        # as commands instead of being sent off as search queries.
        normalized_input = user_input.strip()

        if normalized_input.lower() in ('exit', 'quit'):
            print("Thanks for using the recommender! Goodbye!")
            break

        if not normalized_input:
            print("Your input was empty. Please type a description.")
            continue

        # Get recommendations
        recommended_songs = get_recommendations_from_text(
            user_input_text=user_input,
            dataframe_with_embeddings=df_processed_with_embeddings,
            embeddings_matrix=song_embeddings_matrix,
            top_n=5 # Recommend the top 5 songs
        )

        if not recommended_songs.empty:
            print("\n--- Here are your song recommendations: ---")
            print(recommended_songs.to_string(index=False)) # Use to_string for better DataFrame printing
        else:
            print("\nSorry, I couldn't find relevant recommendations based on your description. Please try a different one.")

# --- Entry point: start the interactive loop only when the embedded song
# --- table was built by the earlier cells and contains at least one row.
if __name__ == "__main__":
    if 'df_processed_with_embeddings' not in locals() or df_processed_with_embeddings.empty:
        print("\nSystem initialization failed. Please check previous steps for data loading and embedding generation errors.")
    else:
        run_recommender_prototype()