# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity




# Download NLTK Resources

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Load Dataset

In [4]:
# Load dataset
df = pd.read_csv("spotify_lyrics.csv")

# Keep required columns
df = df[['song', 'artist', 'text']]

# Remove missing lyrics
df.dropna(subset=['text'], inplace=True)

# Reset index
df.reset_index(drop=True, inplace=True)

print("Total songs:", len(df))
df.head()


Total songs: 57650


Unnamed: 0,song,artist,text
0,Ahe's My Kind Of Girl,ABBA,"Look at her face, it's a wonderful face \nAnd..."
1,"Andante, Andante",ABBA,"Take it easy with me, please \nTouch me gentl..."
2,As Good As New,ABBA,I'll never know why I had to go \nWhy I had t...
3,Bang,ABBA,Making somebody happy is a question of give an...
4,Bang-A-Boomerang,ABBA,Making somebody happy is a question of give an...


# Text Preprocessing Function

In [5]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]
    return " ".join(tokens)


# Apply Text Preprocessing

In [7]:
df['clean_lyrics'] = df['text'].apply(preprocess_text)
df[['text', 'clean_lyrics']].head()


Unnamed: 0,text,clean_lyrics
0,"Look at her face, it's a wonderful face \nAnd...",look face wonderful face mean something specia...
1,"Take it easy with me, please \nTouch me gentl...",take easy please touch gently like summer even...
2,I'll never know why I had to go \nWhy I had t...,ill never know go put lousy rotten show boy to...
3,Making somebody happy is a question of give an...,making somebody happy question give take learn...
4,Making somebody happy is a question of give an...,making somebody happy question give take learn...


# TF-IDF Vectorization

In [8]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

tfidf_matrix = vectorizer.fit_transform(df['clean_lyrics'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (57650, 5000)


# Lyric Search Prediction Function

In [9]:
def predict_song(lyric_snippet, top_n=3):
    cleaned_input = preprocess_text(lyric_snippet)
    input_vector = vectorizer.transform([cleaned_input])

    similarity_scores = cosine_similarity(input_vector, tfidf_matrix)
    top_indices = similarity_scores[0].argsort()[::-1][:top_n]

    results = []
    for idx in top_indices:
        results.append({
            "Song Title": df.iloc[idx]['song'],
            "Artist": df.iloc[idx]['artist'],
            "Similarity Score": round(similarity_scores[0][idx], 3)
        })

    return results


# Sample Predictions

In [10]:
test_lyrics = [
    "I'm walking down a lonely road",
    "we were both young when I first saw you",
    "hello from the other side"
]

for lyric in test_lyrics:
    print(f"\nInput Lyrics: \"{lyric}\"")
    predictions = predict_song(lyric)

    for i, result in enumerate(predictions, 1):
        print(f"{i}. {result['Song Title']} - {result['Artist']} (Score: {result['Similarity Score']})")



Input Lyrics: "I'm walking down a lonely road"
1. Fire - Ingrid Michaelson (Score: 0.706)
2. Walking On Air - Katy Perry (Score: 0.664)
3. (Feels Like I'm) Walking On Water - Wet Wet Wet (Score: 0.616)

Input Lyrics: "we were both young when I first saw you"
1. I Saw You First - John Mellencamp (Score: 0.559)
2. When I Was Young - Tina Turner (Score: 0.545)
3. I Was Young Once Too - Alabama (Score: 0.465)

Input Lyrics: "hello from the other side"
1. Hello God - Dolly Parton (Score: 0.681)
2. Ratamahatta - Korn (Score: 0.673)
3. Hello Again - Regine Velasquez (Score: 0.659)
