## Prepair Data

In [3]:
# dataset: https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset
import pandas as pd
df = pd.read_csv("data.csv")
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)
df

Unnamed: 0,artist,song,text
0,Nick Cave,Let It Be,When I find myself in times of trouble \r\nMo...
1,Judy Garland,A Cottage For Sale,"Our little dream castle with every dream gone,..."
2,Crowded House,You Are The One To Make Me Cry,Who'll face the tide and take their chances \...
3,Cyndi Lauper,All Through The Night,All through the night \r\nI'll be awake and I...
4,Kanye West,Bad News,Didn't you know \r\nI was waiting on you \r\...
...,...,...,...
4995,Alabama,Getting Over You,Wish I had just one more chance to hold you cl...
4996,Dan Fogelberg,Let Her Go,A bad situation right from the start \r\nShe ...
4997,Green Day,Letterbomb,Nobody likes you \r\nEveryone left you \r\nT...
4998,Black Sabbath,Wicked World,The world today is such a wicked thing \r\nFi...


## Data Preprocessing

In [5]:
# Clean and preprocess the text
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [7]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Tokenize the descriptions into individual words or phrases
def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [34]:
# Apply Tokenization func
df['text'] = df['text'].apply(lambda x: tokenization(x))
df['text']

0       when i find myself in time of troubl mother ma...
1       our littl dream castl with everi dream gone , ...
2       who 'll face the tide and take their chanc wil...
3       all through the night i 'll be awak and i 'll ...
4       did n't you know i wa wait on you wait on a dr...
                              ...                        
4995    wish i had just one more chanc to hold you clo...
4996    a bad situat right from the start she play me ...
4997    nobodi like you everyon left you they 're all ...
4998    the world today is such a wick thing fight go ...
4999    sometim i ca n't help the feel that , i 'm liv...
Name: text, Length: 5000, dtype: object

## Feature Extraction

In [35]:
# Convert the tokenized descriptions into numerical representations that can be used by machine learning models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

similarity[0]

array([1.        , 0.00111331, 0.07727983, ..., 0.00774105, 0.01094826,
       0.04473043])

## Recommender Func

In [37]:
# Recommend similar songs func
def recommendation(song_name):
    idx = df[df['song'] == song_name].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

## Final Results

In [36]:
recommendation('A Cottage For Sale')

['Cottage For Sale',
 'Time Waits',
 'Window To The World',
 "I've Got A Crush On You",
 'Box Of Rain',
 'King Of The Road',
 'Baby Girl Window',
 "I've Got A Crush On You",
 'Window',
 'Storytime',
 'The Key',
 'No Small Miracle',
 'Long River',
 'Castles In The Sand',
 'Four Rusted Horses',
 'And Roses And Roses',
 'E-Mail My Heart',
 'Story Of Your Life Is In Your Face',
 'Face Of God',
 'In The Still Of The Night']