In [36]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
df = pd.read_csv('spotify_millsongdata.csv')

In [38]:
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [39]:
df.shape

(57650, 4)

In [40]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [41]:
df.isna().any()

artist    False
song      False
link      False
text      False
dtype: bool

In [42]:
df = df.sample(2000).drop('link',axis=1).reset_index(drop=True)

In [43]:
df.head()

Unnamed: 0,artist,song,text
0,Proclaimers,Beautiful Truth,Beautiful beautiful beautiful truth \r\nDon't...
1,Oasis,Waiting For The Rapture,I still don't know what I was waiting for \r\...
2,Van Halen,Summer Nights,Ain't no way I'm stayin' home tonight \r\nI'l...
3,Steve Miller Band,Wild Mountain Honey,"Ooh, mama \r\nWell look what's been done \r\..."
4,YG,My Hitta,I said that I'mma ride for my motherfuckin' hi...


In [44]:
df['text'][0]

"Beautiful beautiful beautiful truth  \r\nDon't leave because I can't see you  \r\nYou know the hours that I passed without you  \r\nEvery heartbeat that I chose to flout you  \r\nAnd how I felt no shame  \r\nI know this problem is of my own making  \r\nI know you're giving here and I'm not taking  \r\nBut please don't leave this place  \r\nLet me complicate what you made simple  \r\nBy looking fancy or sounding boastful  \r\nPlease bring me down  \r\nArid now that I have said what's on my mind  \r\nAnd its obvious that I am blind  \r\nPlease say you'll stay  \r\nSpin me round  \r\nBring me down  \r\nHe my sound  \r\nBut don't leave.\r\n\r\n"

In [45]:
df.shape

(2000, 3)

In [46]:
df['text'] = df['text'].str.lower().replace(r'\w\s', '').replace(r'\n','',regex=True)

In [47]:
df.tail()

Unnamed: 0,artist,song,text
1995,Cher,Fit To Fly,"oh, brother, man where are you? \ri am buckli..."
1996,Smiths,Unhappy Birthday,i've come to wish you an unhappy birthday \ri...
1997,Rod Stewart,Our Love Is Here To Stay,"it's very clear, our love is here to stay \rn..."
1998,Thin Lizzy,Johnny,somewhere on the waterfront \rjohnny's hiding...
1999,Train,I Got You,"hey, did you hear, about the one that got away..."


In [48]:
stemmer = PorterStemmer()


In [49]:
# -- Download punkt_tab for word tokenize (try punkt if this does not work)

# nltk.download('punkt_tab')

In [50]:
def token(text):
    tokens = nltk.word_tokenize(text)
    a = [stemmer.stem(w) for w in tokens] 
    return " ".join(a)

In [51]:
token(text="you are beautiful")

'you are beauti'

In [52]:
df['text'].apply(lambda x: token(x))

0       beauti beauti beauti truth do n't leav becaus ...
1       i still do n't know what i wa wait for a big l...
2       ai n't no way i 'm stayin ' home tonight i 'll...
3       ooh , mama well look what 's been done you can...
4       i said that i'mma ride for my motherfuckin ' h...
                              ...                        
1995    oh , brother , man where are you ? i am buckli...
1996    i 've come to wish you an unhappi birthday i '...
1997    it 's veri clear , our love is here to stay no...
1998    somewher on the waterfront johnni 's hide with...
1999    hey , did you hear , about the one that got aw...
Name: text, Length: 2000, dtype: object

In [54]:
tfid = TfidfVectorizer(analyzer='word',stop_words='english')

In [55]:
matrix = tfid.fit_transform(df['text'])

In [56]:
cosine_similarity(matrix)

array([[1.        , 0.07664735, 0.02667362, ..., 0.04885312, 0.01184477,
        0.04168566],
       [0.07664735, 1.        , 0.04000909, ..., 0.0162763 , 0.01028295,
        0.08740471],
       [0.02667362, 0.04000909, 1.        , ..., 0.08362571, 0.05759719,
        0.07710619],
       ...,
       [0.04885312, 0.0162763 , 0.08362571, ..., 1.        , 0.04661837,
        0.04453986],
       [0.01184477, 0.01028295, 0.05759719, ..., 0.04661837, 1.        ,
        0.0423993 ],
       [0.04168566, 0.08740471, 0.07710619, ..., 0.04453986, 0.0423993 ,
        1.        ]])

In [57]:
similer = cosine_similarity(matrix)

In [58]:
similer[0]

array([1.        , 0.07664735, 0.02667362, ..., 0.04885312, 0.01184477,
       0.04168566])

In [59]:
df.tail()

Unnamed: 0,artist,song,text
1995,Cher,Fit To Fly,"oh, brother, man where are you? \ri am buckli..."
1996,Smiths,Unhappy Birthday,i've come to wish you an unhappy birthday \ri...
1997,Rod Stewart,Our Love Is Here To Stay,"it's very clear, our love is here to stay \rn..."
1998,Thin Lizzy,Johnny,somewhere on the waterfront \rjohnny's hiding...
1999,Train,I Got You,"hey, did you hear, about the one that got away..."


In [66]:
def recommender(song_name):
    # Checks if the song exists in the DataFrame
    if song_name not in df['song'].values:
        return f"'{song_name}' not found in the dataset."

    # Get the index of the song
    idx = df[df['song'] == song_name].index[0]

    # Sort the distances
    distance = sorted(list(enumerate(similer[idx])), reverse=True, key=lambda x: x[1])

    # Get the top 4 recommendations (excluding the original song)
    song = []
    for id in distance[1:21]: 
        song.append(df.iloc[id[0]].song)

    return song


In [67]:
recommender('Unhappy Birthday')

['Birthday',
 'Happy Birthday',
 'You Say Jump',
 'Since My Love Has Gone',
 'Shots In My System',
 'Sometimes',
 'Happy',
 "Can't Die Tonight",
 'Since I Lost My Baby',
 "I'll Be There For You",
 "I Don't Know Why",
 "Baby Won't You Please Come Home",
 'Can You Come Over',
 'Come Back And See Me',
 'Friend Of Mine',
 'Half Loved',
 'Lost In Your Love',
 'All Tore Up',
 "I Don't Care",
 'Lonely People']

In [71]:
pickle.dump(similer,open('similarity.pkl', 'wb'))

In [72]:
pickle.dump(df,open('df.pkl','wb'))

In [77]:
df[df['song'] == 'Some Love']

Unnamed: 0,artist,song,text
757,Chaka Khan,Some Love,(chorus): \rsome love \ryou know it brings m...
