In [16]:
import pandas as pd
import numpy as np

In [17]:
df = pd.read_csv('songdata.csv')
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...


In [18]:
df.shape

(57650, 4)

In [19]:
df = df.sample(n=5000).drop('link', axis=1).reset_index(drop=True)

In [20]:
df.shape

(5000, 3)

In [21]:
df['text'] = df['text'].str.lower().replace(r'[^\w\s]','').replace(r'\n',' ', regex=True)

In [22]:
df['text'][0]

"clean up before she comes   living in a dusty town   something in her eyes   must be the smoke from my lungs   something in her eyes   must be the smoke from my lungs      twenty months has it all   i must be getting old(3x)   something in her eyes   must be the smoke from my lungs   something in her eyes   must be the smoke from my lungs      clean up the dusty town   living in a dusty ton   clean up before she comes   living in a dusty town      i must be getting old   i must be getting old   i'm starting to eat my vegetables  "

In [23]:
import nltk
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

In [24]:
df['text'] = df['text'].apply(lambda x: tokenization(x))

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [27]:
similarity[0]

array([1.        , 0.        , 0.        , ..., 0.0336204 , 0.03866582,
       0.01757307], shape=(5000,))

In [28]:
empty_song_index = df[df['song'] == ''].index
if not empty_song_index.empty:
    print(empty_song_index[0])
else:
    print("No rows found with empty song title")

No rows found with empty song title


# recommedation function

In [29]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [33]:
recommendations = recommendation('As Good As New')
if recommendations:
    print(recommendations)

Song 'As Good As New' not found in the dataset.


In [30]:
recommendation('Alma Mater')

IndexError: index 0 is out of bounds for axis 0 with size 0

In [None]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))