In [54]:
import pandas as pd
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist


In [55]:
df=pd.read_csv(r"C:\Users\KOWSALYA\kowsi personal\music_recommendation_system\spotify_million_dataset\spotify_millsongdata.csv")

In [56]:
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [57]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB
None


In [58]:
df.shape


(57650, 4)

In [59]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [60]:
df =df.sample(5000).drop('link', axis=1).reset_index(drop=True)


In [61]:
df.head(10)

Unnamed: 0,artist,song,text
0,Vince Gill,One,Every little whisper every little sound \r\nB...
1,George Jones,Barbara Joy,Will you be there in the courtyard for the joy...
2,Peter Gabriel,Eindringling,"Ich find ueberall einlass, ich knacke fenster ..."
3,Nickelback,Photograph,Look at this photograph \r\nEvery time I do i...
4,Patti Smith,Helpless,[Originally by Neil Young] \r\n \r\nThere is...
5,Carpenters,"Fun, Fun, Fun","Well, she got her daddy's car \r\nAnd she cru..."
6,George Harrison,Shanghai Surprise,I can't understand how I've gone astray \r\nI...
7,Carol Banawa,I Believe,"I see from look in your eyes, there's \r\nSom..."
8,Z-Ro,Guerilla Till I Die,"[Hook - 2x] \r\nGuerilla till I die, mama don..."
9,Dewa 19,Air Mata,[Verse 1] \r\nAir mata yang telah jatuh memba...


In [62]:
df['text'][0]

"Every little whisper every little sound  \r\nBrings me comfort whenever you're around  \r\nA heart full of wonder and sweet reverie  \r\nGives me a reason a reason to believe  \r\n  \r\n[Chorus]  \r\nForever's just begun  \r\nWe'll never turn and run  \r\nSlowly we've become  \r\nOne, one  \r\n  \r\nSlowly we've become one  \r\nI love the way we're different  \r\nAnd the way we're the same  \r\nMaking love to each other  \r\nIs like shelter from the rain  \r\n  \r\nIsn't it amazing  \r\nWhat I see in your eyes  \r\nI'll be your partner  \r\nAnd never leave your side  \r\n  \r\n[Chorus]\r\n\r\n"

In [63]:
df.shape

(5000, 3)

# **TEXT CLEANING/TEXT PREPROCESSING**

In [65]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)


In [69]:
import nltk
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def tokenization(txt):
    tokens = nltk.word_tokenize(txt)
    stemming = [stemmer.stem(w) for w in tokens]
    return " ".join(stemming)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KOWSALYA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [71]:
df['text'] = df['text'].apply(lambda x: tokenization(x))


In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [75]:
tfidvector = TfidfVectorizer(analyzer='word',stop_words='english')
matrix = tfidvector.fit_transform(df['text'])
similarity = cosine_similarity(matrix)

In [77]:
similarity[0]


array([1.        , 0.0171981 , 0.        , ..., 0.05675147, 0.01491873,
       0.05773865])

In [79]:
df[df['song'] == 'Crying Over You']


Unnamed: 0,artist,song,text
1650,UB40,Crying Over You,cri over you in the morn cri over you in the e...


In [81]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(similarity[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [83]:
recommendation('Crying Over You')


["Cry, Cry Darlin'",
 'I Never Cry',
 'Everyday I Have To Cry',
 "If It Don't Work Out",
 'He Cried',
 'Cry Like A Baby',
 'The Boy In The Bubble',
 'When You Cry',
 "You're The One",
 "Cryin' Time",
 "Call Me When You're Sober",
 "I'm Sorry",
 'No Woman No Cry',
 'You Made Me Love You',
 'Laugh To Keep From Crying',
 'Angels Cry',
 'Crying Time',
 'The Crying Game',
 'Nobody Knows The Way I Feel This Morning',
 'Hold Out']

In [85]:
import pickle
pickle.dump(similarity,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))