In [1]:
import sklearn
import numpy as np
import pandas as pd
import mlflow
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('spotify_millsongdata.csv')
df.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
df.shape

(57650, 4)

In [4]:
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [5]:
link = df['link']
df = df.drop('link', axis = 1)
link.head()

0    /a/abba/ahes+my+kind+of+girl_20598417.html
1         /a/abba/andante+andante_20002708.html
2          /a/abba/as+good+as+new_20003033.html
3                    /a/abba/bang_20598415.html
4        /a/abba/bang+a+boomerang_20002668.html
Name: link, dtype: object

In [6]:
df.head()

Unnamed: 0,artist,song,text
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante","Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...


In [7]:
df = df.sample(5000)

In [8]:
df['text']

24445    Yeah, yeah, oh,  \r\nYeah, yeah,  \r\n  \r\nYo...
55230    I quit thinking about killing myself  \r\nWhen...
7518     Hello, goodbye my friend  \r\nFeels like the s...
15231    [Slimm]  \r\nI'm about two and a quarter from ...
1309     Daytime husler, you're out of line.  \r\nDon't...
                               ...                        
24232    [Verse 1]  \r\nThere's a fire starting in my h...
57646    Power to the workers  \r\nMore power  \r\nPowe...
47287    A psycho drive twisted in my head  \r\nSilence...
35741    Feelings come and go, I've never known  \r\nSo...
18489    You have got a body to  \r\nTo make a man just...
Name: text, Length: 5000, dtype: object

In [9]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ').replace(r'\n', ' ', regex = True)

In [10]:
df.head()

Unnamed: 0,artist,song,text
24445,Backstreet Boys,Bye Bye Love,"yeah, yeah, oh, \r yeah, yeah, \r \r you s..."
55230,Violent Femmes,When You Died,i quit thinking about killing myself \r when ...
7518,Hanson,My Own Sweet Time,"hello, goodbye my friend \r feels like the st..."
15231,Outkast,Dirt Work,[slimm] \r i'm about two and a quarter from r...
1309,Bette Midler,Daytime Hustler,"daytime husler, you're out of line. \r don't ..."


In [11]:
import nltk
#nltk.download()

In [12]:
from nltk.stem.porter import PorterStemmer

In [13]:
stemmer = PorterStemmer()

In [14]:
def tokenize(text):
    token = nltk.word_tokenize(text)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

In [15]:
tokenize("you are beautiful , preciosly good , the sun is shining brightly")

'you are beauti , preciosli good , the sun is shine brightli'

In [16]:
df['text'].apply(lambda x: tokenize(x))

24445    yeah , yeah , oh , yeah , yeah , you should be...
55230    i quit think about kill myself when you die ju...
7518     hello , goodby my friend feel like the start a...
15231    [ slimm ] i 'm about two and a quarter from ro...
1309     daytim husler , you 're out of line . do n't y...
                               ...                        
24232    [ vers 1 ] there 's a fire start in my heart r...
57646    power to the worker more power power to the wo...
47287    a psycho drive twist in my head silenc broken ...
35741    feel come and go , i 've never known someth lo...
18489    you have got a bodi to to make a man just lose...
Name: text, Length: 5000, dtype: object

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [19]:
matrix = tfid.fit_transform(df['text'])

In [20]:
cosine_similarity(matrix)

array([[1.        , 0.0147175 , 0.02168844, ..., 0.00562474, 0.01954853,
        0.03243678],
       [0.0147175 , 1.        , 0.02307627, ..., 0.00739246, 0.05534821,
        0.03773373],
       [0.02168844, 0.02307627, 1.        , ..., 0.00269918, 0.01136047,
        0.00758226],
       ...,
       [0.00562474, 0.00739246, 0.00269918, ..., 1.        , 0.02138909,
        0.05970706],
       [0.01954853, 0.05534821, 0.01136047, ..., 0.02138909, 1.        ,
        0.10352472],
       [0.03243678, 0.03773373, 0.00758226, ..., 0.05970706, 0.10352472,
        1.        ]])