In [84]:
import numpy as np
import pandas as pd
from unidecode import unidecode
import nltk
import os
import re
from gensim.models import Word2Vec, KeyedVectors
from string import punctuation
#Pode ser alterado de acordo com a banda ou parcela do dataset escolhida. Existem musicas em diferentes linguas no dataset.
stopwords = nltk.corpus.stopwords.words('english')
#stopwords = nltk.corpus.stopwords.words('portuguese')
from nltk.stem import SnowballStemmer
import warnings
warnings.filterwarnings('ignore')

In [39]:
pontuacao = list(punctuation)
# Função pra remover pontuação, stopwords e numeros de textos, pois essas são informações irrelevantes 
pontuacao.append('...')
pontuacao.append('\"')
pontuacao.append('\'')
pontuacao.append('``')
pontuacao.append('`')
pontuacao.append('\n')
pontuacao.append('\t')
pontuacao.append('\’')

def processa(txt):
    for p in pontuacao:
        txt = str(txt).replace(p, ' ')
    return ' '.join([t for t in txt.lower().split() 
                     if (len(t)>2) 
                     if not re.search('^\d', t) 
                     and not re.search('.*\d$', t) 
                     and (t not in stopwords)])

In [40]:
# Lendo os datasets
df1 = pd.read_csv('./music/lyrics1.csv')
df2 = pd.read_csv('./music/lyrics2.csv')
df3 = pd.read_csv('./music/lyrics3.csv')
df4 = pd.read_csv('./music/lyrics4.csv')
df5 = pd.read_csv('./music/lyrics5.csv')
df6 = pd.read_csv('./music/lyrics6.csv')
df7 = pd.read_csv('./music/lyrics7.csv')
df8 = pd.read_csv('./music/lyrics8.csv')
df9 = pd.read_csv('./music/lyrics9.csv')


# Concatenando os 2 datasets
df =  pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9])
df.head()

Unnamed: 0.1,Unnamed: 0,Band,Lyrics,Song
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds


In [41]:
df = df.drop(['Unnamed: 0'], axis=1)

In [42]:
df.head()

Unnamed: 0,Band,Lyrics,Song
0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday
1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die
2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside
3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot
4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds


In [43]:
df.shape

(516174, 3)

In [44]:
# Escolha uma ou mais bandas para treinar seu modelo
minhas_musicas = df[df['Band']=='Linkin Park']

In [45]:
minhas_musicas.head()

Unnamed: 0,Band,Lyrics,Song
22562,Linkin Park,"I break down, fear is sinking in\r\nThe cold c...",Not Alone
22563,Linkin Park,I remembered black skies / the lightning all a...,New Divide [Live]
22564,Linkin Park,What do I do to ignore them behind me?\r\nDo I...,By Myself
22565,Linkin Park,"Crawling in my skin\r\nThese wounds, they will...",Crawling
22566,Linkin Park,"Folks, we have a very special guest for you to...",Cure for the Itch


In [46]:
minhas_musicas.shape

(137, 3)

In [47]:
minhas_musicas['musica_processada'] = minhas_musicas['Lyrics'].apply(lambda x: processa(x))

minhas_musicas.head()

Unnamed: 0,Band,Lyrics,Song,musica_processada
22562,Linkin Park,"I break down, fear is sinking in\r\nThe cold c...",Not Alone,break fear sinking cold comes racing skin sear...
22563,Linkin Park,I remembered black skies / the lightning all a...,New Divide [Live],remembered black skies lightning around rememb...
22564,Linkin Park,What do I do to ignore them behind me?\r\nDo I...,By Myself,ignore behind follow instincts blindly hide pr...
22565,Linkin Park,"Crawling in my skin\r\nThese wounds, they will...",Crawling,crawling skin wounds heal fear fall confusing ...
22566,Linkin Park,"Folks, we have a very special guest for you to...",Cure for the Itch,folks special guest tonight would like introdu...


In [48]:
# Criando o corpus. Vale ressaltar que o corpus é uma lista de listas.
corpus = []

In [49]:
# Percorre linha or linha do dataset
for i in minhas_musicas.iterrows():
    corpus.append(i[1]['musica_processada'].split())

In [50]:
corpus

[['break',
  'fear',
  'sinking',
  'cold',
  'comes',
  'racing',
  'skin',
  'searching',
  'way',
  'get',
  'storm',
  'giving',
  'home',
  'leaving',
  'known',
  'alone',
  'arms',
  'stretched',
  'sky',
  'eyes',
  'like',
  'echoes',
  'night',
  'hiding',
  'hell',
  'silent',
  'one',
  'giving',
  'home',
  'leaving',
  'known',
  'giving',
  'home',
  'leaving',
  'known',
  'alone',
  'giving',
  'home',
  'leaving',
  'known',
  'giving',
  'home',
  'alone',
  'alone',
  'alone'],
 ['remembered',
  'black',
  'skies',
  'lightning',
  'around',
  'remembered',
  'flash',
  'time',
  'began',
  'blur',
  'like',
  'startling',
  'sign',
  'fate',
  'finally',
  'found',
  'voice',
  'heard',
  'get',
  'deserve',
  'give',
  'reason',
  'prove',
  'wrong',
  'wash',
  'memory',
  'clean',
  'let',
  'floods',
  'cross',
  'distance',
  'eyes',
  'give',
  'reason',
  'fill',
  'hole',
  'connect',
  'space',
  'let',
  'enough',
  'reach',
  'truth',
  'lies',
  'across

In [78]:
model = Word2Vec(corpus, size=300, window=5, min_count=3, sg=0, iter=50)

model.train(corpus, total_examples=len(corpus), epochs=50)

(656268, 871700)

In [79]:
model.wv.most_similar(positive=['glass'])

[('castle', 0.9925153255462646),
 ('hardly', 0.9855707287788391),
 ('crack', 0.9515681862831116),
 ('anything', 0.6310016512870789),
 ('anywhere', 0.6257785558700562),
 ('breathe', 0.5744950771331787),
 ('dream', 0.555234432220459),
 ('wisdom', 0.5448112487792969),
 ('justice', 0.5410251021385193),
 ('reconciled', 0.5367980599403381)]

In [80]:
model.wv.most_similar(positive=['battle'])

[('symphony', 0.8895705938339233),
 ('armor', 0.7653952240943909),
 ('fuse', 0.7605004906654358),
 ('together', 0.6912732124328613),
 ('wide', 0.5887556672096252),
 ('awake', 0.5605049133300781),
 ('puts', 0.5322713851928711),
 ('react', 0.5306293368339539),
 ('great', 0.5216079950332642),
 ('breaks', 0.5162221193313599)]

In [81]:
model.wv.most_similar(positive=['sorrow'])

[('borrowed', 0.7508074641227722),
 ('hypocrites', 0.7481631636619568),
 ('money', 0.7249287366867065),
 ('replaced', 0.6566493511199951),
 ('mistakes', 0.6483978033065796),
 ('thieves', 0.6464242935180664),
 ('paid', 0.6445585489273071),
 ('done', 0.5902692675590515),
 ('shock', 0.5789891481399536),
 ('separate', 0.5686025023460388)]

### Words Movers Distance



In [85]:
model.init_sims(replace=True)
model.wmdistance(corpus[0], corpus[1])

1.1020856393302407

In [86]:
model.wmdistance(corpus[5], corpus[10])

1.1331873600081577

Ler um nome de uma musica do usuario, comparar com o array de músicas e recomendar as músicas mais semelhantes