In [12]:
#####################
####PREPROCESSING####
#####################

import pandas as pd 
import string
import numpy as np

np.random.seed(41)

def ngrams(doc, n): 
    doc = doc.lower().translate(str.maketrans('', '', ".!()-\""))
    doc = doc.split()
    grams = []
    for i in range(len(doc) - n + 1): 
        gram = " ".join(doc[i:i+n])
        grams.append(gram)
    return grams

ngrams("hello darkness my old friend i've come to talk with you again .", 3)

['hello darkness my',
 'darkness my old',
 'my old friend',
 "old friend i've",
 "friend i've come",
 "i've come to",
 'come to talk',
 'to talk with',
 'talk with you',
 'with you again']

In [13]:
# Get training and test data

data = pd.read_csv('songdata.csv.zip')
kanyetext = data.loc[data['artist'] == 'Kanye West', 'text']
draketext = data.loc[data['artist'] == 'Drake', 'text']
eminemtext = data.loc[data['artist'] == 'Eminem', 'text']
migostext = data.loc[data['artist'] == 'Migos', 'text']
lilwaynetext = data.loc[data['artist'] == 'Lil Wayne', 'text']

text = [kanyetext, draketext, eminemtext, migostext, lilwaynetext]
X = []
y = []
for artist in range(len(text)):
    for song in text[artist]:
        #gram = ngrams(j, 3)
        X.append(song)
        y.append(artist)
# append each word as a training data 
     #remove these characters from the text
X = np.array(X)
y = np.array(y)

print(len(y))
print(len(X))

433
433


In [31]:
# shuffle! 
randomize = np.arange(len(y))
np.random.shuffle(randomize)
X = X[randomize]
y = y[randomize]

# Train and test split 70/30 split
n_train, n_test = int(len(X) *0.7), int(len(X) * 0.3)
text_train = X[:n_train]
text_test = X[n_train : n_train+n_test]
y_train = y[:n_train]
y_test = y[n_train : n_train+n_test]

In [33]:
####################
#### spacy time ####
####################

import spacy

nlp = spacy.load('en')

# SymbolTable & LSTM code adapted directly from https://github.com/ix-ai-s1-17/lstm-examples/blob/master/AGNewsKeras.ipynb

class SymbolTable:
    """Wrapper for dict to encode unknown symbols"""

    def __init__(self, starting_symbol=2, unknown_symbol=1): 
        self.s       = starting_symbol
        self.unknown = unknown_symbol
        self.d       = dict()

    def lookup_add(self, w):
        if w not in self.d:
            self.d[w] = self.s
            self.s += 1
        return self.d[w]

    def lookup(self, w, strict=False):
        return self.d[w] if strict else self.d.get(w, self.unknown)

    def reverse(self):
        r = {v: k for k, v in self.d.iteritems()}
        r[0], r[1] = '~~NONE~~', '~~UNKNOWN~~'
        return r

    def num_words(self):
        return len(self.d)

    def num_symbols(self):
        return self.s

In [34]:
# Parse all text
text_train_parsed = [nlp(str(s)) for s in text_train]
print(len(text_train_parsed))
text_test_parsed = [nlp(str(s)) for s in text_test]
print(len(text_test_parsed))

303
129


In [35]:
# Convert text to integer symbols
symbol_table = SymbolTable()

def preprocess_text(parsed_text, symbol_table, init=True):
    mapper = symbol_table.lookup_add if init else symbol_table.lookup
    return [[mapper(w.text.strip().lower()) for s in t.sents for w in s] for t in parsed_text]

In [37]:
symbols_train = preprocess_text(text_train_parsed, symbol_table, True)
symbols_test = preprocess_text(text_test_parsed, symbol_table, False)

print(len(symbols_train))
print(len(symbols_train[0]))
print(len(symbols_test))
print(len(symbols_test[0]))

303
584
129
585


In [39]:
from keras.preprocessing import sequence

MAX_LENGTH = 50 #not sure what this does!
x_train = sequence.pad_sequences(symbols_train)
print(x_train.shape)
x_test = sequence.pad_sequences(symbols_test)
print(x_test.shape)

(303, 1073)
(129, 1045)


In [40]:
n_classes = len(set(y))
print(n_classes)

5


In [41]:
#########################
#### Building Model ####
########################

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.optimizers import Adam

d = 128

model = Sequential()
model.add(Embedding(symbol_table.num_symbols(), output_dim=d))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(n_classes, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(lr=0.0005),
              metrics=['accuracy'])

In [43]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=12, epochs=20)

Train on 303 samples, validate on 129 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1218dc8d0>

In [56]:
songs = ["""i had to did  
cuz i had the kid  
duh o
i had to did  
cuz i had the kid  
duh oh we ain't his  
so what you get a bitch  
i'm the beat it and it  
why what it put as i make it on the way  
it's why(tird, why you trapping for the one the hander  
i got me back a concert  
so like they ain't the way to dee  
i got me what it need and it  
oh oh oh  
i don't have it in you no  
i wanna feel the girlfriend and pronto""", """"you  
i've loved and i've lost  
  
i've"
you  
i've loved and i've lost  
  
i've put on ellom over mackin', you don't know hello shit, what i'm thinkin brob, helw 1?p  
but  but that wayn't you even, jump  
you in my ones som' goin' to shit into  
all the ceporsontally i'm in me car like will pugpy at the cartenned  
shit sayan't this yaan it's finost  
boy i'm not lost on you  
shine see em sele , it's all 'cause it and the world more them woestern  
that i'm a doold buogh
t""", """"a, you know i'm gettin' money)  
cause w"
a, you know i'm gettin' money)  
cause we ain't till that don't do that i can't get my prise  
  
i'm brough it up in the root started  
i got to tell like i'm gonna take a fuck it  
  
what you wanna bring you the close you need to me  
and then right now and shady i can't try to thought i know  
yeah  
  
what you try to be think you wanna get my hands  
i get that  
pronto  
  
i got the needs the since  
got the poppin a wish no don""", """"ow the same way that they used to,  
and"
ow the same way that they used to,  
and i'm the start  
  
[hook]  
i'm start to be call stared to the bad  
  
now you see the i don't try no get the real  
  
what it love the mast the what's the feel  
in a show you ever been of it  
what you start the looving of crazy  
oh yeah, and the niggas and started in the good  
i look me wait a way  
no how i don't try to see the back to me  
who the stared the show me  
i got it all the ri"""]

In [57]:
songs = [nlp(s) for s in songs]

symbols_songs = preprocess_text(songs, symbol_table, False)

songs_test = sequence.pad_sequences(symbols_songs)

model.predict(songs_test)

array([[  6.04427466e-03,   9.92598653e-01,   2.64872971e-04,
          3.95533367e-04,   6.96645118e-04],
       [  8.00389040e-04,   5.33509185e-04,   9.91834164e-01,
          1.00995286e-03,   5.82210999e-03],
       [  4.65855375e-03,   9.92249370e-01,   3.80822981e-04,
          6.06190995e-04,   2.10499042e-03],
       [  7.95657467e-03,   9.46620047e-01,   8.24880600e-03,
          2.92020831e-02,   7.97252543e-03]], dtype=float32)

In [70]:
np.set_printoptions(suppress=True, precision=4)
model.predict(songs_test)

# "kanyetext, draketext, eminemtext, migostext, lilwaynetext"

array([[ 0.006 ,  0.9926,  0.0003,  0.0004,  0.0007],
       [ 0.0008,  0.0005,  0.9918,  0.001 ,  0.0058],
       [ 0.0047,  0.9922,  0.0004,  0.0006,  0.0021],
       [ 0.008 ,  0.9466,  0.0082,  0.0292,  0.008 ]], dtype=float32)