In [107]:
#####################
####PREPROCESSING####
#####################

import pandas as pd 
import string
import numpy as np

# Seed NumPy's global RNG so the np.random.shuffle calls further down produce
# the same ordering on every fresh run.
# NOTE(review): this only seeds NumPy; the Keras backend is presumably seeded
# separately (or not at all), so training itself may still vary — confirm.
np.random.seed(41)

# Not true n-grams: splits a document into windows of n words, with consecutive windows overlapping by 5 words
def ngrams(doc, n, overlap=5):
    """Split a document into overlapping windows of ``n`` words.

    The text is lower-cased and the characters ``. ! ( ) - "`` are removed
    before splitting on whitespace. Windows start every ``n - overlap`` words,
    so consecutive windows share ``overlap`` words. Only full windows are
    produced: trailing words that do not fill a window are dropped, and a
    document shorter than ``n`` words yields an empty list.

    Parameters
    ----------
    doc : str
        Raw document text.
    n : int
        Number of words per window; must be positive.
    overlap : int, optional
        Number of words shared by consecutive windows (default 5, the value
        that used to be hard-coded). Must satisfy ``0 <= overlap < n``.

    Returns
    -------
    list of str
        Each window as a single space-joined string.

    Raises
    ------
    ValueError
        If ``n`` is not positive or ``overlap`` is not in ``[0, n)``.
    """
    if n <= 0:
        raise ValueError("n must be a positive number of words, got %r" % (n,))
    # A step of zero or less would either crash range() or silently produce
    # nothing useful, so reject it up front with a readable message.
    if not 0 <= overlap < n:
        raise ValueError(
            "overlap must satisfy 0 <= overlap < n, got overlap=%r, n=%r" % (overlap, n)
        )

    words = doc.lower().translate(str.maketrans('', '', ".!()-\"")).split()
    step = n - overlap
    return [" ".join(words[i:i + n]) for i in range(0, len(words) - n + 1, step)]

#gets first n words in a document
def getfirstnwords(doc, n):
    """Return the first ``n`` words of ``doc`` as one space-joined string.

    The text is lower-cased and the characters ``. ! ( ) - "`` are deleted
    before it is split on whitespace. If the document has fewer than ``n``
    words, all of them are returned.
    """
    strip_table = str.maketrans('', '', ".!()-\"")
    cleaned = doc.lower().translate(strip_table)
    leading_words = cleaned.split()[:n]
    return " ".join(leading_words)

In [108]:
# Get training and test data

# Load the lyrics table and pull out the 'text' column for each artist of interest
data = pd.read_csv('songdata.csv.zip')
artist_col = data['artist']
kanyetext = data.loc[artist_col == 'Kanye West', 'text']
draketext = data.loc[artist_col == 'Drake', 'text']
eminemtext = data.loc[artist_col == 'Eminem', 'text']
migostext = data.loc[artist_col == 'Migos', 'text']
lilwaynetext = data.loc[artist_col == 'Lil Wayne', 'text']

# The position of an artist in this list is that artist's integer class label
text = [kanyetext, draketext, eminemtext, migostext, lilwaynetext]

# Flatten into two parallel arrays: one lyric string per song, and its label
X = np.array([song for lyrics in text for song in lyrics])
y = np.array([label for label, lyrics in enumerate(text) for _ in lyrics])

print(len(y))
print(len(X))

433
433


In [109]:
# shuffle! 
# Build one random permutation of song indices and apply it to both arrays so
# every lyric stays aligned with its label.
# Note: X and y are rebound in place, so re-running this cell shuffles the
# already-shuffled arrays again.
randomize = np.arange(len(y))
np.random.shuffle(randomize)
X = X[randomize]
y = y[randomize]

In [110]:
# Song-level 70/30 split: the first 70% of the shuffled songs are for training
n_train = int(len(X) * 0.7)

# Expand every training song into 40-word windows (consecutive windows share
# 5 words); each window inherits the label of the song it was cut from.
realX_train = []
realY_train = []
for lyrics, label in zip(X[:n_train], y[:n_train]):
    windows = ngrams(lyrics, 40)
    realX_train.extend(windows)
    realY_train.extend([label] * len(windows))

text_train = np.array(realX_train)
y_train = np.array(realY_train)

# Shuffle the windows so those from the same song are no longer adjacent
randomize2 = np.arange(len(y_train))
np.random.shuffle(randomize2)
text_train, y_train = text_train[randomize2], y_train[randomize2]

print(len(text_train))
print(len(y_train))
print(y_train[0])

# Test data: one sample per held-out song, made of its first 40 words
y_test = y[n_train:]
text_test = [getfirstnwords(held_out, 40) for held_out in X[n_train:]]

3946
3946
1


In [111]:
import spacy
# Load spaCy's English pipeline through the 'en' shortcut name.
# NOTE(review): shortcut names such as 'en' appear to be unsupported from
# spaCy v3 onward, where a full package name (e.g. 'en_core_web_sm') is
# expected — confirm against the installed spaCy version before upgrading.
nlp = spacy.load('en')

In [112]:
####################
#### spacy time ####
####################

# tokenize words
# SymbolTable & LSTM code adapted directly from https://github.com/ix-ai-s1-17/lstm-examples/blob/master/AGNewsKeras.ipynb

class SymbolTable:
    """Word-to-integer vocabulary with a fallback id for unknown words.

    Ids are handed out sequentially, starting at ``starting_symbol``, the
    first time a word is passed to :meth:`lookup_add`. Words that were never
    added resolve to ``unknown_symbol`` in non-strict lookups. Id 0 is never
    assigned by the table itself; :meth:`reverse` labels it ``'~~NONE~~'``.
    """

    def __init__(self, starting_symbol=2, unknown_symbol=1):
        self.s       = starting_symbol  # next id to hand out
        self.unknown = unknown_symbol   # id returned for words not in the table
        self.d       = dict()           # word -> id

    def lookup_add(self, w):
        """Return the id of ``w``, assigning the next free id if it is new."""
        if w not in self.d:
            self.d[w] = self.s
            self.s += 1
        return self.d[w]

    def lookup(self, w, strict=False):
        """Return the id of ``w`` without modifying the table.

        With ``strict=False`` (default) an unseen word maps to the unknown
        symbol; with ``strict=True`` it raises ``KeyError``.
        """
        return self.d[w] if strict else self.d.get(w, self.unknown)

    def reverse(self):
        """Return an id -> word dict, including the two reserved labels.

        Id 0 maps to ``'~~NONE~~'`` and the table's unknown id maps to
        ``'~~UNKNOWN~~'``.
        """
        # dict.items() works on Python 3; the former iteritems() call is
        # Python 2 only and raised AttributeError here.
        r = {v: k for k, v in self.d.items()}
        r[0] = '~~NONE~~'
        # Use the configured unknown id rather than assuming it is 1.
        r[self.unknown] = '~~UNKNOWN~~'
        return r

    def num_words(self):
        """Return how many distinct words have been added."""
        return len(self.d)

    def num_symbols(self):
        """Return the next id to be assigned.

        With the default constructor arguments this is one more than the
        highest id in use, i.e. the size of the id space including 0 and the
        unknown id.
        """
        return self.s

In [113]:
# Parse all text
# Run the spaCy pipeline over every training window ...
text_train_parsed = [nlp(str(window)) for window in text_train]
print(len(text_train_parsed))

# ... and over every held-out test snippet
text_test_parsed = [nlp(str(snippet)) for snippet in text_test]
print(len(text_test_parsed))

3946
130


In [114]:
# Convert text to integer symbols
# One shared vocabulary: the training pass adds words to it, while the test
# pass only looks words up (unseen words map to the unknown symbol).
symbol_table = SymbolTable()

def preprocess_text(parsed_text, symbol_table, init=True):
    """Encode parsed documents as lists of integer symbol ids.

    Every token of every sentence is stripped of surrounding whitespace,
    lower-cased and mapped through ``symbol_table``.

    Parameters
    ----------
    parsed_text : iterable
        Parsed documents; each must expose ``.sents`` (iterable of sentences,
        each an iterable of tokens with a ``.text`` attribute).
    symbol_table : object
        Vocabulary providing ``lookup_add`` and ``lookup``.
    init : bool, optional
        If True (default) new words are added to the table via
        ``lookup_add``; if False only ``lookup`` is used, so the table is
        left unchanged.

    Returns
    -------
    list of list of int
        One flat id sequence per document, in input order.
    """
    if init:
        encode = symbol_table.lookup_add
    else:
        encode = symbol_table.lookup

    encoded_docs = []
    for doc in parsed_text:
        ids = []
        for sentence in doc.sents:
            for token in sentence:
                ids.append(encode(token.text.strip().lower()))
        encoded_docs.append(ids)
    return encoded_docs

In [115]:
# Training text goes first and is allowed to grow the vocabulary ...
symbols_train = preprocess_text(text_train_parsed, symbol_table, init=True)
# ... test text is then encoded lookup-only, so unseen words become the unknown id
symbols_test = preprocess_text(text_test_parsed, symbol_table, init=False)

In [119]:
from keras.preprocessing import sequence
# Pad every training sequence to the length of the longest one
x_train = sequence.pad_sequences(symbols_train)
print(x_train.shape)
# Pad/truncate the test sequences to the training width. The width is read
# from x_train instead of being hard-coded, so it stays correct if the data or
# the tokenisation changes.
x_test = sequence.pad_sequences(symbols_test, maxlen=x_train.shape[1])
print(x_test.shape)

(3946, 67)
(130, 67)


In [120]:
# Number of distinct artist labels present in the data
n_classes = len({label for label in y})
print(n_classes)

5


In [124]:
#########################
#### Building Model ####
########################

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.optimizers import Adam

d = 128  # embedding dimension

# Embedding -> two stacked LSTMs (dropout after each) -> softmax over artists.
# The embedding's vocabulary size is the symbol table's next free id, so every
# id that has been handed out is covered.
model = Sequential([
    Embedding(symbol_table.num_symbols(), output_dim=d),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(n_classes, activation='softmax'),
])

# Sparse categorical cross-entropy because the labels are plain integer ids
model.compile(
    optimizer=Adam(lr=0.0005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [125]:
# Train for 12 epochs; the held-out test set doubles as the validation set
model.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=12,
    validation_data=(x_test, y_test),
)

Train on 3946 samples, validate on 130 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x149311f28>

In [136]:
songs = ["""i had to did  
cuz i had the kid  
duh o
i had to did  
cuz i had the kid  
duh oh we ain't his  
so what you get a b****  
i'm the beat it and it  
why what it put as i make it on the way  
it's why(tird, why you trapping for the one the hander  
i got me back a concert  
so like they ain't the way to dee  
i got me what it need and it  
oh oh oh  
i don't have it in you no  
i wanna feel the girlfriend and pronto""", 
        
        """don't hit again?  
dog are you f***ing k
don't hit again?  
dog are you f***ing king  
i got the could me  
i got to the don't was don't never mean  
i got the corder the pare  
i got the secen  
i got to the beand the same  
i got the could to the down  
i got the cound  
i got to the count  
i never tell the cramp  
i don't never think i got the ready  
i got a show it  
i got a love the down  
i got the drip in the come  
i can see i got a could the see it  
i got the down """, 
        """w on a little low arm they want me to st
w on a little low arm they want me to stacked on eating  
don't see i sheowed leavin' me, know  
you sanse us 'cause b**** like a strest to be p****  
i just ? stay   
after woulcer and ecathions fame to beeod  
and i'm hadiigs and i got me you know  
her walking up vice  
long bormer thall  
tell it winndo, tell her us get your niggas coldedded kany, karier  
man, i would the way so fall man that hit what like trying to see is to rain """,
        
        """" me a good  
show me a good  
show me a "
 me a good  
show me a good  
show me a b paint  
you scaopetes wit easundnmoog money, musa diama g?  
i'm cucking mildg, i came you like weezy in a ooly  
fufl solatle, f***ed cooly had this hit us up dat she, i mezs treing it fall?  
me smellull wall as turffout, diak, dem aint  
in thes p***ill bigs buo guess yes ogh, long to yoom man  
stan for it f*** it, i moss toskaffy, jurt us i'm lefes all fam ranger up  
lsy len  
until this w"""]

# Parse the generated lyrics with spaCy and encode them with the existing
# vocabulary (lookup only, so unseen words map to the unknown symbol).
# NOTE(review): unlike the train/test text, these samples are not passed
# through ngrams()/getfirstnwords(), so punctuation and line breaks are kept
# and the inputs are not limited to 40 words — confirm this is intended.
songs = [nlp(lyrics) for lyrics in songs]
symbols_songs = preprocess_text(songs, symbol_table, init=False)

# Pad to the longest sample in this batch, then show the softmax output per sample
songs_test = sequence.pad_sequences(symbols_songs)
np.set_printoptions(suppress=True, precision=4)
np.array(model.predict(songs_test), dtype=np.float64)

array([[ 0.0016,  0.9982,  0.    ,  0.    ,  0.0001],
       [ 0.9994,  0.0004,  0.0002,  0.    ,  0.    ],
       [ 0.0008,  0.9991,  0.    ,  0.    ,  0.0001],
       [ 0.0002,  0.9997,  0.    ,  0.    ,  0.0001]])

In [None]:
# Show the per-artist softmax output for the generated lyrics, rounded to 4
# decimals and without scientific notation
np.set_printoptions(precision=4, suppress=True)
model.predict(songs_test)

# Label key — 0: Kanye West, 1: Drake, 2: Eminem, 3: Migos, 4: Lil Wayne