In [None]:
import numpy as np
import math
import pandas as pd
import re
import gc
from tqdm import tqdm
import keras
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import np_utils, generic_utils
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding, Flatten 
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
import cPickle as pickle
from nltk.tokenize import word_tokenize
import spacy
nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
from itertools import izip_longest
from collections import defaultdict

In [None]:
# Training configuration shared by the cells below.
# Number of passes the training loop makes over `train`.
num_epochs = 20
# A weights checkpoint is written whenever epoch % model_save_interval == 0.
model_save_interval = 5
# Number of rows of `train` fed to each train_on_batch call.
batch_size = 128
# Path prefix for checkpoint files; '_epoch_NNN.hdf5' is appended when saving.
model_file_name = '../Models/dnn_lstm2'

In [None]:
# Read only the song id and translated title columns of the song metadata.
headers = ['song_id', 'translated_names']
songs = pd.read_csv('../New_Data/tr_songs.csv', usecols=headers)  #419615


def _clean_song_name(raw_name):
    """Keep ASCII letters, underscores and whitespace; squeeze whitespace runs to one space."""
    kept_chars = re.findall(r'[a-zA-Z_\s]', str(raw_name))
    return re.sub(r'\s+', ' ', ''.join(kept_chars))


# `song_name` is the cleaned form of `translated_names`.
songs['song_name'] = songs['translated_names'].apply(_clean_song_name)

In [None]:
# Training rows: only the song id and the binary `target` label are loaded.
train = pd.read_csv('../Data/train.csv', usecols=['song_id', 'target'], index_col=False)

In [None]:
# One row per distinct song id appearing in `train`.
y = pd.DataFrame({'song_id': train['song_id'].unique()})
# Training song ids that have no row in the `songs` metadata table.
has_metadata = y['song_id'].isin(songs['song_id'])
missing = y.loc[~has_metadata]

In [None]:
# Give every training song without metadata the placeholder title 'General Song'.
# The result keeps the index of `missing` and has columns song_id, song_name.
missing_train = (
    missing.loc[missing['song_id'].isin(train['song_id']), ['song_id']]
    .assign(song_name='General Song')
)

In [None]:
def load_song_embeddings():
    """Load and return the Keras model stored in the song-embeddings .h5 file."""
    return keras.models.load_model('../New_Data/LSTM_song_embeddings/songs_embeddings_100.h5')

In [None]:
def generate_songs_tensor(song_names, nlp, steps):
    """Build a zero-padded tensor of per-token word vectors for each song name.

    Parameters
    ----------
    song_names : iterable of str
        Song titles (list, array, pandas Series, ...). Must not be a single
        string. Items are read in iteration order, so a pandas Series with a
        non-default index is handled correctly.
    nlp : callable
        Called as ``nlp(text)``; must return a sequence of tokens, each
        exposing a 1-D ``.vector`` array (this is how the spaCy pipeline is
        used by the caller).
    steps : int
        Maximum number of tokens kept per name; longer names are truncated,
        shorter ones are left zero-padded.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(song_names), steps, word_vec_dim)`` where
        ``word_vec_dim`` is the length of the first token vector encountered.

    Raises
    ------
    ValueError
        If no name yields any token, so the vector width cannot be inferred.
    """
    assert not isinstance(song_names, (bytes, type(u''))), \
        'song_names must be a collection of names, not a single string'

    # Materialise once so that indexing is positional, not label-based.
    names = list(song_names)

    def _tokenize(name):
        # Byte strings are decoded as UTF-8; already-decoded text is passed through.
        if isinstance(name, bytes):
            name = name.decode('utf8')
        return nlp(name)

    # Infer the vector width from the first name that actually has a token;
    # a name can be empty after cleaning, so names[0] alone is not reliable.
    word_vec_dim = None
    for name in names:
        tokens = _tokenize(name)
        if len(tokens) > 0:
            word_vec_dim = tokens[0].vector.shape[0]
            break
    if word_vec_dim is None:
        raise ValueError('cannot infer word vector size: no song name produced any token')

    song_tensor = np.zeros((len(names), steps, word_vec_dim))
    for i, name in enumerate(names):
        tokens = _tokenize(name)
        for j in range(min(steps, len(tokens))):
            song_tensor[i, j, :] = tokens[j].vector

    return song_tensor

In [None]:
# Distinct song ids present in the training data.  #No. 359966
train_unique_songs = pd.DataFrame({'song_id': train['song_id'].unique()})

# Metadata rows for those songs, keeping the first row of any repeated song_id.
in_train = songs['song_id'].isin(train_unique_songs['song_id'])
train_songs = songs.loc[in_train]
duplicated_idx = train_songs.duplicated(subset='song_id', keep='first')

# Add the placeholder rows for songs without metadata and renumber the rows
# (reset_index keeps the old index as an 'index' column).
train_songs = train_songs.loc[~duplicated_idx].append(missing_train).reset_index()

In [None]:
# Drop intermediate frames that are no longer used and run a garbage-collection pass.
del y, missing_train, train_unique_songs
gc.collect();

In [None]:
# Number of token positions kept per song name.
seq_length = 25

# One (seq_length, word_vec_dim) slice of word vectors per row of train_songs.
X = generate_songs_tensor(train_songs['song_name'], nlp, seq_length)

# song_id -> that song's slice of X.
train_song_mapper = {
    song_id: name_vectors
    for song_id, name_vectors in zip(train_songs['song_id'], X)
}

In [None]:
# Load the saved song-embedding Keras model (see load_song_embeddings).
song_model = load_song_embeddings()
# Uncomment to render the model graph inline:
#SVG(model_to_dot(song_model).create(prog='dot', format='svg'))

In [None]:
# Sub-model mapping song_model's input to the output of its 'dense_1' layer;
# its predictions are used as the features for dnn_model.
dense_1_output = song_model.get_layer('dense_1').output
song_embedding_model = Model(inputs=song_model.input, outputs=dense_1_output)

In [None]:
def embedding_generator(data, song_mapper, steps=None, word_vec_dim=300):
    """Stack the pre-computed word-vector matrices for the songs in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to convert; must contain a ``song_id`` column.
    song_mapper : mapping
        Maps each song id to an array assignable to a
        ``(steps, word_vec_dim)`` slot.
    steps : int, optional
        Number of token positions per song. Defaults to the notebook-level
        ``seq_length`` constant, which is what the original code always used.
    word_vec_dim : int, optional
        Width of each word vector (300 by default, as previously hard-coded).

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(data), steps, word_vec_dim)`` whose
        i-th entry is ``song_mapper[song_id]`` for the i-th row of ``data``.

    Raises
    ------
    KeyError
        If a song id in ``data`` is not present in ``song_mapper``.
    """
    if steps is None:
        steps = seq_length

    # Iterate over the id column directly; iterrows() would build a Series
    # per row, which is needlessly slow for a function called on every batch.
    song_ids = data['song_id'].values
    X = np.zeros((len(song_ids), steps, word_vec_dim), dtype='float32')
    for position, song_id in enumerate(song_ids):
        X[position] = song_mapper[song_id]
    return X

In [None]:
# Classifier head over the 100-dimensional song feature vectors:
# Dense(64) with no activation, then a single sigmoid unit.
# NOTE(review): the input width of 100 presumably matches the 'dense_1' output
# of the song-embedding model - confirm.
input_song_ids_layer = Input(shape=(100,))
intermediate_0 = Dense(64)(input_song_ids_layer)
output_0 = Dense(1, activation='sigmoid')(intermediate_0)

dnn_model = Model(inputs=[input_song_ids_layer], outputs=[output_0])
dnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
#SVG(model_to_dot(dnn_model).create(prog='dot', format='svg'))
#print dnn_model.summary()

In [None]:
# Train dnn_model on features produced on the fly by song_embedding_model.
# num_epochs, batch_size, model_save_interval and model_file_name are the
# configuration values defined near the top of the notebook.
num_rows = len(train)
for k in range(num_epochs):
    progbar = generic_utils.Progbar(num_rows)

    if k % 5 == 0:
        # Scale the learning rate by 0.1 by updating the optimizer's variable
        # in place. Rebinding `optimizer.lr` to a new tensor expression does
        # not reach a training function that has already been built.
        # NOTE(review): as in the original, this also fires at epoch 0 -
        # confirm that an immediate 10x reduction is intended.
        current_lr = keras.backend.get_value(dnn_model.optimizer.lr)
        keras.backend.set_value(dnn_model.optimizer.lr, current_lr * .1)

    # Stepping by batch_size visits every row exactly once; the last batch is
    # simply shorter when len(train) is not a multiple of batch_size.
    for start in range(0, num_rows, batch_size):
        subset = train.iloc[start:start + batch_size]
        X_batch = song_embedding_model.predict(
            embedding_generator(subset, train_song_mapper), verbose=0)
        Y_batch = subset['target'].values
        loss, acc = dnn_model.train_on_batch(X_batch, Y_batch)
        progbar.add(X_batch.shape[0], values=[("train loss", loss), ("acc", acc)])

    if k % model_save_interval == 0:
        dnn_model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k))

# Final checkpoint, named after the last epoch index, without relying on the
# loop variable still being bound.
dnn_model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(num_epochs - 1))
dnn_model.save('../Models/dnn_lstm_2.h5')