In [None]:
import numpy as np
import math
import pandas as pd
from tqdm import tqdm
import keras
import gc
import re
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import np_utils, generic_utils
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding, Flatten
import cPickle as pickle
from nltk.tokenize import word_tokenize
import spacy
nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
from itertools import izip_longest
from collections import defaultdict

In [None]:
# --- Inference configuration ---
# Tensor geometry / batching.
seq_length = 25   # token slots kept per song name
batch_size = 128  # test rows scored per prediction batch

# Files: trained classifier weights (input) and submission CSV (output).
weights = '../Models/dnn_lstm2_epoch_019.hdf5'
result_path = '../Test/submission_dnn_lstm2.csv'

In [None]:
# Read the song table and derive a cleaned `song_name` column from the
# translated names: keep only ASCII letters, underscores and whitespace,
# then collapse every whitespace run into a single space.
headers = ['song_id', 'translated_names']
songs = pd.read_csv('../New_Data/tr_songs.csv', usecols=headers)  #419615
songs['song_name'] = (
    songs['translated_names']
    .map(str)
    .apply(lambda raw_name: ''.join(re.findall(r'[a-zA-Z_\s]', raw_name)))
    .map(str)
    .apply(lambda kept_chars: re.sub(r'\s+',' ',kept_chars))
)

In [None]:
# Build `test_songs`: one row per song referenced by the test set, each with a
# `song_name`, including placeholder rows for songs absent from `songs`.

# Only the row id and the song id are read from the test file.
test = pd.read_csv('../Data/test.csv', usecols=['id','song_id'])
# Distinct song ids that occur in the test set.
y = pd.DataFrame(test.song_id.unique(), columns=['song_id'], index=None)
# Test song ids that have no row in `songs`.
missing = y[~y.song_id.isin(songs.song_id)]
missing_test = pd.DataFrame(columns = ['song_id', 'song_name'])
# NOTE(review): `missing` is already derived from test.song_id, so the extra
# isin(test.song_id) filter looks redundant -- confirm before removing.
missing_test['song_id'] = missing.loc[missing['song_id'].isin(test.song_id)].song_id
# Songs without a known name all share the same placeholder name.
missing_test['song_name'] = 'General Song'
test_unique_songs = test.song_id.unique() #No. 224753
test_unique_songs = pd.DataFrame(test_unique_songs, columns=['song_id'], index=None)
# Restrict `songs` to ids present in the test set.
test_songs = songs.loc[songs['song_id'].isin(test_unique_songs['song_id'])]
# Keep a single row per song_id (the first occurrence).
duplicated_idx = test_songs.duplicated(subset='song_id', keep='first')
test_songs = test_songs[~duplicated_idx]
# Add the placeholder rows; `missing_test` has no 'translated_names' column.
test_songs = test_songs.append(missing_test)
# reset_index() is called without drop=True, so the previous index is kept
# as an extra 'index' column and the new index runs 0..n-1.
test_songs = test_songs.reset_index()

In [None]:
# Drop the intermediate frames that are no longer needed and ask the
# garbage collector to reclaim their memory right away.
del y, missing_test, test_unique_songs
gc.collect();

In [None]:
def load_song_embeddings():
    """Load and return the Keras model saved in songs_embeddings_100.h5."""
    return keras.models.load_model('../New_Data/LSTM_song_embeddings/songs_embeddings_100.h5')

In [None]:
def generate_songs_tensor(song_names, nlp, steps):
    """Turn song names into a zero-padded tensor of per-token word vectors.

    Every name is UTF-8 decoded and passed to ``nlp``; the vector of token
    ``j`` of name ``i`` is stored in ``tensor[i, j, :]``. Names with fewer
    than ``steps`` tokens leave the remaining slots at zero, longer names
    are truncated to their first ``steps`` tokens.

    Parameters
    ----------
    song_names : iterable of byte strings
        A list, array, pandas Series, ... but not a single string. Items are
        consumed in iteration order, so a Series works whatever its index
        labels are.
    nlp : callable
        Maps a unicode string to an iterable of tokens, each exposing a
        1-D ``vector`` attribute.
    steps : int
        Number of token slots per name.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(song_names), steps, word_vec_dim)``. ``word_vec_dim`` is
        the vector length of the first token encountered; if no name yields
        any token (including empty input) it is 0.

    Raises
    ------
    AssertionError
        If ``song_names`` is a single string.
    """
    assert not isinstance(song_names, basestring)
    # Materialise once: gives positional access independent of Series labels.
    names = list(song_names)
    nb_samples = len(names)

    song_tensor = None
    for i, name in enumerate(names):
        tokens = nlp(name.decode('utf8'))
        for j, token in enumerate(tokens):
            if song_tensor is None:
                # Allocate lazily so the vector width comes from a real token
                # instead of assuming the first name is non-empty.
                song_tensor = np.zeros((nb_samples, steps, token.vector.shape[0]))
            if j >= steps:
                # Remaining tokens would be discarded anyway.
                break
            song_tensor[i, j, :] = token.vector

    if song_tensor is None:
        # No token was seen at all, so the vector width is unknown.
        song_tensor = np.zeros((nb_samples, steps, 0))
    return song_tensor

In [None]:
def embedding_generator(data, song_mapper):
    """Stack the pre-computed name tensor of every row's song into one batch.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``song_id`` column; rows are processed in frame order.
    song_mapper : mapping
        ``song_id`` -> array assignable to a ``(seq_length, 300)`` slot.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(data), seq_length, 300)`` where
        ``seq_length`` is the module-level constant.

    Raises
    ------
    KeyError
        If a ``song_id`` in ``data`` is missing from ``song_mapper``.
    """
    X = np.zeros((len(data), seq_length, 300), dtype='float32')
    # Iterate the column directly: iterrows() would build a Series per row
    # only to read this single field.
    for position, song_id in enumerate(data['song_id']):
        X[position] = song_mapper[song_id]
    return X

In [None]:
# Vectorise every test song name once, then index the resulting
# per-song tensors by song_id for fast lookup while batching.
test_song_mapper = {}
X_test = generate_songs_tensor(test_songs['song_name'], nlp, seq_length)
test_song_mapper = {
    song_id: name_tensor
    for song_id, name_tensor in zip(test_songs['song_id'], X_test)
}

In [None]:
# Re-use the saved song model as a feature extractor: same input, but the
# output is taken from its 'dense_1' layer.
song_model = load_song_embeddings()
song_embedding_model = Model(
    inputs=song_model.input,
    outputs=song_model.get_layer('dense_1').output,
)

In [None]:
# Rebuild the classifier architecture (100 inputs -> Dense(64) -> sigmoid
# unit) so that the trained weights from `weights` can be loaded into it.
input_song_ids_layer = Input(shape=(100,))
intermediate_0 = Dense(64)(input_song_ids_layer)
output_0 = Dense(1, activation='sigmoid')(intermediate_0)
dnn_model = Model(inputs=[input_song_ids_layer], outputs=[output_0])
dnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
dnn_model.load_weights(weights)

In [None]:
# Score the whole test set in batches of `batch_size` rows: each batch is
# turned into name tensors, embedded by `song_embedding_model`, then
# classified by `dnn_model`. Ids and predictions are collected in row order.
predict_list = []
Y_list = []
num_complete_batches = len(test) // batch_size
# One extra, shorter batch picks up the rows left after the complete batches.
num_batches = num_complete_batches + (1 if len(test) % batch_size != 0 else 0)
for i in range(num_batches):
    # Positional slice; iloc clips at the end of the frame, so the last batch
    # holds exactly the remaining rows. (The previous remainder slice was
    # shifted by one: it repeated the last row of the preceding batch and
    # skipped the final test row.)
    subset = test.iloc[i*batch_size : (i+1)*batch_size]
    X_batch = song_embedding_model.predict(embedding_generator(subset, test_song_mapper), verbose=0)
    predicted = dnn_model.predict_on_batch(X_batch)
    Y_list.extend(subset.id)
    predict_list.extend(predicted)

In [None]:
# Assemble the submission: one row per predicted test id, with the sigmoid
# score binarised at 0.5, written without the index column.
headers = ['id', 'target']
new_test = pd.DataFrame(columns=headers)
new_test['id'] = Y_list
new_test['target'] = predict_list
new_test['target'] = new_test['target'].apply(lambda score: 1 if score>0.5 else 0)
new_test.to_csv(result_path, header=headers, index=False)

In [None]:
# Every test row must have exactly one prediction. Compare by value: `is`
# tests object identity, which is not guaranteed for equal integers.
assert len(new_test) == len(test), \
    'expected %d predictions, got %d' % (len(test), len(new_test))