In [1]:
import numpy as np
import math
import pandas as pd
from tqdm import tqdm
import keras
import gc
import re
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import np_utils, generic_utils
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Embedding, Flatten
import cPickle as pickle
from nltk.tokenize import word_tokenize
import spacy
nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
from itertools import izip_longest
from collections import defaultdict

Using TensorFlow backend.


In [28]:
# Inference settings used by the cells below.
# seq_length is the number of token positions kept per song name; batch_size is
# not referenced in the cells visible here (presumably used by a prediction loop).
batch_size, seq_length = 128, 25
# Checkpoint restored into the DNN, and destination of the probability CSV.
weights = "../Models/dnn_lstm_song_user_epoch_015.hdf5"
result_path = "../Test/submission_dnn_user_song.csv"

In [3]:
def _clean_song_name(raw_name):
    """Keep only ASCII letters, underscores and whitespace, then squeeze each whitespace run to one space."""
    kept_chars = ''.join(re.findall(r'[a-zA-Z_\s]', str(raw_name)))
    return re.sub(r'\s+', ' ', kept_chars)


headers = ['song_id', 'translated_names']
songs = pd.read_csv('../New_Data/tr_songs.csv', usecols=headers)  # original note: 419615 (presumably the row count)
# NOTE(review): str() turns a missing translated name (NaN) into the text 'nan',
# which survives the letter filter - confirm that this is the intended placeholder.
songs['song_name'] = songs['translated_names'].apply(_clean_song_name)

In [32]:
# Read only the columns needed to pair each test row with a user and a song.
test = pd.read_csv('../Data/test.csv', usecols=['id','msno', 'song_id'])

# Distinct test song ids, held in two frames because a later cell deletes both names.
y = pd.DataFrame({'song_id': test['song_id'].unique()})
test_unique_songs = pd.DataFrame({'song_id': test['song_id'].unique()})  # No. 224753

# Test songs without a row in `songs` receive a placeholder name.
missing = y.loc[~y['song_id'].isin(songs['song_id'])]
missing_test = missing[['song_id']].assign(song_name='General Song')

# Known test songs, keeping the first row for every song_id.
test_songs = songs[songs['song_id'].isin(test_unique_songs['song_id'])]
duplicated_idx = test_songs.duplicated(subset='song_id', keep='first')
test_songs = test_songs.loc[~duplicated_idx]

# Stack known and placeholder songs; reset_index() keeps the old labels in an
# 'index' column and gives the frame a fresh 0..n-1 index.
test_songs = pd.concat([test_songs, missing_test]).reset_index()

In [13]:
# Drop the intermediate frames that are no longer needed, then run a collection pass.
del y, missing_test, test_unique_songs
gc.collect();

In [14]:
def load_song_embeddings():
    """Load the saved song-embedding Keras model from disk and return it."""
    return keras.models.load_model(
        '../New_Data/LSTM_song_embeddings/songs_embeddings_100.h5')

def load_user_model():
    """Load the saved user model without its stored compile state, recompile it and return it."""
    user_net = keras.models.load_model(
        '../New_Data/model_user_embeddings/user_embeddings_100.h5',
        compile = False)
    adam = keras.optimizers.Adam(lr=1e-4)
    user_net.compile(optimizer=adam,
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])
    return user_net


def generate_songs_tensor(song_names, nlp, steps):
    """Turn a collection of song names into a zero-padded tensor of word vectors.

    Parameters
    ----------
    song_names : iterable of str/bytes
        One name per sample (list, tuple, pandas Series, ...). Items are read
        positionally, so a Series does not need a 0..n-1 index. A single
        string is rejected by the assertion below.
    nlp : callable
        Called as ``nlp(text)``; must return a sized, indexable sequence of
        tokens, each exposing a 1-D ``.vector`` array.
    steps : int
        Number of token positions kept per name; extra tokens are dropped and
        shorter names are padded with zero rows.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(song_names), steps, vector_dim)``.

    Raises
    ------
    ValueError
        If no name yields any token, so the vector width cannot be inferred.
    """
    # bytes + unicode text covers `basestring` on Python 2 and bytes/str on Python 3.
    assert not isinstance(song_names, (bytes, type(u'')))
    names = list(song_names)
    song_tensor = None
    for i, name in enumerate(names):
        # Only byte strings need decoding; text that is already unicode is passed through.
        if isinstance(name, bytes):
            name = name.decode('utf8')
        tokens = nlp(name)
        if song_tensor is None and len(tokens) > 0:
            # The vector width comes from the first token seen anywhere, not from
            # the first name, which may tokenize to nothing. Rows skipped before
            # this point are all-zero anyway.
            song_tensor = np.zeros((len(names), steps, tokens[0].vector.shape[0]))
        for j in range(min(len(tokens), steps)):
            song_tensor[i, j, :] = tokens[j].vector

    if song_tensor is None:
        raise ValueError('cannot infer the word-vector size: no song name produced a token')
    return song_tensor

In [15]:
# Embed every test song name once, then key the per-song matrices by song id.
X_test = generate_songs_tensor(test_songs['song_name'], nlp, seq_length)
test_song_mapper = {song_id: song_matrix
                    for song_id, song_matrix in zip(test_songs['song_id'], X_test)}

In [9]:
song_model = load_song_embeddings()
#SVG(model_to_dot(song_model).create(prog='dot', format='svg'))
user_model = load_user_model()


# Sub-models that share the loaded models' inputs but stop at the named layer,
# so their output is that layer's output ('dense_1' / 'embedding_1').
song_embedding_model = Model(inputs=song_model.input,outputs=song_model.get_layer('dense_1').output)
user_embedding_model = Model(inputs=user_model.input,outputs=user_model.get_layer('embedding_1').output)

# Open the pickle in binary mode and close the handle deterministically.
# NOTE(security): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
with open('../New_Data/model_user_embeddings/msno_mapper_py2.pkl', 'rb') as mapper_file:
    msno_mapper = pickle.load(mapper_file)

In [10]:
def embedding_generator(data, song_mapper):
    """Stack the per-song matrices for every row of ``data`` into one array.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'song_id' column; rows are processed in order.
    song_mapper : mapping
        Maps each song id to a value assignable to a ``(seq_length, 300)``
        slice. A missing id raises ``KeyError``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(data), seq_length, 300)``; ``seq_length``
        is read from the notebook's global configuration.
    """
    # Iterate over the single column that is needed instead of building a full
    # row Series per record with iterrows().
    song_ids = data['song_id']
    X_song = np.zeros((len(song_ids), seq_length, 300), dtype=np.float32)
    for position, song_id in enumerate(song_ids):
        X_song[position] = song_mapper[song_id]
    return X_song

def user_batch(data, msno_mapper):
    """Map every 'msno' value in ``data`` through ``msno_mapper``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an 'msno' column; rows are processed in order.
    msno_mapper : mapping
        Maps each msno to its encoded value (presumably an integer index for
        the user embedding - verify against the pickle's contents). A missing
        msno raises ``KeyError``.

    Returns
    -------
    numpy.ndarray
        1-D array with one mapped value per row. The dtype is inferred from the
        mapped values, so they are stored in full. The previous
        ``dtype='str'`` buffer held one-character strings and kept only the
        first character of each value.
    """
    return np.array([msno_mapper[msno] for msno in data['msno']])

In [26]:
# Two flat feature inputs: a 100-wide song vector and a 50-wide user vector.
# Creation order is kept (song input first, then user input).
input_song_ids_layer = Input(shape=(100,))
input_msno_layer = Input(shape=(50,))

# The user vector is placed first in the concatenated 150-wide feature vector.
combined_input = keras.layers.concatenate(
    [input_msno_layer, input_song_ids_layer])
# Hidden layer is created without an activation argument; the output is a single sigmoid unit.
intermediate_0 = Dense(64)(combined_input)
output_0 = Dense(1, activation='sigmoid')(intermediate_0)

dnn_model = Model(
    inputs=[input_msno_layer, input_song_ids_layer],
    outputs=[output_0],
)
dnn_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [29]:
# Print the layer table first, then leave the SVG as the cell's final expression:
# a notebook only renders the value of the last expression, so an SVG built on an
# earlier line is discarded without being shown.
dnn_model.summary()
SVG(model_to_dot(dnn_model).create(prog='dot', format='svg'))

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_10 (InputLayer)            (None, 50)            0                                            
____________________________________________________________________________________________________
input_9 (InputLayer)             (None, 100)           0                                            
____________________________________________________________________________________________________
concatenate_5 (Concatenate)      (None, 150)           0           input_10[0][0]                   
                                                                   input_9[0][0]                    
____________________________________________________________________________________________________
dense_9 (Dense)                  (None, 64)            9664        concatenate_5[0][0]     

In [30]:
# Restore the saved parameters from the checkpoint path held in `weights` into dnn_model.
dnn_model.load_weights(weights)

In [None]:
# Assemble the submission table with the columns id / prob / target and write it out.
# NOTE(review): Y_list and predict_list are not defined in any cell visible here -
# presumably they come from a prediction loop; confirm they exist before running this cell.
headers = ['id', 'prob', 'target']
new_test = pd.DataFrame(columns=headers)
new_test['id'] = Y_list
new_test['prob'] = predict_list
# Hard label: 1 only when the probability is strictly greater than 0.5, otherwise 0.
new_test['target'] = new_test['prob'].apply(lambda x: 1 if x>0.5 else 0)
# The full table (id, prob, target) is written to result_path without the index.
new_test.to_csv(result_path, index=False, header=['id', 'prob', 'target'])

# A second file without the probability column is written to the working directory.
new_test = new_test.drop(['prob'], axis=1)
new_test.to_csv('submitted_dnn_songs_user.csv', index=False)