In [10]:
# Source : https://github.com/CVxTz/Recommender_keras
# dataset : https://grouplens.org/datasets/movielens/    
# dataset file :  ml-latest-small.zip
# dataset readme : http://files.grouplens.org/datasets/movielens/ml-latest-small-README.html

# This code combines utils.py, recommend.py, and plot_movies.py from that repository into a single script.
# It has been run and validated with Python 3.5 and Keras 2.0.

import pandas as pd
import numpy as np
from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input, Dense, Flatten, Dropout
from keras.layers.merge import Dot, multiply, concatenate
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
from collections import defaultdict
### --- 
# from utils import *  # utils.py
from sklearn.metrics import mean_absolute_error
import pickle
### --- 
#from utils import *  # utils.py
#import pickle
import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
import plotly.plotly as py  # conda install -c plotly plotly
import plotly
import plotly.graph_objs as go
import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')
from keras.models import load_model  # NEW 



### --- start of  Recommender_keras/utils.py
def get_mapping(series):
    """Map each distinct element of *series* to a dense 1-based index.

    Indices are handed out in order of first appearance, so the first
    distinct element gets 1, the second distinct element gets 2, and so on.

    Args:
        series: Any iterable of hashable values (e.g. a pandas Series).

    Returns:
        dict: ``{element: index}`` with one entry per distinct element.
    """
    # dict.fromkeys de-duplicates while remembering first-seen order.
    distinct = dict.fromkeys(series)
    return {element: position for position, element in enumerate(distinct, start=1)}

def get_data():
    """Load the MovieLens ratings and split them chronologically.

    Reads ``ml-latest-small/ratings.csv``, re-indexes both ``movieId`` and
    ``userId`` to dense 1-based ids (order of first appearance, via
    ``get_mapping``), then puts every rating whose timestamp is below the
    80th percentile into the training set and the remaining ratings into
    the test set.  A few split statistics are printed along the way.

    Returns:
        tuple: ``(train, test, max_user, max_work, mapping_work)`` where
        ``train`` and ``test`` are DataFrames with the columns ``userId``,
        ``movieId`` and ``rating``; ``max_user`` and ``max_work`` are the
        largest re-indexed user and movie ids; ``mapping_work`` maps each
        original ``movieId`` to its re-indexed value.
    """
    #data = pd.read_csv("data/ratings.csv")
    data = pd.read_csv("ml-latest-small/ratings.csv")

    mapping_work = get_mapping(data["movieId"])
    data["movieId"] = data["movieId"].map(mapping_work)

    # Users need their own dense index.  Building this mapping from "movieId"
    # again would be a no-op on the movies and leave the raw user ids in place.
    mapping_users = get_mapping(data["userId"])
    data["userId"] = data["userId"].map(mapping_users)

    percentil_80 = np.percentile(data["timestamp"], 80)

    print(percentil_80)
    print(np.mean(data["timestamp"]<percentil_80))
    print(np.mean(data["timestamp"]>percentil_80))

    cols = ["userId", "movieId", "rating"]

    # Older ratings train the model, the most recent ones evaluate it.
    train = data[data.timestamp<percentil_80][cols]

    print(train.shape) #(80668, 3)

    test = data[data.timestamp>=percentil_80][cols]

    print(test.shape) #(20168, 3)

    # Largest ids over the whole data set (train and test together).
    max_user = max(data["userId"].tolist() )
    max_work = max(data["movieId"].tolist() )
    return train, test, max_user, max_work, mapping_work


def get_model_1(max_work, max_user):
    """Build and compile the baseline embedding recommender.

    A movie ("work") index and a user index are each looked up in a
    30-dimensional embedding; the two vectors are multiplied element-wise,
    passed through dropout (rate 0.5), flattened and fed to a single linear
    unit that outputs the predicted rating.

    Args:
        max_work: Largest movie index; the "work" embedding has
            ``max_work + 1`` rows.
        max_user: Largest user index; the "user" embedding has
            ``max_user + 1`` rows.

    Returns:
        A Keras ``Model`` taking ``[movie_indices, user_indices]`` as inputs,
        compiled with MAE loss, the adam optimizer and an MAE metric.
    """
    embedding_dim = 30

    # movie ("work") branch
    work_input = Input(shape=(1,), dtype='int32')
    work_vec = Embedding(max_work + 1, embedding_dim, name="work")(work_input)

    # user branch
    user_input = Input(shape=(1,), dtype='int32')
    user_vec = Embedding(max_user + 1, embedding_dim, name="user")(user_input)

    # element-wise interaction between the two embeddings
    interaction = multiply([work_vec, user_vec])
    interaction = Dropout(0.5)(interaction)
    interaction = Flatten()(interaction)
    rating = Dense(1)(interaction)

    rec_model = Model(inputs=[work_input, user_input], outputs=rating)
    #rec_model.summary()
    rec_model.compile(loss='mae', optimizer='adam', metrics=["mae"])
    return rec_model

def get_model_2(max_work, max_user):
    """Build and compile the embedding recommender with per-id bias terms.

    On top of the 30-dimensional "work" and "user" embeddings, each input
    also gets a 1-dimensional embedding ("workbias" / "userbias").  The
    element-wise product of the two main embeddings is concatenated with the
    user bias and the movie bias, then dropout (rate 0.5), flatten and a
    single linear unit produce the predicted rating.

    Args:
        max_work: Largest movie index; movie embeddings have
            ``max_work + 1`` rows.
        max_user: Largest user index; user embeddings have
            ``max_user + 1`` rows.

    Returns:
        A Keras ``Model`` taking ``[movie_indices, user_indices]`` as inputs,
        compiled with MAE loss, the adam optimizer and an MAE metric.
    """
    embedding_dim = 30
    bias_dim = 1

    # movie ("work") branch
    work_input = Input(shape=(1,), dtype='int32')
    work_vec = Embedding(max_work + 1, embedding_dim, name="work")(work_input)
    work_bias = Embedding(max_work + 1, bias_dim, name="workbias")(work_input)

    # user branch
    user_input = Input(shape=(1,), dtype='int32')
    user_vec = Embedding(max_user + 1, embedding_dim, name="user")(user_input)
    user_bias = Embedding(max_user + 1, bias_dim, name="userbias")(user_input)

    # interaction features followed by the two bias features
    product = multiply([work_vec, user_vec])
    features = concatenate([product, user_bias, work_bias])
    features = Dropout(0.5)(features)
    features = Flatten()(features)
    rating = Dense(1)(features)

    rec_model = Model(inputs=[work_input, user_input], outputs=rating)
    #rec_model.summary()
    rec_model.compile(loss='mae', optimizer='adam', metrics=["mae"])
    return rec_model

def get_model_3(max_work, max_user):
    """Build and compile the bias-aware recommender with a hidden layer.

    Uses the same four embeddings as ``get_model_2`` ("work", "workbias",
    "user", "userbias").  Dropout (rate 0.5) is applied to the element-wise
    product only, before it is concatenated with the user and movie biases.
    The flattened result goes through a 10-unit ReLU layer and then a single
    linear unit that outputs the predicted rating.

    Args:
        max_work: Largest movie index; movie embeddings have
            ``max_work + 1`` rows.
        max_user: Largest user index; user embeddings have
            ``max_user + 1`` rows.

    Returns:
        A Keras ``Model`` taking ``[movie_indices, user_indices]`` as inputs,
        compiled with MAE loss, the adam optimizer and an MAE metric.
    """
    embedding_dim = 30
    bias_dim = 1

    # movie ("work") branch
    work_input = Input(shape=(1,), dtype='int32')
    work_vec = Embedding(max_work + 1, embedding_dim, name="work")(work_input)
    work_bias = Embedding(max_work + 1, bias_dim, name="workbias")(work_input)

    # user branch
    user_input = Input(shape=(1,), dtype='int32')
    user_vec = Embedding(max_user + 1, embedding_dim, name="user")(user_input)
    user_bias = Embedding(max_user + 1, bias_dim, name="userbias")(user_input)

    # dropout hits the interaction term only; the biases are appended afterwards
    product = multiply([work_vec, user_vec])
    product = Dropout(0.5)(product)
    features = concatenate([product, user_bias, work_bias])
    features = Flatten()(features)
    hidden = Dense(10, activation="relu")(features)
    rating = Dense(1)(hidden)

    rec_model = Model(inputs=[work_input, user_input], outputs=rating)
    #rec_model.summary()
    rec_model.compile(loss='mae', optimizer='adam', metrics=["mae"])
    return rec_model

def get_array(series):
    """Return the elements of *series* as a NumPy array with one element per row.

    Args:
        series: Iterable of values (e.g. a pandas Series column).

    Returns:
        numpy.ndarray: Each element of *series* wrapped in its own row.
    """
    # zip over a single iterable yields 1-tuples, i.e. one row per element.
    rows = list(zip(series))
    return np.array(rows)
### --- end of  Recommender_keras/utils.py




### --- start of Recommender_keras/recommend.py
# Train the three recommender variants on the chronological training split and
# report each one's mean absolute error on the held-out split.
train, test, max_user, max_work, mapping_work = get_data()

# Persist the movieId mapping for the plotting section; the context manager
# makes sure the file is flushed and closed.
with open('mapping_work.pkl', 'wb') as mapping_file:
    pickle.dump(mapping_work, mapping_file)  # create pkl file

# Convert the DataFrame columns once instead of once per model.
train_inputs = [get_array(train["movieId"]), get_array(train["userId"])]
train_ratings = get_array(train["rating"])
test_inputs = [get_array(test["movieId"]), get_array(test["userId"])]

for model_number, build_model in enumerate((get_model_1, get_model_2, get_model_3), start=1):
    ####################
    model = build_model(max_work, max_user)

    history = model.fit(train_inputs, train_ratings, epochs=10,
                        validation_split=0.2, verbose=1)

    if model_number == 1:
        # Only model 1's weights are written out; the plotting section
        # reloads them from this file.
        model.save_weights("model_1.h5")  # create hdf5 file

    predictions = model.predict(test_inputs)

    test_performance = mean_absolute_error(test["rating"], predictions)

    print(" Test Mae model %s : %s " % (model_number, test_performance))
### --- End of Recommender_keras/recommend.py

"""
from keras.models import load_model

model.save('my_model.h5')  # creates a HDF5 file 'my_model.h5'

del model  # deletes the existing model

# returns a compiled model
# identical to the previous one
model = load_model('my_model.h5')
"""

### --- start of Recommender_keras/plot_movies.py
# Rebuild model 1, load its trained weights and plot a 2-D t-SNE projection of
# the learned movie embeddings, with movie titles as hover text.
train, test, max_user, max_work, _ = get_data()

#movies = pd.read_csv("data/movies.csv")
movies = pd.read_csv("ml-latest-small/movies.csv")

# original movieId -> title
movie_title = dict(zip(movies["movieId"], movies["title"]))

model = get_model_1(max_user=max_user, max_work=max_work)


model.load_weights("model_1.h5")   # load hdf5


# weight matrix of the "work" embedding layer (one row per movie index)
embedding_work = model.get_layer("work").get_weights()[0]

print(embedding_work)

# NOTE: pickle can execute arbitrary code while loading; only load a mapping
# file that this script wrote itself.
with open("mapping_work.pkl", "rb") as mapping_file:
    mapping_work = pickle.load(mapping_file)  # load pkl (pickle) file

# Python 3: dict.iteritems() no longer exists; use dict.items() instead.
reverse_mapping = {v: k for k, v in mapping_work.items()}

# Embedding row for every titled movie that also appears in the ratings.
embedding = {
    movie_id: embedding_work[mapping_work[movie_id], :]
    for movie_id in movie_title
    if movie_id in mapping_work
}

# Parallel lists: title i belongs to embedding i.
list_titles = [movie_title[movie_id] for movie_id in embedding]
list_embeddings = list(embedding.values())

matrix_embedding = np.array(list_embeddings)

X_embedded = TSNE(n_components=2).fit_transform(matrix_embedding)

vis_x = X_embedded[:, 0]
vis_y = X_embedded[:, 1]


data = [
    go.Scatter(
        x=vis_x,
        y=vis_y,
        mode='markers',
        text=list_titles
    )
]

layout = go.Layout(
    title='Movies'
)

fig = go.Figure(data=data, layout=layout)

plotly.offline.plot(fig, filename='movies.html')

1458635171.0
0.7999920663255187
0.19999801658137967
(80668, 3)
(20168, 3)
Train on 64534 samples, validate on 16134 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Test Mae model 1 : 0.9483242885419084 
Train on 64534 samples, validate on 16134 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Test Mae model 2 : 0.927058987476106 
Train on 64534 samples, validate on 16134 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 Test Mae model 3 : 0.8370298216239841 
1458635171.0
0.7999920663255187
0.19999801658137967
(80668, 3)
(20168, 3)
[[ 0.04973532  0.04982212  0.00041407 ...  0.03007351 -0.00178476
  -0.04186865]
 [ 0.12134685  0.16361068  0.10502332 ...  0.18737651 -0.11195848
  -0.23810238]
 [-0.06399585  0.15757044  0.29366466 ... -0.02647739  0.03571346
   0.05992419]
 ...
 [

'movies.html'