In [17]:
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName('Elephas_App').setMaster('local[8]')
try:
    sc.stop()
except:
    pass
sc = SparkContext(conf=conf)

In [18]:
import pandas as pd
import numpy as np
import os

In [19]:
from keras.models import Sequential, Model
from keras.layers import Embedding, Reshape, Activation, Input, Dense, Flatten, Dropout
from keras.layers.merge import Dot, multiply, concatenate
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import skipgrams
from keras import losses
from keras import metrics

In [20]:
def get_model(max_work, max_user):
    dim_embedddings = 30
    bias = 1
    # inputs
    w_inputs = Input(shape=(1,), dtype='int32')
    w = Embedding(max_work+1, dim_embedddings, name="work")(w_inputs)
    w_bis = Embedding(max_work + 1, bias, name="workbias")(w_inputs)

    # context
    u_inputs = Input(shape=(1,), dtype='int32')
    u = Embedding(max_user+1, dim_embedddings, name="user")(u_inputs)
    u_bis = Embedding(max_user + 1, bias, name="userbias")(u_inputs)
    
    #genre_inputs = Input(shape=(20,1,), dtype='int32')
    #genre = Embedding(21,dim_embedddings, name="genre")(genre_inputs)
    
    o = multiply([w, u])
    o = Dropout(0.5)(o)
    o = concatenate([o, u_bis, w_bis])
    o = Flatten()(o)
    o = Dense(10, activation="relu")(o)
    o = Dense(1)(o)

    rec_model = Model(inputs=[w_inputs, u_inputs], outputs=o)
    #rec_model.summary()
    rec_model.compile(loss = losses.mean_squared_error, optimizer='adam', metrics=[metrics.mae])

    return rec_model

In [21]:
genres=["Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama",\
            "Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western","IMAX","(no genres listed)"]
genres_index = {}
for i, gen in enumerate(genres):
    genres_index[gen]=i

def genres_to_array(g):
    genre_array = np.array([0]*len(genres))
    for i in g.split("|"):
        genre_array[genres_index[i]]=1
    return genre_array
    

In [22]:
def get_data(path):
    genre = pd.read_csv(path+"/movies.csv")
    genre["genres"] = genre["genres"].apply(genres_to_array)
    genre = genre[["movieId","genres"]]

    rating = pd.read_csv(path+"/ratings.csv")
    
    data = pd.merge(genre, rating, how='right', on='movieId')
    
    data["genres"] = data["genres"].apply(lambda g:g if isinstance(g, list) else [0]*len(genres))
    percentil_80 = np.percentile(data["timestamp"], 80)
    

    print(percentil_80)

    print(np.mean(data["timestamp"]<percentil_80))

    print(np.mean(data["timestamp"]>percentil_80))

    cols = ["userId", "movieId", "genres", "rating"]

    train = data[data.timestamp<percentil_80][cols]

    print(train.shape)

    test = data[data.timestamp>=percentil_80][cols]

    print(test.shape)

    max_user = max(data["userId"].tolist() )
    max_work = max(data["movieId"].tolist() )


    return train, test, max_user, max_work

In [23]:
def get_array(series):
    return np.array([[element] for element in series])

In [24]:
train, test, max_user, max_work = get_data("../data")

975768738.0
0.7999968006686603
0.19999920016716508
(800164, 4)
(200045, 4)


In [25]:
model = get_model(max_work, max_user)

In [26]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
work (Embedding)                (None, 1, 30)        118590      input_7[0][0]                    
__________________________________________________________________________________________________
user (Embedding)                (None, 1, 30)        181230      input_8[0][0]                    
__________________________________________________________________________________________________
multiply_4

In [27]:
x_train = [get_array(train["movieId"]), get_array(train["userId"])]
y_train = get_array(train["rating"])

In [28]:
from elephas.utils.rdd_utils import to_simple_rdd
rdd = to_simple_rdd(sc, x_train, y_train)



In [75]:
#from elephas.spark_model import SparkModel

#spark_model = SparkModel(model, frequency='epoch', mode='asynchronous')
#spark_model.fit(rdd, epochs=20, batch_size=32, verbose=0, validation_split=0.1)

In [29]:
history = model.fit(x_train, y_train, nb_epoch=10,
                    validation_split=0.2, verbose=1)

Instructions for updating:
Use tf.cast instead.


  from ipykernel import kernelapp as app


Train on 640131 samples, validate on 160033 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [41]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

In [31]:
predictions = model.predict([get_array(test["movieId"]), get_array(test["userId"])])

In [102]:
test_performance = mean_absolute_error(test["rating"], predictions)

In [43]:
test_performance

0.811744453944615

In [163]:
from IPython.display import Image, HTML, display
import tmdbsimple as tmdb
tmdb.API_KEY = '31362d194b7b8e33df9fdbfa6e138b49'
def get_resc_to_user(the_id,num=10):
    rating = pd.read_csv("../data/ratings.csv")
    rating = rating[rating["userId"]!=the_id]
    movie = rating[["movieId"]].reset_index(drop=True).drop_duplicates()
    score = pd.DataFrame(model.predict([[the_id]*len(movie),movie]))
    movie["score"]=score
    return movie.sort_values(by=['score'],ascending=False).head(num)

def display_resc_to_user(the_id,num=10):
    recs = get_resc_to_user(the_id,num)
    display(HTML("<h2>Get recommended movies for user id %s</h2>" % the_id))
    display(HTML("<h4>Recommended movies:</h4>"))
    rec_html = "<table border=0>"
    i = 0
    for i,rec in recs.iterrows():
        movie_id = int(rec.movieId)
        img_url = get_poster_url(movie_id)
        score = rec.score
        title = get_movie_title(movie_id)
        rec_html += "<td><h5>%s</h5><img src=%s width=150></img></td><td><h5>%2.3f</h5></td>" % (title,img_url, score)
        i += 1
        if i % 5 == 0:
            rec_html += "</tr><tr>"
    rec_html += "</tr></table>"
    display(HTML(rec_html))

def get_movie_title(movie_id):
    movie = pd.read_csv("../data/movies.csv")
    return movie[movie["movieId"]==movie_id][["title"]].values[0][0]

def get_poster_url(movie_id):
    links = pd.read_csv("../data/links.csv")
    id = int(links[links["movieId"]==movie_id]["tmdbId"])
    IMAGE_URL = 'https://image.tmdb.org/t/p/w500'
    try:
        movie = tmdb.Movies(id).info()
        poster_url = IMAGE_URL + movie['poster_path'] if 'poster_path' in movie and movie['poster_path'] is not None else ""
        return poster_url
    except:
        return "KEY_ERR"
    return 
    

In [164]:
display_resc_to_user(1,5)

0,1,2,3
"Secret Garden, The (1993)",4.883,Mr. Saturday Night (1992),4.851


In [96]:
sc.stop()