# Autoencoder for Recommender Systems


In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error

In [2]:
# Load the raw ratings table from the working directory and preview its
# first five rows.
df = pd.read_csv(filepath_or_buffer='ratings.csv')
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Load the movie table; it is merged with the ratings and the recommendations
# on `movieId` in the "Recommendations" section further down.
df_movies = pd.read_csv('movies.csv')


In [4]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the ratings to [0, 1] so they live in the same range as the sigmoid
# output layer of the autoencoder defined later in the notebook.
r = df['rating'].values.astype(float)
min_max_scaler = MinMaxScaler()
# MinMaxScaler works on 2-D (n_samples, n_features) input, hence the reshape.
x_scaled = min_max_scaler.fit_transform(r.reshape(-1, 1))
df_normalized = pd.DataFrame(x_scaled)

# Write the flattened array back instead of the one-column DataFrame: assigning
# a DataFrame aligns on the index and would silently produce NaN if `df` ever
# stopped having a default 0..n-1 index. A positional assignment cannot.
df['rating'] = x_scaled.ravel()

# NOTE(review): the lowest observed rating is mapped to exactly 0.0, which is
# also the value used to fill unrated cells when the user-item matrix is
# built, so the two cases become indistinguishable downstream. Confirm this
# is acceptable.

### User–item matrix

In [5]:
# Display the ratings table after the `rating` column has been rescaled.
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,0.777778,964982703
1,1,3,0.777778,964981247
2,1,6,0.777778,964982224
3,1,47,1.000000,964983815
4,1,50,1.000000,964982931
...,...,...,...,...
100831,610,166534,0.777778,1493848402
100832,610,168248,1.000000,1493850091
100833,610,168250,1.000000,1494273047
100834,610,168252,1.000000,1493846352


In [6]:
# Build the dense user x movie table: one row per userId, one column per
# movieId, cell value = scaled rating. Pairs with no rating come out of the
# pivot as NaN and are replaced with 0.
matrix = (
    df.pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

In [7]:
# Display the user x movie matrix (pandas truncates the rendering).
matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.777778,0.000000,0.777778,0.0,0.0,0.777778,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.777778,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.444444,0.000000,0.000000,0.0,0.0,0.000000,0.444444,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.777778,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.444444,0.333333,0.333333,0.0,0.0,0.000000,0.000000,0.0,0.0,0.777778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.555556,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.777778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Row/column labels of the matrix, kept as plain lists so that positional
# indices can later be translated back into userId / movieId values.
users = matrix.index.to_list()
items = matrix.columns.to_list()

# Distinct counts taken from the ratings table itself.
num_users = df['userId'].nunique()
num_items = df['movieId'].nunique()


### Weights and biases for autoencoder

In [9]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

# Fix the graph-level seed before any random op is created so the weight
# initialisation (and therefore the training run) is repeatable.
tf.set_random_seed(42)

# Layer sizes: the input/output width is one unit per movie; the two hidden
# layers compress it down to 10 and then 5 units.
num_input = num_items
num_hidden_1 = 10
num_hidden_2 = 5

# A batch of user rows from the user-item matrix (any batch size).
X = tf.placeholder(tf.float64, [None, num_input])


def _random_normal_variable(shape):
    """Return a float64 tf.Variable of `shape` initialised from a standard normal."""
    return tf.Variable(tf.random_normal(shape, dtype=tf.float64))


# Weight matrices, keyed by the layer that uses them.
weights = {
    'encoder_h1': _random_normal_variable([num_input, num_hidden_1]),
    'encoder_h2': _random_normal_variable([num_hidden_1, num_hidden_2]),
    'decoder_h1': _random_normal_variable([num_hidden_2, num_hidden_1]),
    'decoder_h2': _random_normal_variable([num_hidden_1, num_input]),
}

# Bias vectors, one per layer, sized to that layer's output width.
biases = {
    'encoder_b1': _random_normal_variable([num_hidden_1]),
    'encoder_b2': _random_normal_variable([num_hidden_2]),
    'decoder_b1': _random_normal_variable([num_hidden_1]),
    'decoder_b2': _random_normal_variable([num_input]),
}

Instructions for updating:
non-resource variables are not supported in the long term


In [10]:
def _sigmoid_layer(inputs, weight_key, bias_key):
    """Apply one fully connected layer (inputs @ W + b) followed by a sigmoid.

    `weight_key` / `bias_key` select the parameters from the module-level
    `weights` and `biases` dictionaries.
    """
    pre_activation = tf.add(tf.matmul(inputs, weights[weight_key]), biases[bias_key])
    return tf.nn.sigmoid(pre_activation)


def encoder(x):
    """Map `x` through the two encoder layers and return the second layer's output."""
    hidden = _sigmoid_layer(x, 'encoder_h1', 'encoder_b1')
    return _sigmoid_layer(hidden, 'encoder_h2', 'encoder_b2')


# Building the decoder

def decoder(x):
    """Map `x` through the two decoder layers and return the second layer's output."""
    hidden = _sigmoid_layer(x, 'decoder_h1', 'decoder_b1')
    return _sigmoid_layer(hidden, 'decoder_h2', 'decoder_b2')


# Construct model: the decoder consumes the encoder's output, so `decoder_op`
# is the full input -> reconstruction graph.
encoder_op = encoder(X)
decoder_op = decoder(encoder_op)

# The network is trained to reproduce its own input: the target is the
# placeholder X itself and the prediction is the decoder output.
y_true, y_pred = X, decoder_op

In [11]:
# Reconstruction objective: mean squared error between input and output,
# minimised with Adagrad at learning rate 0.1.
# NOTE(review): every cell of the input contributes to this loss, including
# the zeros that stand for "not rated" -- confirm that is intended.
loss = tf.losses.mean_squared_error(labels=y_true, predictions=y_pred)
optimizer = tf.train.AdagradOptimizer(learning_rate=0.1).minimize(loss)

# Empty frame; the training cell fills it with the model output.
predictions = pd.DataFrame()

# Define evaluation metrics
# Integer placeholders (shape left unspecified) wired into a streaming
# precision metric; `pre` reads the value and `pre_op` updates it.
# NOTE(review): no cell visible in this notebook runs `pre` / `pre_op`.
eval_x = tf.placeholder(tf.int32)
eval_y = tf.placeholder(tf.int32)
pre, pre_op = tf.metrics.precision(labels=eval_x, predictions=eval_y)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [12]:
# Initialisers for the trainable variables and for the local variables
# created by the metric ops.
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()

epochs = 200
batch_size = 64

# Work on a plain ndarray view of the user-item matrix and keep `matrix`
# itself untouched, so this cell can be re-run and later cells still see the
# labelled DataFrame.
ratings_array = np.asarray(matrix)

# At least one batch, even when there are fewer users than `batch_size`
# (otherwise array_split and the average below would divide by zero).
num_batches = max(1, ratings_array.shape[0] // batch_size)
batches = np.array_split(ratings_array, num_batches)

with tf.Session() as session:
    session.run(init)
    session.run(local_init)

    for i in range(epochs):
        avg_cost = 0

        # One optimizer step per batch; accumulate the batch losses.
        for batch in batches:
            _, batch_loss = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += batch_loss

        avg_cost /= num_batches
        if i % 20 == 0:
            print("Epoch: {} Loss: {}".format(i, avg_cost))

    print("Predictions...")

    # Reconstruct every user's row in one pass; the result is a plain ndarray,
    # so nothing below needs the session.
    preds = session.run(decoder_op, feed_dict={X: ratings_array})

# Build the frame directly: DataFrame.append was removed in pandas 2.0.
predictions = pd.DataFrame(preds)

# Wide (user position x movie position) -> long (one row per pair).
predictions = predictions.stack().reset_index(name='rating')
predictions.columns = ['userId', 'movieId', 'rating']

# Translate positional indices back to real ids with a vectorised lookup
# instead of a per-row Python lambda (there is one row per user-movie pair).
predictions['userId'] = np.asarray(users)[predictions['userId'].to_numpy()]
predictions['movieId'] = np.asarray(items)[predictions['movieId'].to_numpy()]

Epoch: 0 Loss: 0.3499911626180013
Epoch: 20 Loss: 0.34087451961305404
Epoch: 40 Loss: 0.33076194590992397
Epoch: 60 Loss: 0.3189229700300429
Epoch: 80 Loss: 0.3073517382144928
Epoch: 100 Loss: 0.30055052704281277
Epoch: 120 Loss: 0.2970028618971507
Epoch: 140 Loss: 0.29493587215741474
Epoch: 160 Loss: 0.2935793134901259
Epoch: 180 Loss: 0.29259854555130005
Predictions...


In [13]:
predictions

Unnamed: 0,userId,movieId,rating
0,1,1,0.289992
1,1,2,0.740901
2,1,3,0.443068
3,1,4,0.539897
4,1,5,0.744034
...,...,...,...
5931635,610,193581,0.733143
5931636,610,193583,0.341572
5931637,610,193585,0.733629
5931638,610,193587,0.574337


In [14]:
print("Filtering out items in training set")

# (userId, movieId) pairs of the predictions and of the known ratings.
keys = ['userId', 'movieId']
i1 = predictions.set_index(keys).index
i2 = df.set_index(keys).index

# Keep only pairs the user has not rated, rank each user's candidates by
# predicted rating (highest first) and keep the first 20 per user.
already_rated = i1.isin(i2)
recs = (
    predictions[~already_rated]
    .sort_values(by=['userId', 'rating'], ascending=[True, False])
    .groupby('userId')
    .head(20)
)
# Optional export, left disabled:
#recs.to_csv('recs.tsv', sep='\t', index=False, header=False)

Filtering out items in training set


### Recommendations

In [15]:
# Attach the movie columns to every rating row.
# Any movie columns already present on `df` (i.e. this cell was run before)
# are dropped first, so re-running the cell gives the same frame instead of
# suffixed duplicate columns. On a first run the drop is a no-op.
movie_only_cols = df_movies.columns.difference(['movieId'])
df_join = pd.merge(
    df_movies,
    df.drop(columns=movie_only_cols, errors='ignore'),
    how='inner',
    on='movieId',
)
df = df_join

### Top 10 movies rated highest by the user

In [20]:
# Rows of user 4, ordered by rating (highest first), first ten only.
(
    df.loc[df['userId'] == 4]
    .sort_values(by='rating', ascending=False)
    .head(10)
)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
64612,4967,No Man's Land (2001),Drama|War,4,1.0,1007569424
20138,898,"Philadelphia Story, The (1940)",Comedy|Drama|Romance,4,1.0,964623347
25460,1203,12 Angry Men (1957),Drama,4,1.0,945174025
54965,3508,"Outlaw Josey Wales, The (1976)",Action|Adventure|Drama|Thriller|Western,4,1.0,964538351
24855,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance,4,1.0,964538763
24644,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,4,1.0,964538763
24449,1188,Strictly Ballroom (1992),Comedy|Romance,4,1.0,964538500
45517,2599,Election (1999),Comedy,4,1.0,964622684
23741,1103,Rebel Without a Cause (1955),Drama,4,1.0,964539723
23080,1086,Dial M for Murder (1954),Crime|Mystery|Thriller,4,1.0,964539723


In [17]:
# Attach the movie columns to every recommendation row.
# Movie columns already present on `recs` (cell run before) are dropped first
# so a re-run reproduces the same frame rather than adding suffixed
# duplicates. On a first run the drop is a no-op.
movie_only_cols = df_movies.columns.difference(['movieId'])
df_join = pd.merge(
    df_movies,
    recs.drop(columns=movie_only_cols, errors='ignore'),
    how='inner',
    on='movieId',
)
recs = df_join

### Top 10 movie recommendations

In [19]:
# Recommendations for user 4, best predicted rating first. `recs` can hold
# more than ten rows per user, so cap the display at ten to match the
# "Top 10" heading and the rated-movies cell above.
(
    recs.loc[recs['userId'] == 4]
    .sort_values(by='rating', ascending=False)
    .head(10)
)

Unnamed: 0,movieId,title,genres,userId,rating
2749,3117,Ride with the Devil (1999),Drama|Romance|War,4,0.972014
3,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,4,0.968846
7729,7937,"Silence, The (Tystnaden) (1963)",Drama,4,0.965706
8852,53435,Hostel: Part II (2007),Crime|Horror|Thriller,4,0.964267
5900,6022,American Me (1992),Drama,4,0.963754
4033,3971,"Private Eyes, The (1981)",Comedy|Mystery,4,0.962146
2235,3055,Felicia's Journey (1999),Thriller,4,0.960319
5299,5456,Wagons East (1994),Comedy|Western,4,0.960012
1639,2404,Rambo III (1988),Action|Adventure|Thriller|War,4,0.958525
6529,6237,"Glenn Miller Story, The (1953)",Drama,4,0.956424
