In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
%cd "/content/drive/My Drive/colab notebooks/movie_recommend/"

/content/drive/My Drive/colab notebooks/movie_recommend


In [3]:
import os
from zipfile import ZipFile

with ZipFile('ml-latest.zip', 'r') as f:
  f.extractall('.')

In [2]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import pandas as pd

#### Load data

In [5]:
import os
os.listdir('ml-latest-small')

['tags.csv', 'movies.csv', 'links.csv', 'README.txt', 'ratings.csv']

In [256]:
from IPython.display import display

df_ratings = pd.read_csv('ml-latest-small/ratings.csv')
df_movies = pd.read_csv('ml-latest-small/movies.csv')

display(df_ratings.head())
display(df_movies.head())

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


#### Data clearning and features enginering

In [257]:
import re
# Product
df_prods = df_movies
df_prods['year'] = df_prods['title'].apply(lambda x: x.split('(')[-1].replace(')','').strip().split('–')[0]
                                  if '(' in x else 0)
df_prods['name'] = df_prods['title'].apply(lambda x: '('.join(x.split('(')[:-1]) )

df_prods['very old'] = df_prods['year'].apply( lambda x: int(x) <= 1980 if x != 0
                                              else 0 )
df_prods['old'] = df_prods['year'].apply( lambda x: 2000 >= int(x) > 1980 )
df_prods['new'] = df_prods['year'].apply( lambda x: int(x) > 2000 )

df_prods['genres'] = df_prods['genres'].apply(lambda x: x.split('|'))
df_prods['product'] = range(0,len(df_prods))

In [258]:
from datetime import datetime
# Users
df_users = df_ratings
datetime_col = df_users['timestamp'].apply(datetime.fromtimestamp)

df_users['hour'] = datetime_col.apply(lambda x: x.hour)
df_users['weekday'] = datetime_col.apply(lambda x: x.weekday())

df_users = df_users.merge(df_prods[['movieId', 'product']])
df_users['userId'] = df_users['userId'].apply(lambda x: x-1)

df_users = df_users.rename(columns={"rating": "y", "userId": 'user'})

In [259]:
# Clean data
df_prods = df_prods.drop(['movieId','year','title'], axis=1).set_index('product')
df_users = df_users[['user', 'product', 'hour', 'weekday', 'y']]

features enginering

In [260]:
genres = set([item for list_ in df_prods['genres'].values for item in list_])

In [261]:
for genre in genres:
  df_prods[genre] = df_prods['genres'].apply(lambda x: genre in x)

df_prods = df_prods.drop(['genres'], axis=1)
df_prods.head()

Unnamed: 0_level_0,name,very old,old,new,Crime,Horror,Thriller,Fantasy,Drama,Animation,...,Mystery,Children,Western,War,Film-Noir,Action,Comedy,IMAX,Sci-Fi,Adventure
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Toy Story,False,True,False,False,False,False,True,False,True,...,False,True,False,False,False,False,True,False,False,True
1,Jumanji,False,True,False,False,False,False,True,False,False,...,False,True,False,False,False,False,False,False,False,True
2,Grumpier Old Men,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
3,Waiting to Exhale,False,True,False,False,False,False,False,True,False,...,False,False,False,False,False,False,True,False,False,False
4,Father of the Bride Part II,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False


splitting in train and test sets

In [262]:
df_context = df_users[['user', 'product', 'hour', 'weekday']]

In [263]:
df_users = df_users.pivot_table(index='user', columns='product', values='y')

missing_cols = list(  set(df_prods.index) - set(df_users.columns) )
for col in missing_cols:
  df_users[col] = np.nan

df_users = df_users[sorted(df_users.columns)]

preprocessing

In [264]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0.5,1))
scaled_data = scaler.fit_transform(df_users.values)

df_users = pd.DataFrame(columns=df_users.columns, data=scaled_data,
                        index=df_users.index)

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)


In [265]:
int_split = int(len(df_users.columns)*.8)

df_users_train = df_users.loc[:, :int_split-1]
df_users_test = df_users.loc[:, int_split:]

In [266]:
df_users.head()

product,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.888889,,0.888889,,,0.875,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,0.888889,,,,,,,,,,...,,,,,,,,,,


metrics

In [241]:
def mean_reciprocal_rank(y_test, predicted):
    score = []
    for product in y_test:
        mrr = 1 / (list(predicted).index(product) + 1) if product in predicted else 0
        score.append(mrr)
    return np.mean(score)

In [242]:
from sklearn.metrics import accuracy_score

def evaluate_pred(user_id, y_true, y_pred):
  print('User:', user_id)

  print('y:',y_true), print('pred:',y_pred)

  print('true predicted:', len( set(y_true)&set(y_pred) ))
  print('accuracy: {:.2f}%'.format( accuracy_score(y_true, y_pred)*100 ))
  print('mrr: {:.2f}'.format( mean_reciprocal_rank(y_true, y_pred) ))

#### Content based filtering

In [181]:
prod_feat = df_prods.drop(['name'],axis=1).values
user_id = 9

def get_test_recommendations(user_id):
  train = df_users_train.iloc[user_id].to_frame('y')
  test = df_users_test.iloc[user_id].to_frame('y')

  tmp = test.copy()
  tmp['y'] = np.nan
  train = pd.concat([train,tmp])
  user_prod = train.fillna(0).values.T
  user_prod.shape, prod_feat.shape
  weights = np.dot(user_prod,prod_feat)
  weights = weights / weights.sum()

  pred = np.dot(weights, prod_feat.T)
  test['y_hat'] = pred[0][-len(test):]
  test = test[~np.isnan(test['y'])]
  return test

In [183]:
top_k = 5

for i in range(20):
  if np.sum(df_users_test.iloc[i]) > 0:
    test = get_test_recommendations(i)

    y_true = test['y'].sort_values(ascending=False).index.values[:5]
    y_pred = test['y_hat'].sort_values(ascending=False).index.values[:5]

    evaluate_pred(i, y_true, y_pred),print()

User: 1
y: [8305 8681 8466 8550 8063]
pred: [8509 8305 8063 8550 8681]
true predicted: 4
accuracy: 20.00%
mrr: 0.26

User: 9
y: [7802 8036 8929 8787 8447]
pred: [8303 8388 8265 8036 8245]
true predicted: 1
accuracy: 0.00%
mrr: 0.05

User: 14
y: [9433 8683 9221 8252 8900]
pred: [8681 8900 8691 8683 9437]
true predicted: 2
accuracy: 0.00%
mrr: 0.15

User: 17
y: [9041 7967 8307 7981 9437]
pred: [9193 8636 8900 8691 8077]
true predicted: 0
accuracy: 0.00%
mrr: 0.00



#### Collaborotive filtering

In [267]:
train_ds = df_users_train.stack(dropna=True).reset_index().rename(
    columns={0: 'y', "level_0": "user"})
test_ds = df_users_test.stack(dropna=True).reset_index().rename(
    columns={0: 'y', "level_0": "user"})
test_ds.head()

Unnamed: 0,user,product,y
0,1,8063,0.8125
1,1,8305,1.0
2,1,8376,0.777778
3,1,8466,0.875
4,1,8509,0.5


building and compiling model

In [217]:
embed_size = 50
user_size, prod_size = df_users.shape

# User
x_users_input = keras.layers.Input(shape=[1,])
x_users_embed = keras.layers.Embedding(input_dim=user_size,
                                      output_dim=embed_size)(x_users_input)

x_users = keras.layers.Reshape([embed_size])(x_users_embed)

# Products
x_prods_input = keras.layers.Input(shape=[1,])
x_prods_embed = keras.layers.Embedding(input_dim=prod_size,
                                       output_dim=embed_size)(x_prods_input)

x_prods = keras.layers.Reshape([embed_size])(x_prods_embed)

# Matrix Factorization
xx = keras.layers.Dot(normalize=True, axes=1)([x_users,x_prods])

# Output
output = keras.layers.Dense(1)(xx)

# Compling
model = keras.models.Model(inputs=[x_users_input, x_prods_input],
                           outputs=[output])
model.compile(loss='mean_absolute_error', optimizer='adam',
              metrics=['mean_absolute_percentage_error'])

In [218]:
model.fit([train_ds['user'], train_ds['product']], train_ds['y'],
          epochs=64, validation_split=.3, batch_size=64, shuffle=True)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<keras.src.callbacks.History at 0x7bb426a7f1f0>

In [219]:
test_ds['y_hat'] = model.predict([test_ds['user'], test_ds['product']])
test_ds.head()



Unnamed: 0,user,product,y,y_hat
0,1,8063,0.8125,0.733784
1,1,8305,1.0,0.785927
2,1,8376,0.777778,0.862506
3,1,8466,0.875,0.637438
4,1,8509,0.5,0.819685


In [220]:
top_k = 5

for i in range(20):
  test = test_ds.loc[test_ds['user'] == i]
  if len(test) > 0:

    y_true = test['y'].sort_values(ascending=False).index.values[:5]
    y_pred = test['y_hat'].sort_values(ascending=False).index.values[:5]

    evaluate_pred(i, y_true, y_pred),print()

User: 1
y: [1 6 3 5 0]
pred: [5 2 6 4 1]
true predicted: 3
accuracy: 0.00%
mrr: 0.31

User: 9
y: [ 8 17 34 33 29]
pred: [11 34 27 36 25]
true predicted: 1
accuracy: 0.00%
mrr: 0.10

User: 14
y: [66 55 62 43 60]
pred: [55 40 39 67 65]
true predicted: 1
accuracy: 0.00%
mrr: 0.20

User: 17
y: [127  70  93  73 145]
pred: [148 111  68  97  71]
true predicted: 0
accuracy: 0.00%
mrr: 0.00



hybrid filtering

In [268]:
features = df_prods.drop(['name'],axis=1)

days = df_context['weekday'].unique()
for day in days:
  df_context[str(day)] = df_context['weekday'].apply(lambda x: x==day)

hours = [(0,6), (6,12), (12,18), (18,24)]
for (from_hour, to_hour) in hours:
  df_context[f'{from_hour}-{to_hour}'] = df_context['hour'].apply(lambda x: from_hour <= x < to_hour)

df_context = df_context.drop(['weekday', 'hour'], axis=1)

In [290]:
train_ds = df_users_train.stack(dropna=True).reset_index().rename(
    columns={0: 'y', "level_0": "user"})

train_ds = train_ds.merge(features.astype(float), how="left", left_on='product', right_index=True)
train_ds = train_ds.merge(df_context.astype(float), how='left')

train_ds.head()

Unnamed: 0,user,product,y,very old,old,new,Crime,Horror,Thriller,Fantasy,...,4,1,0,2,3,5,0-6,6-12,12-18,18-24
0,0,0,0.888889,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,2,0.888889,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,5,0.875,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0,43,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,46,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [276]:
context = df_context.drop(['user', 'product'],axis=1).columns

In [285]:
embed_size = 50
feat_size,context_size = len(features.columns),len(context)
user_size, prod_size = df_users.shape

x_users_input = keras.layers.Input(shape=[1,])
x_prods_input = keras.layers.Input(shape=[1,])

################## Collaborotive filtering ##################
# Matrix Factorization
## User
x_users_embed = keras.layers.Embedding(input_dim=user_size,
                                      output_dim=embed_size)(x_users_input)

x_users = keras.layers.Reshape([embed_size])(x_users_embed)

## Products
x_prods_embed = keras.layers.Embedding(input_dim=prod_size,
                                       output_dim=embed_size)(x_prods_input)

x_prods = keras.layers.Reshape([embed_size])(x_prods_embed)

## Matrix Factorization output
matrix_fact = keras.layers.Dot(normalize=True, axes=1)([x_users,x_prods])

# Neural Network
## User
x_users_embed = keras.layers.Embedding(input_dim=user_size,
                                      output_dim=embed_size)(x_users_input)

x_users = keras.layers.Reshape([embed_size])(x_users_embed)

## Products
x_prods_embed = keras.layers.Embedding(input_dim=prod_size,
                                       output_dim=embed_size)(x_prods_input)

x_prods = keras.layers.Reshape([embed_size])(x_prods_embed)

## Neural Network output
concat = keras.layers.Concatenate()([x_users,x_prods])
nn = keras.layers.Dense( int(embed_size/2),activation='relu' )(concat)

################## Content-base filtering ##################
feat_input = keras.layers.Input(shape=[feat_size])
feat_x = keras.layers.Dense(feat_size, activation='relu')(feat_input)

################## Knowledge filtering ##################
context_input = keras.layers.Input(shape=[context_size])
context_x = keras.layers.Dense(context_size, activation='relu')(context_input)

## Output
concat = keras.layers.Concatenate()([matrix_fact, nn, feat_x, context_x])
output = keras.layers.Dense(1)(concat)

# Compling
model = keras.models.Model(inputs=[x_users_input, x_prods_input, feat_input,
                                   context_x], outputs=[output])
model.compile(loss='mean_absolute_error', optimizer='adam',
              metrics=['mean_absolute_percentage_error'])

In [291]:
model.fit([train_ds['user'], train_ds['product'], train_ds[features.columns], train_ds[context]],
          train_ds['y'], epochs=64, validation_split=.3, batch_size=64, shuffle=True)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<keras.src.callbacks.History at 0x7bb426c97430>

In [292]:
test_ds = df_users_test.stack(dropna=True).reset_index().rename(
    columns={0: 'y', "level_0": "user"})

test_ds = test_ds.merge(features.astype(float), how="left", left_on='product', right_index=True)
test_ds = test_ds.merge(df_context.astype(float), how='left')

In [293]:
test_ds['y_hat'] = model.predict([test_ds['user'], test_ds['product'],
                                  test_ds[features.columns], test_ds[context]])
test_ds.head()



Unnamed: 0,user,product,y,very old,old,new,Crime,Horror,Thriller,Fantasy,...,1,0,2,3,5,0-6,6-12,12-18,18-24,y_hat
0,1,8063,0.8125,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.815667
1,1,8305,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.884895
2,1,8376,0.777778,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.768134
3,1,8466,0.875,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.748897
4,1,8509,0.5,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.667557


In [294]:
top_k = 5

for i in range(20):
  test = test_ds.loc[test_ds['user'] == i]
  if len(test) > 0:

    y_true = test['y'].sort_values(ascending=False).index.values[:5]
    y_pred = test['y_hat'].sort_values(ascending=False).index.values[:5]

    evaluate_pred(i, y_true, y_pred),print()

User: 1
y: [1 6 3 5 0]
pred: [6 5 1 7 0]
true predicted: 4
accuracy: 20.00%
mrr: 0.41

User: 9
y: [ 8 17 34 33 29]
pred: [15 19 12 36 20]
true predicted: 0
accuracy: 0.00%
mrr: 0.00

User: 14
y: [66 55 62 43 60]
pred: [62 45 40 43 37]
true predicted: 2
accuracy: 20.00%
mrr: 0.25

User: 17
y: [127  70  93  73 145]
pred: [125 112 145 110 100]
true predicted: 1
accuracy: 0.00%
mrr: 0.07



the amout is pretty the same as with simple base-context filtering, but the order of movies have been changed