<a href="https://colab.research.google.com/github/jnetoass/autoencoder/blob/main/teste_sysrec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
# Download the MovieLens 25M archive from
# http://files.grouplens.org/datasets/movielens/ml-25m.zip
# and load its ratings.csv file.
movielens_data_file_url = (
    "http://files.grouplens.org/datasets/movielens/ml-25m.zip"
)
# The returned value is used below as the local path of the downloaded archive.
movielens_zipped_file = keras.utils.get_file(
    "ml-25m.zip", movielens_data_file_url, extract=False
)
keras_datasets_path = Path(movielens_zipped_file).parents[0]
movielens_dir = keras_datasets_path / "ml-25m"

# Only extract the data the first time the script is run.
if not movielens_dir.exists():
    # Bound to `archive` (not `zip`) so the built-in zip() is not shadowed
    # for every later cell of the notebook.
    with ZipFile(movielens_zipped_file, "r") as archive:
        print("Extracting all the files now...")
        archive.extractall(path=keras_datasets_path)
        print("Done!")

ratings_file = movielens_dir / "ratings.csv"
ratings = pd.read_csv(ratings_file)

Downloading data from http://files.grouplens.org/datasets/movielens/ml-25m.zip


Exception: ignored

In [None]:
# Movie metadata, keyed by movieId straight from the CSV reader.
# Note: movieId is the index here, not a regular column.
movies_file = movielens_dir / "movies.csv"
movies = pd.read_csv(movies_file, index_col="movieId")

In [None]:
# Preview the movie metadata (call ratings.head() instead to inspect the ratings).
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [None]:
# How many of the most-rated movies are kept; this becomes the number of
# columns of the preference matrix and the width of the autoencoder.
N_POPULAR_MOVIES = 1000

# Number of ratings received by each movie, most-rated first.
rating_counts = ratings.groupby("movieId")["rating"].count().sort_values(ascending=False)

# Keep only the ratings given to the N_POPULAR_MOVIES most-rated movies.
popular_movie_ids = rating_counts.index[:N_POPULAR_MOVIES]
pop_ratings = ratings[ratings["movieId"].isin(popular_movie_ids)]
pop_ratings = pop_ratings.set_index(["movieId", "userId"])

In [None]:
# Start from the raw ratings of the popular movies, indexed by (movieId, userId).
prefs = pop_ratings["rating"]

# 1) Remove the global mean rating.
mean_0 = pop_ratings["rating"].mean()
prefs = prefs - mean_0

# 2) Remove each movie's mean of the globally-centred ratings.
#    The subtraction relies on pandas aligning mean_i on the movieId index level.
mean_i = prefs.groupby("movieId").mean()
prefs = prefs - mean_i

# 3) Remove each user's mean of what is left, aligned on the userId index level.
mean_u = prefs.groupby("userId").mean()
prefs = prefs - mean_u

In [None]:
# One row per user, one column per movie; user/movie pairs without a rating
# come out of the pivot as NaN and are filled with 0.
pref_matrix = (
    prefs.reset_index()
    .loc[:, ["userId", "movieId", "rating"]]
    .pivot(index="userId", columns="movieId", values="rating")
    .fillna(0)
)

In [None]:
# (number of users, number of movies kept)
pref_matrix.shape

(162506, 1000)

In [None]:
# Peek at the centred preferences of the first users.
pref_matrix.head()

movieId,1,2,3,5,6,7,10,11,16,17,...,122920,122922,134130,134853,139385,148626,152081,164179,166528,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.380532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.104022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.433894,0.037345,-0.24373,-0.142715,-0.005457,-0.237414,0.0,-0.237825,0.008664,-0.104883
4,-0.688701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.171591,0.0,1.409864,0.177907,0.81136,1.177496,0.423985,0.0
5,-0.21981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#from keras.optimizers import Adam
from keras.layers import Input, Dense, Dropout
from keras.models import Model

def autoEncoder(X, hidden_units=512, latent_units=256, dropout_rate=0.8):
    '''
    Build an (uncompiled) dense autoencoder for collaborative filtering.

    Layer stack: Input -> Dense(hidden_units) -> Dense(latent_units) -> Dropout
    -> Dense(hidden_units) -> Dense(n_items, linear). The hidden Dense layers
    use the SELU activation.

    The layers come from the `tensorflow.keras` modules imported at the top of
    the notebook (`keras`, `layers`), so the model is built with the same Keras
    implementation as the `tf.keras` optimizer it is compiled with.

    Parameters
    ----------
    X : object with a 2-D ``shape``
        Only ``X.shape[1]`` is read; it sets the input and output width.
    hidden_units : int, default 512
        Width of the encoder and decoder hidden layers.
    latent_units : int, default 256
        Width of the latent (bottleneck) layer.
    dropout_rate : float, default 0.8
        Dropout rate applied to the latent representation.

    Returns
    -------
    keras.Model
        Model that maps an input row to its reconstruction.
    '''
    n_items = X.shape[1]

    # Input
    input_layer = layers.Input(shape=(n_items,), name='UserScore')

    # Encoder
    enc = layers.Dense(hidden_units, activation='selu', name='EncLayer1')(input_layer)

    # Latent space, regularised with dropout
    lat_space = layers.Dense(latent_units, activation='selu', name='LatentSpace')(enc)
    lat_space = layers.Dropout(dropout_rate, name='Dropout')(lat_space)

    # Decoder
    dec = layers.Dense(hidden_units, activation='selu', name='DecLayer1')(lat_space)

    # Output: linear activation, one unit per input column
    output_layer = layers.Dense(n_items, activation='linear', name='UserScorePred')(dec)

    return keras.Model(input_layer, output_layer)

In [None]:
# The autoencoder is trained to reproduce its own input,
# so the training target is the input array itself.
X = pref_matrix.values
y = X

In [None]:
# Build the autoencoder and compile it with Adam and an MSE reconstruction loss
model = autoEncoder(X)

adam = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(loss='mse', optimizer=adam)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 UserScore (InputLayer)      [(None, 1000)]            0         
                                                                 
 EncLayer1 (Dense)           (None, 512)               512512    
                                                                 
 LatentSpace (Dense)         (None, 256)               131328    
                                                                 
 Dropout (Dropout)           (None, 256)               0         
                                                                 
 DecLayer1 (Dense)           (None, 512)               131584    
                                                                 
 UserScorePred (Dense)       (None, 1000)              513000    
                                                                 
Total params: 1,288,424
Trainable params: 1,288,424
Non-train

In [None]:
# Train the model to reconstruct X; validation_split=0.1 asks Keras to hold
# out a tenth of the rows for validation.
training_options = dict(epochs=50, batch_size=64, shuffle=True, validation_split=0.1)
hist = model.fit(X, y, **training_options)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Predict a score for every user/movie pair, then multiply by the (X == 0) mask so
# that entries with a non-zero input (movies the user already rated) become 0.
# Unrated pairs were filled with 0 when the matrix was built, so they are kept.
# NOTE(review): a rated movie whose centred score is exactly 0 is not masked.
new_matrix = model.predict(X) * (X == 0)

In [None]:
# Wrap the masked predictions in a DataFrame carrying the same
# user (row) and movie (column) labels as pref_matrix
new_users_items_matrix_df = pd.DataFrame(
    data=new_matrix,
    index=pref_matrix.index,
    columns=pref_matrix.columns,
)
new_users_items_matrix_df.head()

movieId,1,2,3,5,6,7,10,11,16,17,...,122920,122922,134130,134853,139385,148626,152081,164179,166528,168252
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.009981,-0.021267,-0.014194,-0.015353,0.007553,-0.007167,0.003896,-0.005203,-0.006583,-0.013538,...,0.003436,-0.0018,-0.021413,-0.002874,-0.000606,0.001794,-0.003009,0.000645,-0.00643,0.008633
2,-0.0,-0.040883,-0.013083,-0.06904,-0.000499,0.013837,0.030529,-0.008762,-0.08293,-0.030698,...,-0.014069,0.022334,0.000792,-0.002039,-0.019663,-0.026223,-0.006565,-0.005225,0.040666,0.003843
3,-0.0,0.004231,-0.008817,-0.034579,0.079124,-0.005048,0.035639,-0.020518,0.053058,0.001062,...,-0.0,-0.0,-0.0,-0.0,0.0,0.0,-0.024041,-0.0,-0.0,-0.0
4,-0.0,0.056871,-0.006633,0.00602,0.023553,-0.00079,0.020553,-0.007082,0.010767,-0.01837,...,-0.105335,-0.090593,-0.0,0.027064,0.0,0.0,-0.0,0.0,-0.0,0.020686
5,-0.0,-0.002365,0.023309,0.000195,0.008466,0.011858,0.052462,0.021215,-0.00681,-0.055897,...,0.001764,0.006238,-0.008348,-0.002615,-0.010629,0.007316,-0.005715,0.002272,0.014742,0.011931


In [None]:
def recommender_for_user(user_id, interact_matrix, df_content, topn = 10):
    '''
    Return the highest-scoring movies for one user.

    Parameters
    ----------
    user_id : label of the user's row in ``interact_matrix``.
    interact_matrix : pandas.DataFrame with one row per user and one column
        per movieId; its values are the scores to rank by.
    df_content : pandas.DataFrame with a ``title`` column, keyed by
        ``movieId`` either as a regular column or as its named index.
    topn : int, maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame indexed by movieId with columns ``score`` and ``title``,
    sorted by descending score. Only strictly positive scores are kept, so
    fewer than ``topn`` rows may be returned.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row of ``interact_matrix`` or ``df_content``
        exposes ``movieId`` neither as a column nor as its index.
    '''
    pred_scores = interact_matrix.loc[user_id].values

    # Movie ids come from the matrix that was passed in, not from a
    # notebook-level global, so scores and ids always line up.
    df_scores   = pd.DataFrame({'movieId': list(interact_matrix.columns),
                                'score': pred_scores})

    # Accept the metadata with movieId as a column or already as the index.
    if 'movieId' in df_content.columns:
        content_by_movie = df_content.set_index('movieId')
    elif df_content.index.name == 'movieId':
        content_by_movie = df_content
    else:
        raise KeyError('movieId')

    df_rec      = df_scores.set_index('movieId')\
                    .join(content_by_movie)\
                    .sort_values('score', ascending=False)\
                    .head(topn)[['score', 'title']]

    return df_rec[df_rec.score > 0]

In [None]:
# Observed (centred) scores of user 3500, one value per movie column.
pred_scores = pref_matrix.loc[3500].values

In [None]:
# Pair every movie id (column label of the matrix) with the user's score for it.
df_scores   = pd.DataFrame({'movieId': list(pref_matrix.columns), 
                               'score': pred_scores})

In [None]:
# Inspect the movieId/score table.
df_scores

Unnamed: 0,movieId,score
0,1,0.0
1,2,0.0
2,3,0.0
3,5,0.0
4,6,0.0
...,...,...
995,148626,0.0
996,152081,0.0
997,164179,0.0
998,166528,0.0


In [None]:
# `movies` may already be indexed by movieId (it is when loaded), in which case
# movies.set_index('movieId') raises KeyError. Only index it when movieId is
# still a regular column.
movies_by_id = movies.set_index('movieId') if 'movieId' in movies.columns else movies

# Top 10 movies for the user: join scores with titles on movieId, best first.
df_rec      = df_scores.set_index('movieId')\
                    .join(movies_by_id)\
                    .sort_values('score', ascending=False)\
                    .head(10)[['score', 'title']]

KeyError: ignored

In [None]:
# Index the scores by movieId so they can be joined with the movie metadata.
df_rec = df_scores.set_index('movieId')

In [None]:
# Expose movieId as a regular column for the joins that follow.
# Rebinding instead of reset_index(inplace=True), guarded on the column, keeps
# this cell idempotent: re-running it does not insert extra index columns.
if "movieId" not in movies.columns:
    movies = movies.reset_index()

In [None]:
# Attach the movie metadata to the scores; requires movieId to be a regular
# column of `movies` (set_index raises KeyError otherwise).
df2 = df_rec.join(movies.set_index('movieId'))

In [None]:
# Ten highest observed scores of user 3500, with titles.
df2.sort_values('score', ascending=False).head(10)[['score', 'title']]

Unnamed: 0_level_0,score,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1747,1.189392,Wag the Dog (1997)
3052,1.084961,Dogma (1999)
1097,0.977203,E.T. the Extra-Terrestrial (1982)
2908,0.916576,Boys Don't Cry (1999)
1210,0.713769,Star Wars: Episode VI - Return of the Jedi (1983)
2717,0.678711,Ghostbusters II (1989)
1250,0.615142,"Bridge on the River Kwai, The (1957)"
260,0.590093,Star Wars: Episode IV - A New Hope (1977)
3000,0.572188,Princess Mononoke (Mononoke-hime) (1997)
3081,0.186354,Sleepy Hollow (1999)


In [None]:
# Movies user 3500 has already rated: the scores come from the observed
# (centred) preference matrix, not from the model's predictions.
recommender_for_user(
    user_id         = 3500, 
    interact_matrix = pref_matrix, 
    df_content      = movies)

KeyError: ignored

# predição

In [None]:
# Model-predicted scores for user 3500, taken from the masked prediction matrix.
pred_scores = new_users_items_matrix_df.loc[3500].values

In [None]:
# Pair every movie id (column label) with the predicted score.
df_scores   = pd.DataFrame({'movieId': list(new_users_items_matrix_df.columns), 
                               'score': pred_scores})

In [None]:
# Index the predicted scores by movieId for the join with the movie metadata.
df_rec = df_scores.set_index('movieId')

In [None]:
# Attach the movie metadata; requires movieId to be a regular column of `movies`.
df2 = df_rec.join(movies.set_index('movieId'))

In [None]:
# Ten highest predicted scores for user 3500, with titles.
df2.sort_values('score', ascending=False).head(10)[['score', 'title']]

Unnamed: 0_level_0,score,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1196,0.191308,Star Wars: Episode V - The Empire Strikes Back...
2012,0.053567,Back to the Future Part III (1990)
5378,0.052292,Star Wars: Episode II - Attack of the Clones (...
595,0.051113,Beauty and the Beast (1991)
5952,0.048605,"Lord of the Rings: The Two Towers, The (2002)"
4896,0.048286,Harry Potter and the Sorcerer's Stone (a.k.a. ...
2628,0.047056,Star Wars: Episode I - The Phantom Menace (1999)
2115,0.043287,Indiana Jones and the Temple of Doom (1984)
2011,0.043203,Back to the Future Part II (1989)
736,0.042157,Twister (1996)


# Predição 2

In [None]:
# Observed (centred) scores of user 832, one value per movie column.
pred_scores = pref_matrix.loc[832].values

In [None]:
# Pair every movie id (column label) with the user's observed score.
df_scores   = pd.DataFrame({'movieId': list(pref_matrix.columns), 
                               'score': pred_scores})

In [None]:
# Index the observed scores by movieId for the join with the movie metadata.
df_rec = df_scores.set_index('movieId')

In [None]:
# Attach the movie metadata; requires movieId to be a regular column of `movies`.
df2 = df_rec.join(movies.set_index('movieId'))

In [None]:
# Ten highest observed scores of user 832, with titles.
df2.sort_values('score', ascending=False).head(10)[['score', 'title']]

Unnamed: 0_level_0,score,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
44555,1.279257,"Lives of Others, The (Das leben der Anderen) (..."
115713,1.010295,Ex Machina (2015)
307,0.998241,Three Colors: Blue (Trois couleurs: Bleu) (1993)
148626,0.95255,"Big Short, The (2015)"
1230,0.937197,Annie Hall (1977)
306,0.906683,Three Colors: Red (Trois couleurs: Rouge) (1994)
1094,0.828775,"Crying Game, The (1992)"
8949,0.825037,Sideways (2004)
300,0.815861,Quiz Show (1994)
25,0.802345,Leaving Las Vegas (1995)


In [None]:
# Model-predicted scores for user 832, taken from the masked prediction matrix.
pred_scores = new_users_items_matrix_df.loc[832].values

In [None]:
# Pair every movie id (column label) with the predicted score.
df_scores   = pd.DataFrame({'movieId': list(new_users_items_matrix_df.columns), 
                               'score': pred_scores})

In [None]:
# Index the predicted scores by movieId for the join with the movie metadata.
df_rec = df_scores.set_index('movieId')

In [None]:
# Attach the movie metadata; requires movieId to be a regular column of `movies`.
df2 = df_rec.join(movies.set_index('movieId'))

In [None]:
# Ten highest predicted scores for user 832, with titles.
df2.sort_values('score', ascending=False).head(10)[['score', 'title']]

Unnamed: 0_level_0,score,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
924,0.157019,2001: A Space Odyssey (1968)
1208,0.152243,Apocalypse Now (1979)
923,0.14337,Citizen Kane (1941)
1183,0.086967,"English Patient, The (1996)"
903,0.074676,Vertigo (1958)
1196,0.073824,Star Wars: Episode V - The Empire Strikes Back...
7361,0.073451,Eternal Sunshine of the Spotless Mind (2004)
541,0.07216,Blade Runner (1982)
235,0.07116,Ed Wood (1994)
2599,0.070282,Election (1999)
