In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#
# Plots how the loss and the MAE metric evolve over the epochs,
# for both the training data and the validation (test) data
#
def plot(h,epochs):
    """Draw side-by-side loss and MAE curves from a training history.

    h      -- object whose `history` dict holds the per-epoch lists
              'loss', 'val_loss', 'mae' and 'val_mae'
    epochs -- number of epochs; the history lists are expected to hold
              this many values each
    """
    metrics = ('loss', 'mae')

    # One float row per metric, filled from the history in the order
    # training / validation for each metric.
    train_curves = np.zeros((len(metrics), epochs))
    test_curves = np.zeros((len(metrics), epochs))
    for row, metric in enumerate(metrics):
        train_curves[row] = h.history[metric]
        test_curves[row] = h.history['val_' + metric]

    # Epochs are numbered from 1 on the x axis
    x_axis = range(1, epochs + 1)
    fig, axs = plt.subplots(1, 2, figsize=(17, 5))
    for row, metric in enumerate(metrics):
        panel = axs[row]
        panel.plot(x_axis, train_curves[row], 'b-', label='Training ' + metric)
        panel.plot(x_axis, test_curves[row], 'y-', label='Test ' + metric)
        panel.set_title('Training and test ' + metric)
        panel.set_xlabel('Epochs')
        panel.set_ylabel(metric)
        panel.legend()
        panel.grid(True)
    plt.show()

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Sizes used for the user / item embedding tables built further down
NUM_USERS = 6040
NUM_ITEMS = 3900

# Ratings file and its field separator
FILE_NAME = "../../datasets/movielens-ratings1M.txt" #ML1M
SEPARATOR = ","

# Column positions inside every ratings row
USER, ITEM, RATING = 0, 1, 2

# Whole ratings file as a 2-D array, one row per rating
ratings_frame = pd.read_csv(FILE_NAME, sep=SEPARATOR)
data = np.array(ratings_frame)

# 80/20 split with a fixed seed; both parts are converted to float32
train, test = (
    np.array(part).astype("float32")
    for part in train_test_split(data, test_size=0.2, random_state=50)
)

In [None]:
#
# Running DEEPMF to obtain the embedding weights (both users and items)
#
from keras.models import Model, Sequential
from keras.layers import Embedding, Flatten, Input, Dropout, Dense, Concatenate, Dot
from keras.optimizers import Adam

# Size of every embedding vector; later cells read this name as well.
latent_dim = 5  # 5 neurons in the embedding can adequately code both the user and items

# Item branch: one id in -> embedding lookup -> flat vector of latent_dim values.
# The table has NUM_ITEMS + 1 rows, so ids 0..NUM_ITEMS are valid indices.
movie_input = Input(shape=[1],name='movie-input')
movie_embedding = Embedding(NUM_ITEMS + 1, latent_dim, name='movie-embedding')(movie_input)
movie_vec = Flatten(name='movie-flatten')(movie_embedding)

# User branch, built the same way with NUM_USERS + 1 rows.
user_input = Input(shape=[1],name='user-input')
user_embedding = Embedding(NUM_USERS + 1, latent_dim, name='user-embedding')(user_input)
user_vec = Flatten(name='user-flatten')(user_embedding)

# The predicted rating is the dot product of both vectors
# (the layer is named '...-concat', but it is a Dot layer, not a concatenation).
dot = Dot(axes=1,name='movie-user-concat')([movie_vec, user_vec])

# Inputs are ordered [user, item]; fit() below feeds the columns in that order.
model_deepMF = Model([user_input, movie_input], dot)
model_deepMF.compile(optimizer='adam', metrics=['mae'], loss='mean_squared_error')

model_deepMF.summary()

# Train on the training split, validate on the test split, then plot both curves.
EPOCHS = 10
history_deepMF = model_deepMF.fit([train[:,USER],train[:,ITEM]],train[:,RATING], 
                    validation_data=([test[:,USER],test[:,ITEM]], test[:,RATING]), 
                    epochs=EPOCHS, verbose=1)
plot(history_deepMF,EPOCHS)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 movie-input (InputLayer)       [(None, 1)]          0           []                               
                                                                                                  
 user-input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 movie-embedding (Embedding)    (None, 1, 5)         19505       ['movie-input[0][0]']            
                                                                                                  
 user-embedding (Embedding)     (None, 1, 5)         30205       ['user-input[0][0]']             
                                                                                              

In [None]:
# Persist the trained DeepMF model to disk
model_deepMF.save('ModelDeepMF.h5')

In [None]:
from keras.models import Model

# Two sub-models cut out of model_deepMF: each maps an id straight to the
# output of its (already trained) embedding layer
model_user_embeddings = Model(inputs=user_input, outputs=user_embedding)
model_movie_embeddings = Model(inputs=movie_input, outputs=movie_embedding)

# Embedding vectors for every user id 0..NUM_USERS and item id 0..NUM_ITEMS
all_user_ids = np.arange(NUM_USERS + 1)
all_movie_ids = np.arange(NUM_ITEMS + 1)
user_embeddings = model_user_embeddings.predict(all_user_ids)
movie_embeddings = model_movie_embeddings.predict(all_movie_ids)
    

In [None]:
import random

#
# creates the dataset of real samples: <user embedding, item embedding, rating>
#
def get_dataset(data):
    """Build the matrix of real samples fed to the WGAN.

    Each output row is: user embedding (latent_dim values), item embedding
    (latent_dim values), normalized rating.

    data -- 2-D array-like of ratings; the USER, ITEM and RATING columns hold
            the user id, the item id and the rating of each sample.

    Returns a float64 array of shape (len(data), latent_dim * 2 + 1), with
    exactly one row per input sample (no padding row).

    Relies on the notebook-level `user_embeddings` / `movie_embeddings` arrays,
    indexed as [id][0] to obtain the embedding vector of an id.
    """
    data = np.asarray(data)
    num_samples = len(data)
    embedding_dataset = np.zeros((num_samples, latent_dim * 2 + 1))
    if num_samples == 0:
        return embedding_dataset

    # Ids may arrive as floats; truncate them to integer indices
    user_ids = data[:, USER].astype(int)
    item_ids = data[:, ITEM].astype(int)

    # Look up all the embeddings at once instead of row by row
    embedding_dataset[:, :latent_dim] = user_embeddings[user_ids, 0]
    embedding_dataset[:, latent_dim:-1] = movie_embeddings[item_ids, 0]

    # add ratings, normalized from 1..5 to -2/5..2/5
    embedding_dataset[:, -1] = (data[:, RATING].astype(np.float64) - 3.) / 5.
    return embedding_dataset

# Real-sample matrix built from the complete ratings array `data`;
# each row is: user embedding | item embedding | normalized rating
embedding_dataset = get_dataset(data)

In [None]:
# Show (rows, columns) of the real-sample matrix; the column count is latent_dim * 2 + 1
embedding_dataset.shape

In [None]:
#
# WGAN to create the fake samples.
# Both the generator and the discriminator (critic) models are really small because the source samples are
# not large, sparse vectors; they are small and dense: 5 real numbers to code the user, 5 real numbers to
# code the item and one real number to code the normalized rating.
#

from keras.datasets import mnist
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D
from keras.models import Sequential, Model
from keras.optimizers import RMSprop

import keras.backend as K


class WGAN():
    """Small Wasserstein GAN that generates <user embedding, item embedding, rating> samples.

    Every sample (real or generated) is a dense vector of ``latent_dim * 2 + 1``
    values. The critic is trained ``n_critic`` times per generator update and
    its weights are clipped to ``[-clip_value, clip_value]`` after each step.
    """

    def __init__(self):
        self.latent_dim = latent_dim   # embedding size, read from the notebook-level variable
        self.noise_dim = 100           # length of the generator's input noise vector

        # Following parameters set as recommended in paper
        self.n_critic = 5
        self.clip_value = 0.01

        # Build and compile the critic
        self.critic = self.build_critic()
        self.critic.compile(loss=self.wasserstein_loss,
            optimizer=self._new_optimizer(),
            metrics=['accuracy'])

        # Build the generator
        self.generator = self.build_generator()

        # The generator takes noise as input and generates samples
        z = Input(shape=(self.noise_dim,))
        fake_sample = self.generator(z)

        # For the combined model we will only train the generator
        self.critic.trainable = False

        # The critic takes generated samples as input and determines validity
        valid = self.critic(fake_sample)

        # The combined model (stacked generator and critic). It gets its own
        # optimizer instance: the two models update different sets of weights.
        self.combined = Model(z, valid)
        self.combined.compile(loss=self.wasserstein_loss,
            optimizer=self._new_optimizer(),
            metrics=['accuracy'])

    @staticmethod
    def _new_optimizer():
        """Return a fresh RMSprop optimizer with the learning rate used for both models."""
        return RMSprop(learning_rate=0.00005)

    def wasserstein_loss(self, y_true, y_pred):
        """Mean of label * critic output; train() uses -1 for real and +1 for fake labels."""
        return K.mean(y_true * y_pred)

    def build_generator(self):
        """Return the generator: noise of size noise_dim -> sample of latent_dim*2 + 1 values."""

        model = Sequential()

        model.add(Dense(10, input_dim=self.noise_dim))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(20))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dropout(0.2))
        # Linear output: generated embeddings and ratings are unbounded real numbers
        model.add(Dense(self.latent_dim*2+1, activation='linear'))

        model.summary()

        noise = Input(shape=(self.noise_dim,))
        fake_sample = model(noise)

        return Model(noise, fake_sample)

    def build_critic(self):
        """Return the critic: sample of latent_dim*2 + 1 values -> single score."""

        model = Sequential()

        model.add(Dense(4, input_dim=self.latent_dim*2 + 1))
        model.add(LeakyReLU(alpha=0.2))
        # NOTE(review): the sigmoid bounds the critic score to (0, 1); WGAN critics
        # normally use an unbounded (linear) output - confirm this is intentional.
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        sample = Input(shape=(self.latent_dim*2 + 1,))
        validity = model(sample)

        return Model(sample, validity)

    def train(self, dataset, epochs, batch_size=128, sample_interval=50, num_training_samples=1000):
        """Alternate n_critic critic updates with one generator update per epoch.

        dataset              -- 2-D array of real samples, one per row
        epochs               -- number of generator updates
        batch_size           -- samples per critic / generator batch
        sample_interval      -- progress is printed every this many epochs
        num_training_samples -- real batches are drawn (with replacement) from
                                the first `num_training_samples` rows; values
                                larger than len(dataset) are capped to it

        Raises ValueError if n_critic is smaller than 1.
        """
        if self.n_critic < 1:
            raise ValueError("n_critic must be at least 1")

        # Never draw an index beyond the end of the dataset
        num_training_samples = min(num_training_samples, len(dataset))

        # Adversarial ground truths
        valid = -np.ones((batch_size, 1))
        fake = np.ones((batch_size, 1))

        for epoch in range(epochs):

            for _ in range(self.n_critic):

                # ---------------------
                #  Train Discriminator
                # ---------------------

                # Select a random batch of votes
                idx = np.random.randint(0, num_training_samples, batch_size)
                real_samples = dataset[idx]

                # Sample noise as generator input
                noise = np.random.normal(0, 1, (batch_size, self.noise_dim))

                # Generate a batch of new votes (verbose=0: no progress bar per call)
                fake_samples = self.generator.predict(noise, verbose=0)

                # Train the critic
                d_loss_real = self.critic.train_on_batch(real_samples, valid)
                d_loss_fake = self.critic.train_on_batch(fake_samples, fake)
                d_loss = 0.5 * np.add(d_loss_fake, d_loss_real)

                # Clip critic weights
                for l in self.critic.layers:
                    weights = l.get_weights()
                    weights = [np.clip(w, -self.clip_value, self.clip_value) for w in weights]
                    l.set_weights(weights)


            # ---------------------
            #  Train Generator
            # ---------------------

            # Reuses the noise batch of the last critic iteration
            g_loss = self.combined.train_on_batch(noise, valid)

            # Plot the progress
            if epoch % sample_interval == 0:
                print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss[0]))
                
if __name__ == '__main__':
    # Train the WGAN on every real sample, then save only the generator
    wgan = WGAN()
    wgan.train(
        embedding_dataset,
        epochs=20000,
        batch_size=64,
        sample_interval=200,
        num_training_samples=len(embedding_dataset),
    )
    wgan.generator.save('WGANRS.h5')




800 [D loss: -0.000203, acc.: 48.44%] [G loss: -0.500286]






1000 [D loss: -0.000171, acc.: 49.22%] [G loss: -0.500349]






1200 [D loss: -0.000142, acc.: 49.22%] [G loss: -0.500315]


2200 [D loss: -0.000028, acc.: 50.00%] [G loss: -0.500175]








2400 [D loss: -0.000023, acc.: 48.44%] [G loss: -0.500148]






2600 [D loss: -0.000022, acc.: 46.88%] [G loss: -0.500166]






2800 [D loss: -0.000006, acc.: 48.44%] [G loss: -0.500120]






3000 [D loss: -0.000005, acc.: 46.88%] [G loss: -0.500044]






3200 [D loss: 0.000004, acc.: 50.00%] [G loss: -0.500082]






3400 [D loss: -0.000001, acc.: 50.00%] [G loss: -0.500075]






3600 [D loss: 0.000003, acc.: 48.44%] [G loss: -0.500062]






3800 [D loss: 0.000003, acc.: 49.22%] [G loss: -0.500055]






4000 [D loss: -0.000001, acc.: 50.00%] [G loss: -0.500059]






4200 [D loss: 0.000001, acc.: 50.00%] [G loss: -0.500052]




4400 [D loss: 0.000001, acc.: 50.00%] [G loss: -0.500064]






4600 [D loss: 0.000003, acc.: 50.00%] [G loss: -0.500088]








4800 [D loss: 0.000004, acc.: 50.00%] [G loss: -0.500056]






5000 [D loss: -0.000000, acc.: 50.00%] [G loss: -0.500070]






5200 [D loss: 0.000004, acc.: 50.00%] [G loss: -0.500085]






5400 [D loss: 0.000002, acc.: 49.22%] [G loss: -0.500050]






5600 [D loss: 0.000002, acc.: 50.00%] [G loss: -0.500055]






5800 [D loss: 0.000003, acc.: 50.00%] [G loss: -0.500052]








6000 [D loss: 0.000005, acc.: 50.00%] [G loss: -0.500051]






6200 [D loss: 0.000005, acc.: 50.00%] [G loss: -0.500044]






6400 [D loss: 0.000006, acc.: 50.00%] [G loss: -0.500046]






6600 [D loss: 0.000006, acc.: 50.00%] [G loss: -0.500038]








6800 [D loss: 0.000006, acc.: 50.00%] [G loss: -0.500032]






7000 [D loss: 0.000006, acc.: 50.00%] [G loss: -0.500036]






7200 [D loss: 0.000005, acc.: 50.00%] [G loss: -0.500036]






7400 [D loss: 0.000005, acc.: 50.00%] [G loss: -0.500040]






7600 [D loss: 0.000006, acc.: 50.00%] [G loss: -0.500034]








7800 [D loss: 0.000007, acc.: 50.00%] [G loss: -0.500034]






8000 [D loss: 0.000006, acc.: 50.00%] [G loss: -0.500029]








8200 [D loss: 0.000006, acc.: 50.00%] [G loss: -0.500027]






8400 [D loss: 0.000006, acc.: 50.00%] [G loss: -0.500028]






8600 [D loss: 0.000006, acc.: 50.00%] [G loss: -0.500029]








In [None]:
from keras.models import load_model
import statistics
import numpy as np

############
# creates augmented samples in the format:
# embedding user, embedding item and rating vectors

FAKE_SAMPLES = 200000  # number of generated fake samples

def get_augmented(model, num_fake_samples, std_dev=1, noise_dim=100):
    """Generate fake samples by feeding Gaussian noise to a generator model.

    model            -- object with a `predict(noise)` method (the saved generator)
    num_fake_samples -- number of samples to generate
    std_dev          -- standard deviation of the zero-mean input noise; it sets
                        the variability of the fake samples (usually in the
                        range [0.2..2.0])
    noise_dim        -- length of each noise vector; must match the generator
                        input size (the WGAN above uses noise_dim = 100)

    Returns whatever `model.predict` returns for the noise batch.
    """
    noise = np.random.normal(0, std_dev, (num_fake_samples, noise_dim))
    return model.predict(noise)
 
# Generate the synthetic <user embedding, item embedding, rating> samples
augmented = get_augmented(load_model('WGANRS.h5'), FAKE_SAMPLES)

# Normalizes the ratings distribution
y = augmented[:,-1]   # view on the rating column of the generated samples

# Mean and sample standard deviation (ddof=1) of the generated ratings.
# NumPy reductions replace statistics.mean/stdev, which iterate in pure Python.
mu_aug = float(np.mean(y, dtype=np.float64))
de_aug = float(np.std(y, dtype=np.float64, ddof=1))

# Make sure the real samples form a 2-D matrix with latent_dim * 2 + 1 columns
embedding_dataset = embedding_dataset.reshape(-1, latent_dim * 2 + 1)
mu = float(np.mean(embedding_dataset[:,-1], dtype=np.float64))
de = float(np.std(embedding_dataset[:,-1], dtype=np.float64, ddof=1))
print(mu, de)

# Rescale the generated ratings so that their mean matches the real mean
augmented[:,-1] *= mu / mu_aug
mu = float(np.mean(augmented[:,-1], dtype=np.float64))
de = float(np.std(augmented[:,-1], dtype=np.float64, ddof=1))
print(mu, de)

In [None]:
from sklearn.cluster import KMeans

# We can create 'families of datasets' containing combinations of number of users and items
# This sklearn kMeans process can be slow when the number of users or items is high.
testing_users = [500,1000]
testing_items = [500,1000]

# The loop variables are deliberately not called `K`: that name is the
# keras.backend alias which WGAN.wasserstein_loss looks up when it is called.

# Cluster the generated user embeddings (first latent_dim columns) into synthetic user ids
for num_user_clusters in testing_users:
    clustering_users = KMeans(n_clusters=num_user_clusters, n_init=1, max_iter= 100, verbose=1)
    cluster_users = clustering_users.fit_predict(augmented[:FAKE_SAMPLES,:latent_dim])
    np.save('W_cluster_users_' + str(FAKE_SAMPLES) + '_' + str(num_user_clusters), cluster_users)
    print("end cluster users: " + str(num_user_clusters))

# Cluster the generated item embeddings (columns latent_dim up to the rating) into synthetic item ids
for num_item_clusters in testing_items:
    clustering_items = KMeans(n_clusters=num_item_clusters, n_init=1, max_iter= 100, verbose=1)
    cluster_items = clustering_items.fit_predict(augmented[:FAKE_SAMPLES,latent_dim:-1])
    np.save('W_cluster_items_' + str(FAKE_SAMPLES) + '_' + str(num_item_clusters), cluster_items)
    print("end cluster items: " + str(num_item_clusters))

In [None]:
#
# Writing synthetic datasets
#

# The ratings do not depend on the clustering, so they are computed once:
# from normalized [-2/5..2/5] back to [1..5], rounded (half to even) and
# clamped to the valid 1..5 range.
synthetic_ratings = augmented[:FAKE_SAMPLES, -1].astype(np.float64) * 5. + 3.
synthetic_ratings = np.clip(np.rint(synthetic_ratings), 1, 5).astype(int)

for Ku in testing_users:
    cluster_users = np.load('W_cluster_users_' + str(FAKE_SAMPLES) + '_' + str(Ku)+'.npy')
    for Ki in testing_items:
        cluster_items = np.load('W_cluster_items_' + str(FAKE_SAMPLES) + '_' + str(Ki)+'.npy')
        file_name = 'W_synthetic_'+str(FAKE_SAMPLES)+'_'+str(Ku)+'_'+str(Ki)+'.txt'
        # `with` closes the file even if writing fails half-way
        with open(file_name, 'w') as f:
            f.write('user,item,rating\n')
            for i in range(FAKE_SAMPLES):
                us = str(int(cluster_users[i]))
                it = str(int(cluster_items[i]))
                ra = str(int(synthetic_ratings[i]))
                f.write(us + "," + it + "," + ra + "\n")
        print("synthetic dataset " + str(Ku) + ', ' + str(Ki) + " has been created")
      

In [None]:
#
# We remove duplicated samples and also samples where the same user votes more than 2 ratings to the same item
#

SEPARATOR = ","
SYNTHETIC_COLUMNS = ['user','item','rating']
sizes = np.zeros((len(testing_users),len(testing_items)))
for i,Ku in enumerate(testing_users):
    for j,Ki in enumerate(testing_items):
        file = 'W_synthetic_'+str(FAKE_SAMPLES)+'_'+str(Ku)+'_'+str(Ki)+'.txt'

        # header=0: the first line of the file holds the column names.
        # (header=1 would use the first data row as header and silently drop it.)
        # The frame is kept in `df` so the global `data` ratings array is not overwritten.
        df = pd.read_csv(file, sep=SEPARATOR, header=0, names=SYNTHETIC_COLUMNS)
        df = df.drop_duplicates()

        # remove samples where the same user votes more than 2 ratings to the same item
        votes_per_pair = df.groupby(['user','item'])['rating'].transform('size')
        df = df[votes_per_pair < 3]

        sizes[i][j] = len(df)
        print(file, len(df))
        df.to_csv(file, index=False, header=SYNTHETIC_COLUMNS)
        np.save('W_synthetic_'+str(FAKE_SAMPLES)+'_SIZES', sizes)
