# CNN for recommendation

In [None]:
from keras.layers import Dense, Activation, Input, MaxPooling2D, Embedding, Conv2D, Reshape, Permute
from keras.models import load_model, Model, Sequential
from keras.utils import Sequence
from keras.utils import plot_model, model_to_dot

import matplotlib.pyplot as plt
import csv
import math
import numpy as np
# from keras_sequential_ascii import keras2ascii

The goal of this notebook is to leverage the geometric representation power of CNNs to achieve better recommendation performance on a series of benchmarks.

In [None]:
from google.colab import drive # Drive must be mounted so that the CSV files stored there can be read
drive.mount('/content/drive')

In [None]:
# pip install git+git://github.com/stared/keras-sequential-ascii.git

In [None]:
# Number of values in the dictionary, plus one extra category (the empty value / <EOS>)
d = 14370 + 1
Tx = 64    # input size
Ty = 16    # output size
n_e = 256  # embedding dimension
m = 256    # mini-batch size

# Conv2D settings (number of filters, kernel size), as used by the commented-out CNNmodel
n_f_1 = 32
n_k_1 = 3
n_f_2 = 64
n_k_2 = 5
n_f_3 = 128
n_k_3 = 7

# Max-pooling sizes that follow each of the three convolutions in CNNmodel
n_mp_1 = 2
n_mp_2 = 4
n_mp_3 = 8

# Kernel width of the last (1, n_k_4) convolution in CNNmodel
n_k_4 = 32

# Fully-connected network settings
n_fc = 512

In [None]:
# def CNNmodel():
#   inpt = Input(shape=(Tx,))
#   # Embedding
#   embedding = Embedding(input_dim=d , output_dim=n_e, input_length=Tx)
#   # First layer: embed the input sequence to get a (Tx, n_e) picture
#   x = embedding(inpt)
#   x = Reshape(target_shape=(Tx, n_e , 1))(x)

#   # A Conv2D network
#   x = Conv2D(filters=n_f_1 , kernel_size= n_k_1, padding = "same" , activation="relu")(x)
#   x = MaxPooling2D(pool_size=(n_mp_1 , 1) , padding="same")(x)
#   x = Conv2D(filters=n_f_2 , kernel_size= n_k_2, padding = "same", activation="relu")(x)
#   x = MaxPooling2D(pool_size=(n_mp_2 , 1) , padding="same")(x)
#   x = Conv2D(filters=n_f_3 , kernel_size= n_k_3, padding = "same", activation="relu")(x)
#   x = MaxPooling2D(pool_size=(n_mp_3 , 1) , padding="same")(x)
#   x = Conv2D(filters=Ty , kernel_size= (1, n_k_4), padding = "same", activation="relu")(x)

#   x = Reshape(target_shape=(n_e, Ty))(x)
#   x = Permute((2,1) , input_shape=(n_e,Ty))(x)
#   # A FC network
#   x = Dense(n_fc, activation="relu")(x)
#   x = Dense(d , activation="softmax")(x)

#   model = Model(inputs = inpt , outputs = x)
#   return model

In [None]:
# model = CNNmodel()
# model.compile(loss="sparse_categorical_crossentropy",optimizer="adam", metrics="accuracy")
#model.summary()

In [None]:
#model.save("/content/drive/MyDrive/PSC Recommandation séquentielle/Modèles/CNN/NoPairCNN_save")

In [None]:
folder = "/content/drive/MyDrive/PSC Recommandation séquentielle/Données/DataTables/"
class DataGenerator(Sequence):
  """Keras Sequence that streams (X, Y) mini-batches from two parallel CSV files.

  Rows are consumed sequentially: the batch index passed to `__getitem__` is
  only used to size the final batch, the content is always "the next rows" of
  the files. When the X file is exhausted, both files are reopened from their
  first row.

  Args:
    nb_lines: number of samples exposed per epoch.
    X_path: input CSV file name, relative to the module-level `folder`.
    Y_path: target CSV file name, relative to the module-level `folder`.
    batch_size: samples per batch; defaults to the module-level `m`.
  """

  def __init__(self , nb_lines, X_path, Y_path, batch_size=None):
    # Recent Keras versions expect the base-class constructor to be called.
    super().__init__()
    self.X_path = X_path
    self.Y_path = Y_path
    self.nb_lines = nb_lines
    # Fall back to the notebook-wide mini-batch size when none is given.
    self.batch_size = m if batch_size is None else batch_size
    self._X_file = None
    self._Y_file = None
    self._open_readers()

  def _open_readers(self):
    """(Re)open both CSV files at their first row, closing any previous handles."""
    self.close()
    x_file = open(folder + self.X_path , "r", newline="")
    try:
      y_file = open(folder + self.Y_path , "r", newline="")
    except Exception:
      # Do not leak the first handle when the second file cannot be opened.
      x_file.close()
      raise
    self._X_file, self._Y_file = x_file, y_file
    self.X_reader = csv.reader(x_file)
    self.Y_reader = csv.reader(y_file)

  def close(self):
    """Close the underlying CSV file handles, if any are open."""
    for handle in (self._X_file, self._Y_file):
      if handle is not None:
        handle.close()
    self._X_file = None
    self._Y_file = None

  def __len__(self):
    """Number of batches per epoch (the last one may be smaller)."""
    return math.ceil(self.nb_lines / self.batch_size)

  def __getitem__(self, idx):
    """Return the next batch as (X, Y) integer arrays.

    X has one row of ints per sample; Y wraps every target value in its own
    length-1 list, giving a trailing axis of size 1.

    Raises:
      IndexError: if idx is not a valid batch index.
    """
    if not 0 <= idx < len(self):
      raise IndexError("batch index %d out of range" % idx)
    # Size the last batch so that exactly nb_lines samples are served per
    # epoch instead of wrapping around and re-reading the first rows.
    size = min(self.batch_size, self.nb_lines - idx * self.batch_size)
    X1 = []
    Y = []
    for _ in range(size):
      x, y = self.getNextSample()
      X1.append([int(v) for v in x])
      Y.append([[int(v)] for v in y])
    return np.array(X1) , np.array(Y)

  def getNextSample(self):
    """Return the next (x_row, y_row) pair, rewinding both files at end of X.

    Raises:
      ValueError: if a file is empty or Y has fewer rows than X.
    """
    x = next(self.X_reader , None)
    y = next(self.Y_reader , None)
    if x is None:
      # End of the X file: start over from the first row of both files.
      self._open_readers()
      x = next(self.X_reader , None)
      y = next(self.Y_reader , None)
    if x is None or y is None:
      raise ValueError(
        "no sample available: %r / %r are empty or have mismatched lengths"
        % (self.X_path, self.Y_path))
    return x , y

In [None]:
# Load the saved "NoPairCNN" model from Drive; every later cell uses this `model` object
model = load_model("/content/drive/MyDrive/PSC Recommandation séquentielle/Modèles/CNN/CNN No Pair/NoPairCNN_save")

In [None]:
# graph = model_to_dot(model)
# graph.write_png('/content/drive/MyDrive/PSC Recommandation séquentielle/Modèles/CNN/CNN No Pair/model.png')
# model.summary()

In [None]:
# train_gen = DataGenerator(3705954
#                           , "./s6_X_train.csv"
#                           ,"./s6_Y_train.csv")
# model.fit(train_gen , epochs=1, verbose = 1)

# model.save("NoPairCNN_save")

# Comparing train and dev sets

In [None]:
# Evaluate the loaded model on the training CSVs; 3705954 is the number of samples served per pass
train_gen = DataGenerator(3705954, "s_X_train.csv" , "s_Y_train.csv")
model.evaluate(train_gen)

In [None]:
# Evaluate the loaded model on the dev CSVs.
# NOTE(review): the row-count cell of this notebook records 206202 rows for the dev set and
# 205412 for the test set, so the count passed here may be the test-set size — confirm.
dev_gen = DataGenerator(205412, "X_dev.csv" , "Y_dev.csv")
model.evaluate(dev_gen)

In [None]:
# Count the rows of the dev-set input file
with open(folder + "X_dev.csv", 'r') as csvfile:
  spamreader = csv.reader(csvfile)
  n = sum(1 for _ in spamreader)
  print(n) # 206202 rows in the dev set, 205412 in the test set, 3705954 in the train set

## Beam-search-like decoding to predict the most promising baskets

In [None]:
def k_largest(l, k, exclus = ()):
  """Return the indices of the k largest elements of l, best first.

  The result `ind` satisfies l[ind[0]] >= ... >= l[ind[-1]]. Ties are broken
  by ascending index.

  Args:
    l: indexable sequence of comparable values (list, numpy row, ...).
    k: maximum number of indices to return; k <= 0 yields an empty list.
    exclus: iterable of indices that must not appear in the result.

  Returns:
    A list of at most k indices; shorter when fewer than k indices remain
    after removing the excluded ones.
  """
  if k <= 0:
    # The former recursive implementation never terminated in this case.
    return []
  banned = set(exclus)  # constant-time membership tests, duplicates are harmless
  candidates = [i for i in range(len(l)) if i not in banned]
  # Stable sort with reverse=True keeps equal values in ascending index order.
  candidates.sort(key = lambda i: l[i], reverse = True)
  return candidates[:k]

In [None]:
def predict(y_hat, k):
  """Decode one distinct item per row of y_hat using a one-step look-ahead.

  For every row i, the k best not-yet-chosen candidates of row i are paired
  with the k best not-yet-chosen candidates of row i + 1. The pair of distinct
  items with the highest product of scores is selected and its first item is
  committed for row i; on the last pair both items are committed.

  Args:
    y_hat: 2-D indexable of scores, one row per position (the notebook passes
      a 16 x (d - 1) array).
    k: number of candidates kept per row (must be >= 1).

  Returns:
    A list of column indices, one per row of y_hat, without repetition.

  Raises:
    ValueError: if k < 1 or if no pair of distinct candidates exists at some step.
  """
  if k < 1:
    raise ValueError("k must be at least 1")
  n_steps = len(y_hat)
  pred = [] # prediction built so far; also the exclusion list for k_largest
  if n_steps == 0:
    return pred
  if n_steps == 1:
    # No following row to look ahead to: plain arg-max.
    return k_largest(y_hat[0], 1, pred)
  for i in range(n_steps - 1):
    ind1 = k_largest(y_hat[i], k, pred)
    ind2 = k_largest(y_hat[i + 1], k, pred)
    # Single pass for the best valid pair; the strict '>' keeps the first pair
    # met in (ind1, ind2) order on ties, as a stable sort by score would.
    best_pair, best_score = None, None
    for a in ind1:
      for b in ind2:
        if a == b:
          continue
        score = y_hat[i][a] * y_hat[i + 1][b]
        if best_pair is None or score > best_score:
          best_pair, best_score = (a, b), score
    if best_pair is None:
      raise ValueError("no pair of distinct candidates for rows %d and %d" % (i, i + 1))
    pred.append(best_pair[0])
    if i == n_steps - 2:
      # Last pair: the look-ahead item is the final prediction.
      pred.append(best_pair[1])
  return pred

In [None]:
def predict2(y_hat, k):
  """Decode 16 distinct items from y_hat, two rows at a time.

  Rows are processed in pairs (0, 1), (2, 3), ..., (14, 15). For each pair the
  k best not-yet-chosen candidates of both rows are combined, and the pair of
  distinct items with the highest product of scores is appended to the result.
  y_hat is assumed to already have 16 rows and d - 1 = 14370 columns.
  """
  pairs = [(a, b) for a in range(k) for b in range(k)]
  pred = [] # prediction built so far
  row = 0
  while row < 15:
    top_a = k_largest(y_hat[row], k, pred)
    top_b = k_largest(y_hat[row + 1], k, pred)
    # Every candidate pair with its joint score, best score first (stable sort).
    scored = sorted(
      ((top_a[a], top_b[b], y_hat[row][top_a[a]] * y_hat[row + 1][top_b[b]]) for (a, b) in pairs),
      key = lambda t: -t[2],
    )
    # Rank of the best pair made of two different items.
    first_ok = min(pos for pos in range (k ** 2) if scored[pos][0] != scored[pos][1])
    pred.extend(scored[first_ok][:2])
    row += 2
  return pred

In [None]:
def predict3(y_hat, k):
  """Decode 16 distinct items from y_hat, three rows at a time.

  Rows 0-14 are processed in groups of three: the k best not-yet-chosen
  candidates of each row are combined and the triple of pairwise distinct
  items with the highest product of scores is appended. The last row (15) is
  then decoded alone with its best not-yet-chosen item.
  y_hat is assumed to already have 16 rows and d - 1 = 14370 columns.
  """
  triples = [(a, b, c) for a in range(k) for b in range(k) for c in range(k)]
  pred = [] # prediction built so far
  for start in (0, 3, 6, 9, 12):
    tops = [k_largest(y_hat[start + off], k, pred) for off in range(3)]
    # Every candidate triple with its joint score, best score first (stable sort).
    scored = sorted(
      ((tops[0][a], tops[1][b], tops[2][c],
        y_hat[start][tops[0][a]] * y_hat[start + 1][tops[1][b]] * y_hat[start + 2][tops[2][c]])
       for (a, b, c) in triples),
      key = lambda t: -t[3],
    )
    # Rank of the best triple whose three items are all different.
    first_ok = min(pos for pos in range (k ** 3) if len(set(scored[pos][:3])) == 3)
    pred.extend(scored[first_ok][:3])
  # Add the last row on its own.
  pred.append(k_largest(y_hat[15], 2, pred)[0])
  return pred

In [None]:
def predict4(y_hat, k):
  """Decode 16 distinct items from y_hat, four rows at a time.

  Rows are processed in groups (0-3), (4-7), (8-11), (12-15). For each group
  the k best not-yet-chosen candidates of every row are combined, and the
  quadruple of pairwise distinct items with the highest product of scores is
  appended to the result.
  y_hat is assumed to already have 16 rows and d - 1 = 14370 columns.
  """
  quads = [(a, b, c, e) for a in range(k) for b in range(k) for c in range(k) for e in range(k)]
  pred = [] # prediction built so far
  for start in range (0, 15, 4):
    tops = [k_largest(y_hat[start + off], k, pred) for off in range(4)]
    # Every candidate quadruple with its joint score, best score first (stable sort).
    scored = sorted(
      ((tops[0][a], tops[1][b], tops[2][c], tops[3][e],
        y_hat[start][tops[0][a]] * y_hat[start + 1][tops[1][b]] * y_hat[start + 2][tops[2][c]] * y_hat[start + 3][tops[3][e]])
       for (a, b, c, e) in quads),
      key = lambda t: -t[4],
    )
    # Rank of the best quadruple whose four items are all different.
    first_ok = min(pos for pos in range (k ** 4) if len(set(scored[pos][:4])) == 4)
    pred.extend(scored[first_ok][:4])
  return pred

## Results

In [None]:
# Decode one dev-set sample with every method and compare with the reference basket.
SAMPLE_ROW = 1 # 0-based index of the dev-set row used for this demo

with open(folder+"Y_dev.csv" , "r") as file :
  r = csv.reader(file)
  for _ in range(SAMPLE_ROW):
    next(r)
  y = [int(v) for v in next(r)]
print("Séquence de référence : ")
print(y)

with open(folder+"X_dev.csv" , "r") as file :
  r = csv.reader(file)
  for _ in range(SAMPLE_ROW):
    next(r)
  x = np.array([[int(v) for v in next(r)]])
print("64 achats précédents")
print(x)

# Model output for the single sample: one row of scores per predicted position.
y_hat_full = model(x)[0].numpy()
# WARNING: the first column is dropped because of the argmax-style decoding below
# (presumably class 0 is the empty value — TODO confirm). Column c of y_hat_np is item c + 1.
y_hat_np = y_hat_full[:, 1:]
n_steps = y_hat_np.shape[0]

# Reference item ids index the full output directly; indexing y_hat_np with them
# would read the score of the next item (off by one).
p_y = [y_hat_full[i][y[i]] for i in range(n_steps)]
print("Probas de la séquence de référence :")
print(p_y)

########## FIRST METHOD ##########
# Greedy decoding: at each position take the best item not already chosen.
y_hat = []
p_y_hat = []
for i in range(n_steps):
  masked = y_hat_np[i].copy()
  if y_hat:
    masked[y_hat] = -np.inf # already-chosen items can no longer win the argmax
  j = int(np.argmax(masked)) # first index reaching the maximum
  y_hat.append(j)
  p_y_hat.append(y_hat_np[i][j])
y_hat = [1 + idx for idx in y_hat] # back to item ids
print("Séquence inférée et proba (méthode simple) : ")
print(y_hat)
print(p_y_hat)

########## SECOND TO FIFTH METHODS ##########
# Beam-like decoders: (label to print, decoding function, number of candidates kept per row).
beam_methods = [
  ("Séquence inférée et proba (méthode faisceaux) : ", predict, 20),
  ("Séquence inférée et proba (méthode faisceaux deux par deux) : ", predict2, 100),
  ("Séquence inférée et proba (méthode faisceaux trois par trois) : ", predict3, 20),
  ("Séquence inférée et proba (méthode faisceaux quatre par quatre) : ", predict4, 15),
]
for label, decode, beam_width in beam_methods:
  indices = decode(y_hat_np, beam_width)
  p_y_hat = [y_hat_np[i][indices[i]] for i in range(n_steps)]
  y_hat = [1 + idx for idx in indices] # back to item ids
  print(label)
  print(y_hat)
  print(p_y_hat)