In [1]:
import os
import numpy as np
import scipy
import pandas as pd
import math 
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
vente_2015 = pd.read_csv("C:/Users/silva/PycharmProjects/TestExtract/FichierMock_réduit_100K.csv", sep = r'\s*;\s*', header = 0, encoding ="latin-1", engine = 'python')

#Attribution d'une note 1 vente = 1pts
vente_2015['note'] = 1

#Pour la suite du notebook, on va se concentrer sur deux catégories
vente_2015_autre = vente_2015[(vente_2015['categorie_produit'] == 'Autre') | (vente_2015['categorie_client'] == 'Autre')]

vente_2015_high_tech = vente_2015[(vente_2015['categorie_produit'] == 'High-tech') | (vente_2015['categorie_client'] == 'High-tech')]

print('Nombre de ventes par un client ou dun produit "Autre" : %d' % len(vente_2015_autre))

print('Nombre de ventes par un client ou dun produit "High-tech" : %d' % len(vente_2015_high_tech))


Nombre de ventes par un client ou dun produit "Autre" : 346
Nombre de ventes par un client ou dun produit "High-tech" : 19575


In [3]:
#On regroupe d'abord les ventes par client/produit ... 
vente_2015_unique_autre = vente_2015_autre.groupby(['id_client', 'id_produit']).size().groupby('id_client').size()
print("Nombre de client pour catégorie Autre %d" % len(vente_2015_unique_autre))

vente_2015_unique_high_tech = vente_2015_high_tech.groupby(['id_client', 'id_produit']).size().groupby('id_client').size()
print("Nombre de client pour catégorie High-tech %d" % len(vente_2015_unique_high_tech))

#... pour ensuite évaluer le nombre d'achat unique il a fait
#Nous gardons seulement les clients qui ont plus de 2 achats
#Cela pour eviter les problèmes de cold-start
vente_2015_autre_suffisant = vente_2015_unique_autre[vente_2015_unique_autre >= 2].reset_index()[['id_client']]
print("Nombre de client ayant fait un nombre suffisant (2) d'achat Autre %d" % len(vente_2015_autre_suffisant))

vente_2015_high_tech_suffisant = vente_2015_unique_high_tech[vente_2015_unique_high_tech >= 2].reset_index()[['id_client']]
print("Nombre de client ayant fait un nombre suffisant (2) d'achat high_tech %d" % len(vente_2015_high_tech_suffisant))

Nombre de client pour catégorie Autre 194
Nombre de client pour catégorie High-tech 5573
Nombre de client ayant fait un nombre suffisant (2) d'achat Autre 49
Nombre de client ayant fait un nombre suffisant (2) d'achat high_tech 2988


In [4]:
#On join le fichier de base avec les données que nous avons récupéré (les clients ayant acheté un nombre suffisant de produit)
#Afin d'avoir les informations de la vente initiale
vente_2015_entier_autre = vente_2015.merge(vente_2015_autre_suffisant, 
                                how = 'right',
                                left_on = 'id_client',
                                right_on = 'id_client')
print("Nombre de vente autre  %d" % len(vente_2015_entier_autre))

vente_2015_entier_high_tech = vente_2015.merge(vente_2015_high_tech_suffisant, 
                                how = 'right',
                                left_on = 'id_client',
                                right_on = 'id_client')
print("Nombre de vente high-tech  %d" % len(vente_2015_entier_high_tech))

Nombre de vente autre  435
Nombre de vente high-tech  23445


In [5]:
#Pour chaque interaction, nous allons ajouter un facteur, ce qui montre qu'il est est très intéressé par ce produit
def smooth_user_preference(x):
    return math.log(1+x, 2)

#On aggrege alors sur la note et on enlève les doublons
vente_autre = vente_2015_entier_autre.groupby(['id_client', 'id_produit'])['note'].sum().apply(smooth_user_preference).reset_index()
print("Nombre de vente unique client/produit autre %d" % len(vente_autre))

vente_high_tech = vente_2015_entier_high_tech.groupby(['id_client', 'id_produit'])['note'].sum().apply(smooth_user_preference).reset_index()
print("Nombre de vente unique client/produit high-tech %d" % len(vente_high_tech))

Nombre de vente unique client/produit autre 429
Nombre de vente unique client/produit high-tech 23436


In [6]:
#On split le fichier de base en 2 parties : Train et Test
vente_train_autre, vente_test_autre = train_test_split(vente_autre, 
                                          test_size = 0.20)
print("Nombre de vente autre train %d" % len(vente_train_autre))
print("Nombre de vente autre test %d" % len(vente_test_autre))

vente_train_high_tech, vente_test_high_tech = train_test_split(vente_high_tech, 
                                          test_size = 0.20)
print("Nombre de vente high-tech train %d" % len(vente_train_high_tech))
print("Nombre de vente high-tech test %d" % len(vente_test_high_tech))

Nombre de vente autre train 343
Nombre de vente autre test 86
Nombre de vente high-tech train 18748
Nombre de vente high-tech test 4688


In [7]:
#On indexe pour l'id_client pour faciliter les recherches pendant l'évaluation du modèle
vente_full_indexed_autre = vente_autre.set_index('id_client')
vente_train_indexed_autre = vente_train_autre.set_index('id_client')
vente_test_indexed_autre = vente_test_autre.set_index('id_client')

vente_full_indexed_high_tech = vente_high_tech.set_index('id_client')
vente_train_indexed_high_tech = vente_train_high_tech.set_index('id_client')
vente_test_indexed_high_tech = vente_test_high_tech.set_index('id_client')

In [8]:
#Permet de recupérer les données clients et produit
def get_items_interacted(person_id, vente):
    interacted_items = vente.loc[person_id]['id_produit']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [9]:
#Nous avons choisit d'utiliser la métrique Recall@N
#Elle évalue si l'élément fait partie des N meilleurs recommendations
#La précision Top-N 
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

#CLASSE POUR LA CATEGORIE AUTRE
class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, vente_full_indexed_autre)
        all_items = set(vente_2015_autre['id_produit'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = vente_test_indexed_autre.loc[person_id]
        if type(interacted_values_testset['id_produit']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['id_produit'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['id_produit'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    vente_train_indexed_autre), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['id_produit'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['id_produit'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(vente_test_indexed_autre.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator_autre = ModelEvaluator()   

In [10]:
#Nous avons choisit d'utiliser la métrique Recall@N
#Elle évalue si l'élément fait partie des N meilleurs recommendations
#La précision Top-N 
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

#CLASSE POUR LA CATEGORIE HIGH-TECH
class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, vente_full_indexed_high_tech)
        all_items = set(vente_2015_high_tech['id_produit'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = vente_test_indexed_high_tech.loc[person_id]
        if type(interacted_values_testset['id_produit']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['id_produit'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['id_produit'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    vente_train_indexed_high_tech), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['id_produit'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['id_produit'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(vente_test_indexed_high_tech.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator_high_tech = ModelEvaluator()  

In [11]:
#Création de la matrice des clients / Produits avec 0 quand ils ont pas acheté le produit, et 1 quand ils ont acheté
#Matrice catégorie autre
matrix_vente_autre = vente_train_autre.pivot(index = 'id_client',
                                                 columns = 'id_produit',
                                                 values = 'note').fillna(0)

#Matrice catégorie High-tech 
matrix_vente_high_tech = vente_train_high_tech.pivot(index = 'id_client',
                                                 columns = 'id_produit',
                                                 values = 'note').fillna(0)


In [12]:
matrix_vente_autre_valeur = matrix_vente_autre.values
matrix_vente_high_tech_valeur = matrix_vente_high_tech.values

In [13]:
matrix_user_autre = list(matrix_vente_autre.index)
matrix_user_high_tech = list(matrix_vente_high_tech.index)

In [14]:
user_sparse_matrix_autre = csr_matrix(matrix_vente_autre_valeur)
user_sparse_matrix_autre

<49x272 sparse matrix of type '<class 'numpy.float64'>'
	with 343 stored elements in Compressed Sparse Row format>

In [15]:
user_sparse_matrix_high_tech = csr_matrix(matrix_vente_high_tech_valeur)
user_sparse_matrix_high_tech

<2976x15311 sparse matrix of type '<class 'numpy.float64'>'
	with 18748 stored elements in Compressed Sparse Row format>

In [16]:
nb_facteurs = 15
#Facteur pour le calcul sur les matrices
U_autre, sigma_autre, Vt_autre = svds(user_sparse_matrix_autre, k = nb_facteurs)
U_high_tech, sigma_high_tech, Vt_high_tech = svds(user_sparse_matrix_high_tech, k = nb_facteurs)

In [17]:
U_autre.shape
U_high_tech.shape

(2976, 15)

In [18]:
Vt_autre.shape
Vt_high_tech.shape

(15, 15311)

In [19]:
sigma_autre = np.diag(sigma_autre)
sigma_autre.shape

sigma_high_tech = np.diag(sigma_high_tech)
sigma_high_tech.shape

(15, 15)

In [20]:
#Nous allons calculer des prédicteurs pour les valeurs 0 de la matrice "matrix_vente_xx" 
user_predicted_autre = np.dot(np.dot(U_autre,sigma_autre), Vt_autre)

user_predicted_high_tech = np.dot(np.dot(U_high_tech, sigma_high_tech), Vt_high_tech)
user_predicted_high_tech
user_predicted_autre

array([[ 6.90755148e-01,  1.43744801e-02,  2.79653720e-03, ...,
        -3.65170567e-02, -3.33897453e-03, -1.05784733e-02],
       [-8.50720268e-03,  3.75230575e-02, -1.59832744e-03, ...,
        -3.03923247e-03,  2.75171119e-03, -1.70982990e-02],
       [ 1.03589230e-02, -1.05684726e-01, -8.15059722e-03, ...,
         3.05882881e-04, -2.26224547e-02,  6.77073710e-02],
       ...,
       [-6.28608841e-03,  5.11183316e-03,  3.77827037e-04, ...,
        -8.19691274e-03,  2.67191357e-03,  1.79525354e-01],
       [ 3.63369477e-02, -2.75307607e-02, -8.07815423e-03, ...,
         2.86233518e-01,  6.73148851e-03,  1.09452447e-01],
       [-3.12520637e-02, -6.80506645e-03, -1.58442247e-03, ...,
        -8.54136468e-02,  2.16852723e-03,  2.95342983e-02]])

In [21]:
user_predicted_autre_normalise = (user_predicted_autre - user_predicted_autre.min()) / (user_predicted_autre.max() - user_predicted_autre.min())

user_predicted_high_tech_normalise = (user_predicted_high_tech - user_predicted_high_tech.min()) / (user_predicted_high_tech.max() - user_predicted_high_tech.min())
user_predicted_high_tech_normalise
user_predicted_autre_normalise

array([[0.59711429, 0.19185291, 0.18491585, ..., 0.16136065, 0.18123968,
        0.17690205],
       [0.17814308, 0.20572265, 0.18228261, ..., 0.18141928, 0.18488899,
        0.17299562],
       [0.18944694, 0.11991803, 0.17835674, ..., 0.18342354, 0.16968576,
        0.22380793],
       ...,
       [0.17947388, 0.18630308, 0.18346665, ..., 0.17832899, 0.18484118,
        0.29080498],
       [0.20501197, 0.16674489, 0.17840015, ..., 0.35474043, 0.18727352,
        0.24881998],
       [0.16451523, 0.17916293, 0.18229094, ..., 0.13206368, 0.18453957,
        0.20093609]])

In [22]:
matrix_preds_autre = pd.DataFrame(user_predicted_autre_normalise, columns = matrix_vente_autre.columns, index = matrix_user_autre).transpose()

matrix_preds_high_tech = pd.DataFrame(user_predicted_high_tech_normalise, columns = matrix_vente_high_tech.columns, index = matrix_user_high_tech).transpose()
matrix_preds_high_tech.head(10)
matrix_preds_autre.head(10)

Unnamed: 0_level_0,92611,96649,96790,97359,97404,97440,97528,97615,97663,97673,...,98715,98719,98723,98730,98731,98735,98754,98756,98757,98758
id_produit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1624,0.597114,0.178143,0.189447,0.180193,0.181523,0.184691,0.180708,0.18206,0.188955,0.191929,...,0.184302,0.187323,0.195808,0.184954,0.268073,0.18124,0.189169,0.179474,0.205012,0.164515
3855,0.191853,0.205723,0.119918,0.170632,0.286603,0.192976,0.162236,0.15967,0.177868,0.161339,...,0.190323,0.251886,0.151509,0.150147,0.15903,0.157254,0.242634,0.186303,0.166745,0.179163
4047,0.184916,0.182283,0.178357,0.18582,0.172377,0.170774,0.185205,0.188121,0.183217,0.180761,...,0.183236,0.174193,0.186002,0.183725,0.183569,0.187181,0.177974,0.183467,0.1784,0.182291
4548,0.175565,0.180847,0.18678,0.173728,0.167129,0.182459,0.254725,0.166679,0.177373,0.184683,...,0.14861,0.211555,0.190347,0.184725,0.213308,0.188989,0.204522,0.169945,0.173933,0.17249
6623,0.182594,0.172009,0.218386,0.186667,0.187389,0.182778,0.195359,0.212913,0.192115,0.190469,...,0.185633,0.17266,0.192782,0.22775,0.175595,0.208948,0.198271,0.185563,0.181088,0.170356
8225,0.170445,0.222378,0.215561,0.175586,0.18663,0.314002,0.183378,0.174353,0.185723,0.199678,...,0.183287,0.187274,0.186577,0.195298,0.195295,0.176071,0.175432,0.176436,0.188829,0.204328
8844,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324,...,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324,0.18324
13171,0.184916,0.182283,0.178357,0.18582,0.172377,0.170774,0.185205,0.188121,0.183217,0.180761,...,0.183236,0.174193,0.186002,0.183725,0.183569,0.187181,0.177974,0.183467,0.1784,0.182291
13778,0.187323,0.185994,0.186497,0.186925,0.170165,0.188046,0.175834,0.31541,0.251251,0.169903,...,0.400635,0.433041,0.174115,0.214653,0.170715,0.177371,0.170986,0.18384,0.173714,0.183272
15526,0.187323,0.185994,0.186497,0.186925,0.170165,0.188046,0.175834,0.31541,0.251251,0.169903,...,0.400635,0.433041,0.174115,0.214653,0.170715,0.177371,0.170986,0.18384,0.173714,0.183272


In [23]:
len(matrix_preds_autre.columns)
len(matrix_preds_high_tech.columns)

2976

In [24]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, matrix_preds_autre, items_df=None):
        self.cf_preds = matrix_preds_autre
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_preds[user_id].sort_values(ascending=False).reset_index().rename(columns={user_id: 'note'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['id_produit'].isin(items_to_ignore)].sort_values('note', ascending = False).head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df


        return recommendations_df
    
#Pour la catégorie Autre 
cf_recommender_model_autre = CFRecommender(matrix_preds_autre, vente_2015)

#Pour la catégorie High_tech 
cf_recommender_model_high_tech = CFRecommender(matrix_preds_high_tech, vente_2015)

In [25]:
print("SVD")
cf_global_metrics, cf_detailed_results = model_evaluator_autre.evaluate_model(cf_recommender_model_autre)
print("\nGlobal Metrics : \n%s" % cf_global_metrics)
cf_detailed_results.head(10)

SVD
37 users processed

Global Metrics : 
{'modelName': 'Collaborative Filtering', 'recall@5': 0.03488372093023256, 'recall@10': 0.046511627906976744}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
7,0,0,6,0.0,0.0,98147
1,0,0,5,0.0,0.0,97928
2,0,0,4,0.0,0.0,98311
5,0,0,4,0.0,0.0,98430
11,0,0,3,0.0,0.0,98168
24,0,0,3,0.0,0.0,98530
6,0,0,3,0.0,0.0,97528
8,1,1,3,0.333333,0.333333,98632
9,0,0,3,0.0,0.0,98758
18,0,0,3,0.0,0.0,97933


In [38]:
cf_recommender_model_autre.recommend_items(92611, topn=20, verbose=True)

Unnamed: 0,id_produit,note
0,99925,0.946031
1,99966,0.681947
2,1624,0.597114
3,89930,0.597114
4,92589,0.597114
5,93470,0.597114
6,96560,0.597114
7,99396,0.597114
8,99936,0.534018
11,99389,0.404556


In [39]:
vente_2015.loc[(vente_2015['id_client'] == 92611) & (vente_2015['id_produit'] == 1624)]

Unnamed: 0,id_vente,date_vente,id_client,categorie_client,id_magasin,prix_vente_unitaire,quantité,id_produit,categorie_produit,note
19187,2939,02/01/2015,92611,Enfant,197,46.7,4,1624,High-tech,1


In [40]:
vente_2015.loc[(vente_2015['id_client'] == 92611) & (vente_2015['id_produit'] == 99314)]

Unnamed: 0,id_vente,date_vente,id_client,categorie_client,id_magasin,prix_vente_unitaire,quantité,id_produit,categorie_produit,note


In [29]:
cf_recommender_model_high_tech.recommend_items(11317, topn=20, verbose=True)

Unnamed: 0,id_produit,note
0,5626,0.014616
1,19192,0.014615
2,251,0.014615
3,1606,0.014615
4,4621,0.014615
5,6092,0.014615
6,3535,0.014615
7,2066,0.014615
8,4028,0.014615
9,12712,0.014615


In [30]:
vente_2015.loc[(vente_2015['id_client'] == 11317) & (vente_2015['id_produit'] == 12240)]

Unnamed: 0,id_vente,date_vente,id_client,categorie_client,id_magasin,prix_vente_unitaire,quantité,id_produit,categorie_produit,note


In [31]:
#Calcul des produits "Autre" les plus populaires
item_popularity_autre = vente_autre.groupby('id_produit')['note'].sum().sort_values(ascending=False).reset_index()
item_popularity_autre.head(10)

item_popularity_high_tech = vente_high_tech.groupby('id_produit')['note'].sum().sort_values(ascending=False).reset_index()
item_popularity_high_tech.head(10)

Unnamed: 0,id_produit,note
0,1951,6.0
1,10793,6.0
2,5629,6.0
3,3589,6.0
4,10292,6.0
5,5436,6.0
6,1844,5.0
7,11993,5.0
8,3249,5.0
9,6167,5.0


In [32]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['id_produit'].isin(items_to_ignore)].sort_values('note', ascending = False).head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df

        return recommendations_df
    
#Produit Autre
popularity_model_autre = PopularityRecommender(item_popularity_autre, vente_2015)

#Produit High_tech
popularity_model_high_tech = PopularityRecommender(item_popularity_high_tech, vente_2015)

In [33]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator_autre.evaluate_model(popularity_model_autre)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...
37 users processed

Global metrics:
{'modelName': 'Popularity', 'recall@5': 0.05813953488372093, 'recall@10': 0.10465116279069768}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
7,0,0,6,0.0,0.0,98147
1,0,0,5,0.0,0.0,97928
2,0,0,4,0.0,0.0,98311
5,0,0,4,0.0,0.0,98430
11,0,0,3,0.0,0.0,98168
24,0,0,3,0.0,0.0,98530
6,0,0,3,0.0,0.0,97528
8,0,0,3,0.0,0.0,98632
9,1,1,3,0.333333,0.333333,98758
18,0,0,3,0.0,0.0,97933


In [34]:
popularity_model_autre.recommend_items(98704, topn=20, verbose=True)

Unnamed: 0,id_produit,note
0,99934,6.0
1,99906,4.0
2,99914,4.0
3,99948,4.0
4,99933,4.0
5,99910,4.0
6,99979,4.0
7,99903,4.0
9,99919,3.584963
8,99921,3.584963


In [35]:
vente_2015.loc[(vente_2015['id_client'] == 98704) & (vente_2015['id_produit'] == 99934)]

Unnamed: 0,id_vente,date_vente,id_client,categorie_client,id_magasin,prix_vente_unitaire,quantité,id_produit,categorie_produit,note


In [36]:
vente_2015.loc[(vente_2015['id_client'] == 98704) & (vente_2015['id_produit'] == 99919)]

Unnamed: 0,id_vente,date_vente,id_client,categorie_client,id_magasin,prix_vente_unitaire,quantité,id_produit,categorie_produit,note
45308,6961,02/01/2015,98704,Autre,85,20.83,8,99919,Autre,1
