In [251]:
import csv
import copy
import numpy as np
import random
from igraph import *
import math
import itertools
from scipy.optimize import linear_sum_assignment

In [263]:
# Data extraction

def load_rows(path):
    """
        Read a CSV file into memory.

        Parameters
        ----------
            path: str
                Path of the CSV file to read (comma-delimited, '|' as quote character).

        Returns
        ----------
            rows: List[List[str]]
                One list of string fields per line of the file, in file order.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(path, newline='') as csvfile:
        return list(csv.reader(csvfile, delimiter=',', quotechar='|'))


data100 = load_rows('100.csv')
data1000 = load_rows('1000.csv')

In [306]:
# First greedy algorithm - pick the user's rarest item, narrowing down the other users as we go

def find_fingerprints(data):
    """
        Greedily build one fingerprint per user.

        For each user, repeatedly pick the item of that user which is liked by
        the fewest of the users still compatible with the fingerprint so far,
        then keep only the users who also like that item. Stop as soon as the
        user is the only one left, or once all of the user's items are used
        (in that case the fingerprint is NOT unique: at least one other user
        likes every item of this user).

        Parameters
        ----------
            data: List[Tuple[int, int]]
                List of couples like (user_id, item_id).
                An item could be a song, an artist or something else.
                Values may be ints or numeric strings (e.g. rows from csv.reader).
                Empty rows are skipped and duplicated couples are counted once.

        Returns
        ----------
            fingerprints: List[Tuple[int, List[int]]]
                List of couples like (user_id, fingerprint), sorted by user_id.
                Fingerprint being a list of items, in the order they were picked.
            cost: int
                Sum of the lengths of all of the fingerprints.
            k: int
                Length of the biggest fingerprint (0 when data is empty).
    """
    # Index the likes both ways. Each user's items are kept in order of first
    # appearance in data: that order is the tie-break between equally rare items.
    items_of = {}
    users_of = {}
    for row in data:
        if len(row) == 0:
            continue
        user, item = int(row[0]), int(row[1])
        liked = items_of.setdefault(user, [])
        fans = users_of.setdefault(item, set())
        if user not in fans:
            fans.add(user)
            liked.append(item)

    # Nothing to fingerprint
    if not items_of:
        return [], 0, 0

    everyone = set(items_of)
    fingerprints = []

    # For each user, in increasing id order
    for user in sorted(items_of):

        # Users still indistinguishable from `user` given the fingerprint so far
        candidates = everyone
        # Items of the user not yet in the fingerprint
        remaining = list(items_of[user])
        fingerprint = []

        # While other users remain and the user still has unused items
        while remaining and len(candidates) > 1:

            # Weight of an item = number of remaining candidates who like it.
            # min() returns the first minimum, i.e. the earliest item in data.
            rarest = min(remaining, key=lambda item: len(candidates & users_of[item]))
            remaining.remove(rarest)
            fingerprint.append(rarest)

            # Drop the users who did not like the selected item
            candidates = candidates & users_of[rarest]

        fingerprints.append([user, fingerprint])

    # Return the fingerprints, the sum of their sizes and the largest size
    sizes = [len(fingerprint) for _, fingerprint in fingerprints]
    return fingerprints, sum(sizes), max(sizes)

In [307]:
# Run the greedy search on the 100-user dataset, then print the total cost and
# the largest fingerprint size, followed by one [user, fingerprint] pair per line.
fingerprints, cost, k = find_fingerprints(data100)
print(cost, k)
for f in fingerprints:
    print(f)

21 1
[612, [142013]]
[807, [641]]
[814, [959]]
[829, [762]]
[1244, [190935]]
[1762, [800]]
[3212, [3566]]
[3444, [179798]]
[3862, [9962076]]
[5737, [244311]]
[5924, [71030]]
[6010, [3925]]
[6135, [630]]
[7403, [7965]]
[7784, [1030]]
[8113, [3]]
[10942, [4195]]
[10963, [5012881]]
[12434, [493]]
[13411, [152369]]
[13969, [175]]


In [309]:
# Same run on the 1000-user dataset: print the total cost and the largest
# fingerprint size, followed by one [user, fingerprint] pair per line.
fingerprints, cost, k = find_fingerprints(data1000)
print(cost, k)
for f in fingerprints:
    print(f)

1157 11
[1, [199860]]
[5, [6958285]]
[6, [6699351]]
[8, [7065571]]
[9, [193557]]
[13, [5229206]]
[18, [11865099]]
[19, [583456]]
[20, [823652]]
[39, [112173, 144227]]
[42, [69530]]
[43, [77163]]
[47, [1021278]]
[52, [342923]]
[62, [1635314]]
[63, [643]]
[64, [513292]]
[68, [78682, 51465712]]
[80, [4784343]]
[90, [826382]]
[95, [533395]]
[102, [1200694]]
[115, [57793]]
[123, [112678]]
[128, [1950]]
[134, [62959]]
[137, [6113884]]
[141, [1067]]
[145, [289430]]
[164, [533111]]
[168, [4218204]]
[175, [156453]]
[176, [247256]]
[178, [1352863]]
[185, [269994]]
[186, [11464]]
[189, [10795]]
[191, [86371]]
[194, [1994, 919, 3128]]
[199, [1254503]]
[214, [1519440]]
[216, [5518476]]
[223, [8514]]
[226, [6168798]]
[268, [4878787]]
[269, [5186409]]
[278, [1423960]]
[295, [4241678]]
[299, [5645111]]
[334, [981085]]
[344, [4443765]]
[348, [69789]]
[359, [2110751]]
[362, [240330]]
[390, [71335]]
[393, [4316960]]
[404, [4855153]]
[429, [4395901]]
[438, [2235]]
[482, [1687943]]
[501, [92782]]
[514, [13

Dans les deux algorithmes qui suivent, un « item » peut être un ensemble d'items. Par exemple, lorsque k = 2, un item peut représenter un couple d'items.

In [293]:
# Assignment problem
def assignment_fingerprints(data, max_k=2):
    """
        Assign a distinct item set to every user by solving a linear
        assignment problem, trying item sets of growing size.

        For k = 1 .. max_k the candidate columns are all item sets of size
        1 .. k. A user -> item set edge costs the size of the set when the user
        likes every item in it, and max_k + 1 otherwise. Rounds stop as soon as
        a larger k does not strictly lower the total cost.

        Caveats (inherited from the method itself):
          * the solver may still pick an edge of cost max_k + 1, in which case
            the returned "fingerprint" contains items the user does not like;
          * when there are fewer candidate item sets than users, only as many
            users as there are columns get a fingerprint;
          * the number of columns grows combinatorially with max_k.

        Parameters
        ----------
            data: List[Tuple[int, int]]
                List of couples like (user_id, item_id).
                An item could be a song, an artist or something else.
                Values may be ints or numeric strings; empty rows are skipped.
            max_k: int
                Largest item-set size to try (default 2).

        Returns
        ----------
            fingerprints: List[Tuple[int, List[int]]]
                List of couples like (user_id, fingerprint).
                Fingerprint being a list of items.
            cost: int
                Total cost of the retained assignment (sum of the lengths of
                the fingerprints when every retained edge exists).
            k: int
                Length of the biggest fingerprint (0 when data is empty).

        Raises
        ----------
            ValueError
                If max_k is smaller than 1.
    """
    if max_k < 1:
        raise ValueError("max_k must be at least 1")

    # Items liked by each user, as sets for fast subset tests
    owned = {}
    for row in data:
        if len(row) == 0:
            continue
        owned.setdefault(int(row[0]), set()).add(int(row[1]))

    if not owned:
        return [], 0, 0

    users = sorted(owned)
    all_items = sorted(set().union(*owned.values()))

    # Any cost strictly greater than the largest set size marks a missing edge
    missing_cost = max_k + 1

    candidates = []
    best_cost = math.inf
    fingerprints = []

    for k in range(1, max_k + 1):

        # Add the item sets of size k to the candidates of the previous rounds
        # TODO: only generate the combinations that exist among the users' item sets
        candidates += [list(combo) for combo in itertools.combinations(all_items, k)]

        # Cost matrix: one row per user, one column per candidate item set
        cost_matrix = np.array(
            [[len(combo) if owned[user].issuperset(combo) else missing_cost
              for combo in candidates]
             for user in users],
            dtype=float,
        )

        rows, cols = linear_sum_assignment(cost_matrix)
        cost = cost_matrix[rows, cols].sum()

        # Stop as soon as a larger k brings no strict improvement
        if cost >= best_cost:
            break

        best_cost = cost
        fingerprints = [[users[r], candidates[c]] for r, c in zip(rows, cols)]

    # k is measured on the retained assignment rather than derived from the
    # loop counter, so it is right whether or not the loop ended on a break.
    biggest = max(len(fingerprint) for _, fingerprint in fingerprints)
    return fingerprints, int(best_cost), biggest

In [294]:
# Run the assignment-based search on the 100-user dataset, then print the total
# cost and k, followed by one [user, fingerprint] pair per line.
fingerprints, cost, k = assignment_fingerprints(data100)
print(cost, k)
for f in fingerprints:
    print(f)

21.0 1
[612, [142013]]
[807, [641]]
[814, [68]]
[829, [762]]
[1244, [106]]
[1762, [323]]
[3212, [27]]
[3444, [145]]
[3862, [1275239]]
[5737, [1033]]
[5924, [15375]]
[6010, [3925]]
[6135, [630]]
[7403, [973]]
[7784, [1030]]
[8113, [3]]
[10942, [4195]]
[10963, [4237]]
[12434, [493]]
[13411, [152369]]
[13969, [175]]


In [None]:
# WARNING: very long-running on the 1000-user dataset (see the note below:
# more than two hours without a result).
assignment_fingerprints(data1000)

We tried running it on the 1000-user dataset with k going from 1 to 2: the job ran for more than two hours without producing a result.
The cost matrix has one column per combination of items, so its size explodes as soon as k = 2. This method is therefore probably of limited use, since it is too heavy even for medium-sized datasets.

In [296]:
""" En cours de modifications. """

# Ensembles de k items
def find_fingerprints_with_k(data):
    
    # Initialisation des tables
    super_likes = np.array(data)
    super_likes = super_likes.astype(np.int)
    super_users = np.array(np.unique(super_likes[:,0]))
    super_items = np.array(np.unique(super_likes[:,1]))
        
    # Initialisation de la liste des fingerprints
    fingerprints = []
    
    # Faire une copie des tables
    likes = copy.deepcopy(super_likes)
    users = copy.deepcopy(super_users)
    items = copy.deepcopy(super_items)
    users_weights = np.zeros(np.size(users))
    items_weights = np.zeros(np.size(items))
    
    # Pour k allant de 1 à n?
    for k in range (1, 2):
    
        # Calcul des poids des users
        for l in likes:
            users_weights[np.where(users==l[0])] += 1
            
        
        # Calcul des poids des items
        for l in likes:
            items_weights[np.where(items==l[1])] += 1
    
        # Pour chaque user
        for num_u, u in enumerate(users):
            
            # Si le poids du user = 1 son fingerprint est l'unique item qui lui est propre
            if users_weights[num_u] == 1:
                F = [u, likes[np.where(likes[:,0] == u)[0][0]][1]]
                fingerprints.append(F)
              
        # Pour chaque item
        for num_i, i in enumerate(items):
            
            # Si le poids de l'item = 1 c'est le fingerprint de l'unique user qui lui est propre
            if items_weights[num_i] == 1:
                user = likes[np.where(likes[:,1] == i)[0][0]][0]
                if user not in np.array(fingerprints)[:,0]:
                    F = [user, i]
                    fingerprints.append(F)
                
        # Réinitialisation des tables
        # Supprimer tous les users dont on a trouvé le fingerprint
        users = [u for u in users if u not in np.array(fingerprints)[:,0]]
        # Re
        items = itertools.combinations(items, k+1)

In [None]:
# Try the work-in-progress k-item search on the 100-user dataset.
find_fingerprints_with_k(data100)