In [17]:
import csv
import copy
import numpy as np
import random
from igraph import *
import math
import itertools
from itertools import groupby
from operator import itemgetter
from scipy.optimize import linear_sum_assignment
import time
import multiprocess
from networkx import Graph
from networkx import connected_components
from networkx.algorithms import bipartite
from networkx.algorithms.bipartite.matching import minimum_weight_full_matching, maximum_matching, hopcroft_karp_matching
from networkx.algorithms import maximal_matching


In [8]:
# Data extraction

data100a = []
with open('100.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        data100a.append([int(row[0]), row[1]])
data100a = np.array(data100a)

data100b = []
with open('100_2.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        data100b.append([int(row[0]), row[1]])
data100b = np.array(data100b)
        
data1000a = []
with open('1000.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        data1000a.append([int(row[0]), row[1]])
data1000a = np.array(data1000a)

data1000b = []
with open('1000_2.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        data1000b.append([int(row[0]), row[1]])
data1000b = np.array(data1000b)

In [45]:
# First algorithm - gluton approach - chose minimal weight item, remove users not having this item, repeat

def find_fingerprints_gluton(data):
    """
        Parameters
        ----------
            data: List[Tuple[int, int]]
                List of couples like (user_id, item_id).
                An item could be a song, an artist or something else.

        Returns
        ----------
            fingerprints: List[Tuple[int, List[int]]]
                List of couples like (user_id, fingerprint).
                Fingerprint being a list of items.
            cost: int
                Sum of the lengths of all of the fingerprints.
            k: int
                Length of the biggest fingerprint.
    """
    # Initialisation
    super_likes = np.array(data)
    super_likes = super_likes.astype(np.int)
    super_users = np.array(np.unique(super_likes[:,0]))
    super_items = np.array(np.unique(super_likes[:,1]))
    super_weigths = np.zeros(np.size(super_items))
    
    # Calculate weigths
    for l in super_likes:
        super_weigths[np.where(super_items==l[1])] += 1
        
    # Init fingerprints list
    fingerprints = []
    
    # For each user
    for num, u in enumerate(super_users):
        
        # Make a copy of all tables
        likes = copy.deepcopy(super_likes)
        users = copy.deepcopy(super_users)
        items = copy.deepcopy(super_items)
        weigths = copy.deepcopy(super_weigths)
        
        # Init fingerprint
        F = [] 
        
        # While nb_users > 1 et we have not run through all liked items
        while len(F) < len([l for l in likes if l[0]==u]) and len(users) > 1 :
            
            # Extract user items that are not in the fingerprint
            u_items = [x for x in [np.argwhere(items==a)[0][0] 
                                   for a in [l[1] for l in likes if l[0]==u]] 
                       if items[x] not in F]
            
            # Find one with minimal weigth
            for i, index in enumerate(u_items):
                if i == 0 or weigths[index] < weigths[minimum]:
                    minimum = index
            selected = items[minimum]
            
            # Add it to the fingerprint
            F.append(selected)
            
            # Remove users who have not liked the selected item
            users = np.array([l[0] for l in likes if l[1] == selected])
            
            # Re-init tables
            likes = np.array([l for l in likes if l[0] in users])
            items = np.array(np.unique(likes[:,1]))
            weigths = np.zeros(np.size(items))
            for l in likes:
                weigths[np.where(items==l[1])] += 1
                
        # Save fingerprint
        fingerprints.append([u, F])
       
    # Return fingerprints list, fp cardinals sum and maximal cardinal 
    return fingerprints, sum([len(f[1]) for f in fingerprints]), max([len(f[1]) for f in fingerprints])

In [50]:
start_time = time.time()
fingerprints, cost, k = find_fingerprints_gluton(data100a)
print('Favorite songs from 100 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = find_fingerprints_gluton(data100b)
print('Favorite songs from 100 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = find_fingerprints_gluton(data1000a)
print('Favorite songs from 1000 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = find_fingerprints_gluton(data1000b)
print('Favorite songs from 1000 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

Favorite songs from 100 users (dataset 1).
Sum of fingerprints cardinals: 104 Max cardinal: 2
Time: 1.6250131130218506 

Favorite songs from 100 users (dataset 2).
Sum of fingerprints cardinals: 100 Max cardinal: 1
Time: 2.8732779026031494 

Favorite songs from 1000 users (dataset 1).
Sum of fingerprints cardinals: 1157 Max cardinal: 11
Time: 224.24060535430908 

Favorite songs from 1000 users (dataset 2).
Sum of fingerprints cardinals: 1048 Max cardinal: 4
Time: 372.7196590900421 



----------------------------------

The following algorithms are variations of the assignment problem.

We gradually increment a variable k, and add items composed of k items to our list. 
Maximum k will represent maximum cardinal of a fingerprint


In [9]:
# k among n combinations

def combinations(args):
    user, items, k = args
    return np.array([[user, '&'.join(row)] for row in list(itertools.combinations(items, k))])



The following function is pretty much irrelevant: we can actually not use scipy.linear_sum_assignment function 
as it takes only full matrices, which means we have to create fake links between users and items that 
they have not liked.

Even if we give these edges a huge weight there will still be a risk that the algorith choses an edge 
that represents an inexisting link.

We could try to find a similar function capabale of treating matrices with NaN values, but I've decided
to completely change the shape of our data and use the NetworkX library instead.



In [53]:
def find_fingerprints_cost_matrix(data):
    """
        Parameters
        ----------
            data: List[Tuple[int, int]]
                List of couples like (user_id, item_id).
                An item could be a song, an artist or something else.

        Returns
        ----------
            fingerprints: List[Tuple[int, List[int]]]
                List of couples like (user_id, fingerprint).
                Fingerprint being a list of items.
            cost: int
                Sum of the lengths of all of the fingerprints.
            k: int
                Length of the biggest fingerprint.
    """
    
    # Init tables
    likes = np.array(data)
    likes = likes.astype(np.int)
    likes = likes[np.lexsort(np.fliplr(likes).T)]
    users = np.array(np.unique(likes[:,0]))
    super_items = np.array(np.unique(likes[:,1]))
    user_items = [(user, list(list(zip(*item))[1])) for user, item in groupby(likes.tolist(), itemgetter(0))]
    
    # Increment k gradually
    for k in range (1, 3):
        
        # Create items combinations of size k
        if k == 1:
            # Init previous cost as an infinite number
            prev_cost = math.inf
            items = [[super_items[i]] for i, item in enumerate(super_items)]
        else:
            # Generate combinaisons of items within user's favourites
            try:
                pool = multiprocess.Pool()
                my_combinations = pool.map(combinations, ([row[1], k] for row in user_items))
                
                new_likes = []
                for i in range(0, len(my_combinations)):
                    for j in range(0, len(my_combinations[i])):
                        new_likes.append([int(my_combinations[i][j][0]), my_combinations[i][j][1], k])
                        
                new_items = np.unique(np.array(new_likes)[:,1])
                
            finally: 
                pool.close()
                pool.join()
                
            items = items + new_items
        
        # Init cost matrix
        cost_matrix = np.empty([len(users), len(items)])
        
        # Generate cost matrix
        for num_u, user in enumerate(users):
            for num_i, item in enumerate(items):
                
                # If link user -> item exists, cost = item cardinal
                if all(i in user_items[num_u][1] for i in item):
                    cost_matrix[num_u][num_i] = len(item)
                    
                # If not we give it a huge random cost
                else:
                    cost_matrix[num_u][num_i] = 9999
                    
        # Apply Hungarian algorithm
        list_u, list_i = linear_sum_assignment(cost_matrix)
        cost = cost_matrix[list_u, list_i].sum()
        
        # If all users have a fingerprint - end of algorithm
        if len(list_u) == len(users):
            fingerprints = [[users[u], items[i]] for u, i in zip(list_u, list_i)]
            return fingerprints, cost, k
        
    # Return fingerprints list, fp cardinals sum and maximal cardinal 
    return fingerprints, cost, k

In [55]:
start_time = time.time()
fingerprints, cost, k = find_fingerprints_cost_matrix(data100a)
print('Favorite songs from 100 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = find_fingerprints_cost_matrix(data100b)
print('Favorite songs from 100 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = find_fingerprints_cost_matrix(data1000a)
print('Favorite songs from 1000 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = find_fingerprints_cost_matrix(data1000b)
print('Favorite songs from 1000 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

Favorite songs from 100 users (dataset 1).
Sum of fingerprints cardinals: 100.0 Max cardinal: 1
Time: 1.2320871353149414 

Favorite songs from 100 users (dataset 2).
Sum of fingerprints cardinals: 100.0 Max cardinal: 1
Time: 4.574880599975586 

Favorite songs from 1000 users (dataset 1).
Sum of fingerprints cardinals: 1000.0 Max cardinal: 1
Time: 144.6676688194275 



KeyboardInterrupt: 

---
For the following algorithms we will be using and comparing several matching algorithms from NetworkX library.

In [12]:
def assignment_fingerprints_networkx_min_weigth(data):
    """
        Parameters
        ----------
            data: List[Tuple[int, int]]
                List of couples like (user_id, item_id).
                An item could be a song, an artist or something else.

        Returns
        ----------
            fingerprints: List[Tuple[int, List[int]]]
                List of couples like (user_id, fingerprint).
                Fingerprint being a list of items.
            cost: int
                Sum of the lengths of all of the fingerprints.
            k: int
                Length of the biggest fingerprint.
    """
    start_time = time.time()
    
    # Init tables
    data = data[np.lexsort(np.fliplr(data).T)]
    
    users = np.array(np.unique(data[:,0])).astype(np.int)
    items = np.array(np.unique(data[:,1]))
    
    likes = [[int(row[0]), row[1], 1] for row in data.tolist()]
    user_items = [(user, list(list(zip(*item))[1])) for user, item in groupby(likes, itemgetter(0))]
    
    
    # Increment k gradually
    for k in range (1, 3):
        
        # Create bipartite graph
        if k == 1:
            G = Graph()
            G.add_nodes_from(users, bipartite=0)
            G.add_nodes_from(items, bipartite=1)
            G.add_weighted_edges_from(likes)
            
        # Update the graph    
        else:
            undefined_users = [u for u in users if u not in np.array(fingerprints, dtype=int)[:,0]]
            
            try:
                # Create items combinations of size k
                pool = multiprocess.Pool()
                my_combinations = pool.map(combinations, 
                                           ([row[0], row[1], k] for row in user_items 
                                            if len(row[1]) >= k and row[0] in undefined_users))
                new_likes = []

                for i in range(0, len(my_combinations)):
                    for j in range(0, len(my_combinations[i])):
                        new_likes.append([int(my_combinations[i][j][0]), my_combinations[i][j][1], k])

                new_items = np.array(np.unique(np.array(new_likes)[:,1]))

                # Add them to the graph
                G.add_nodes_from(new_items, bipartite=1) 
                G.add_weighted_edges_from(new_likes)

            # To make sure processes are closed in the end
            finally: 
                pool.close()
                pool.join()

        # Init fingerprints            
        fingerprints = []
        
        # Go through connected components of the graph
        for c in connected_components(G):
            matchings = minimum_weight_full_matching(G.subgraph(c))
            for key, value in matchings.items():
                if not isinstance(key, str):
                    fingerprints.append([key,value])
                else:
                    break
                    
        # Calculate cost           
        cost = 0
        for f in fingerprints:
            cost += f[1].count('&') + 1
        
        # If there is a fingerprint for each user - end of algorithm
        if len(fingerprints) == len(users):
            return fingerprints, cost, k
        
    return fingerprints, cost, k

Minimum_weight_full_matching doesn't necessarily return a matching with maximum cardinality, meaning that it might not cover all of our users.

In [13]:
start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_networkx_min_weigth(data100a)
print('Favorite songs from 100 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_networkx_min_weigth(data100b)
print('Favorite songs from 100 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_networkx_min_weigth(data1000a)
print('Favorite songs from 1000 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_networkx_min_weigth(data1000b)
print('Favorite songs from 1000 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

Favorite songs from 100 users (dataset 1).
Sum of fingerprints cardinals: 93 Max cardinal: 2
Time: 1.3153598308563232 

Favorite songs from 100 users (dataset 2).
Sum of fingerprints cardinals: 80 Max cardinal: 2
Time: 1.3928008079528809 

Favorite songs from 1000 users (dataset 1).
Sum of fingerprints cardinals: 1000 Max cardinal: 1
Time: 9.244788885116577 

Favorite songs from 1000 users (dataset 2).
Sum of fingerprints cardinals: 889 Max cardinal: 2
Time: 17.190099000930786 



Here we use the hopcroft_karp_matching function for bipartite graphs, which is supposed to return a maximum cardinality matching.

In [15]:
def assignment_fingerprints_max_match_bip(data):
    """
        Parameters
        ----------
            data: List[Tuple[int, int]]
                List of couples like (user_id, item_id).
                An item could be a song, an artist or something else.

        Returns
        ----------
            fingerprints: List[Tuple[int, List[int]]]
                List of couples like (user_id, fingerprint).
                Fingerprint being a list of items.
            cost: int
                Sum of the lengths of all of the fingerprints.
            k: int
                Length of the biggest fingerprint.
    """
    start_time = time.time()
    
    # Init tables
    data = data[np.lexsort(np.fliplr(data).T)]
    
    users = np.array(np.unique(data[:,0])).astype(np.int)
    items = np.array(np.unique(data[:,1]))
    
    likes = [[int(row[0]), row[1], 1] for row in data.tolist()]
    user_items = [(user, list(list(zip(*item))[1])) for user, item in groupby(likes, itemgetter(0))]
    
    
    # Increment k gradually
    for k in range (1, 3):

        # Create bipartite graph
        if k == 1:
            G = Graph()
            G.add_nodes_from(users, bipartite=0)
            G.add_nodes_from(items, bipartite=1)
            G.add_weighted_edges_from(likes)
            
        # Update the graph    
        else:
            undefined_users = [u for u in users if u not in np.array(fingerprints, dtype=int)[:,0]]
            
            try:
                # Create items combinations of size k
                pool = multiprocess.Pool()
                my_combinations = pool.map(combinations, 
                                           ([row[0], row[1], k] for row in user_items 
                                            if len(row[1]) >= k and row[0] in undefined_users))
                new_likes = []

                for i in range(0, len(my_combinations)):
                    for j in range(0, len(my_combinations[i])):
                        new_likes.append([int(my_combinations[i][j][0]), my_combinations[i][j][1], k])

                new_items = np.array(np.unique(np.array(new_likes)[:,1]))

                # Add them to the graph
                G.add_nodes_from(new_items, bipartite=1) 
                G.add_weighted_edges_from(new_likes)

            # To make sure processes are closed in the end
            finally: 
                pool.close()
                pool.join()

        # Init fingerprints            
        fingerprints = []

        # Go through connected components of the graph
        for c in connected_components(G):
            matchings = hopcroft_karp_matching(G.subgraph(c))
            for key, value in matchings.items():
                if not isinstance(key, str):
                    fingerprints.append([key,value])
                else:
                    break
                
        # Calculate cost           
        cost = 0
        for f in fingerprints:
            cost += f[1].count('&') + 1
        
        # If there is a fingerprint for each user - end of algorithm
        if len(fingerprints) == len(users):
            return fingerprints, cost, k
        
    return fingerprints, cost, k

In [16]:
start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_max_match_bip(data100a)
print('Favorite songs from 100 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_max_match_bip(data100b)
print('Favorite songs from 100 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_max_match_bip(data1000a)
print('Favorite songs from 1000 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_max_match_bip(data1000b)
print('Favorite songs from 1000 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

Favorite songs from 100 users (dataset 1).
Sum of fingerprints cardinals: 91 Max cardinal: 2
Time: 0.29979395866394043 

Favorite songs from 100 users (dataset 2).
Sum of fingerprints cardinals: 77 Max cardinal: 2
Time: 0.30950307846069336 

Favorite songs from 1000 users (dataset 1).
Sum of fingerprints cardinals: 1000 Max cardinal: 1
Time: 0.9598491191864014 

Favorite songs from 1000 users (dataset 2).
Sum of fingerprints cardinals: 883 Max cardinal: 2
Time: 4.321008920669556 



As we can see, hopcroft_karp_matching doesn't cover all users either.

---
We will now use the maximal_matching function for usual graphs.

In [53]:
def assignment_fingerprints_max_match(data):
    """
        Parameters
        ----------
            data: List[Tuple[int, int]]
                List of couples like (user_id, item_id).
                An item could be a song, an artist or something else.

        Returns
        ----------
            fingerprints: List[Tuple[int, List[int]]]
                List of couples like (user_id, fingerprint).
                Fingerprint being a list of items.
            cost: int
                Sum of the lengths of all of the fingerprints.
            k: int
                Length of the biggest fingerprint.
    """
    
    # Initialisation des tables
    data = data[np.lexsort(np.fliplr(data).T)]
    
    users = np.array(np.unique(data[:,0])).astype(np.int)
    items = np.array(np.unique(data[:,1]))
    
    likes = [[int(row[0]), row[1], 1] for row in data.tolist()]
    user_items = [(user, list(list(zip(*item))[1])) for user, item in groupby(likes, itemgetter(0))]
    
    
    # Pour k allant de 1 à n?
    for k in range (1, 2):
        
        # Création des ensembles des items de taille allant de 1 à k
        if k == 1:
            G = Graph()
            # Add nodes with the node attribute "bipartite"
            G.add_nodes_from(users, bipartite=0)
            G.add_nodes_from(items, bipartite=1)
            # Add edges only between nodes of opposite node sets
            G.add_weighted_edges_from(likes)
        else:
            # Si on a un fingerprint pour chaque user - fin de la fonction
            if len(fingerprints) == len(users):
                return fingerprints
            
            # Sinon - générer les combinaisons qui existent parmi les ensebles d'items des users
            else:
                undefined_users = [u for u in users if u not in np.array(np.array(fingerprints, dtype=str)[:,0], dtype=int)]
                print(undefined_users)
                try:
                    # Create items combinations of size k
                    pool = multiprocess.Pool()
                    my_combinations = pool.map(combinations, 
                                               ([row[0], row[1], k] for row in user_items 
                                                if len(row[1]) >= k and row[0] in undefined_users))
                    new_likes = []

                    
                    for i in range(0, len(my_combinations)):
                        for j in range(0, len(my_combinations[i])):
                            new_likes.append([int(my_combinations[i][j][0]), my_combinations[i][j][1], k])
                            
                    new_items = np.array(np.unique(np.array(new_likes)[:,1]))
                    
                    G.add_nodes_from(new_items, bipartite=1) 
                    G.add_weighted_edges_from(new_likes)
                    
                # To make sure processes are closed in the end, even if errors happen
                finally: 
                    pool.close()
                    pool.join()

        # Init fingerprints             
        fingerprints = []
        
        # Go through connected components of the graph
        for c in connected_components(G):
            matchings = maximal_matching(G.subgraph(c))
            for m in matchings: 
                if isinstance(m[1], str):
                    fingerprints.append([m[0], m[1]])
                else:
                    fingerprints.append([m[1], m[0]])
                    
        # Calculate cost           
        cost = 0
        for f in fingerprints:
            cost += f[1].count('&') + 1
        
        print('Sum of fingerprints cardinals:', cost, 'k:', k)
        
        # If there is a fingerprint for each user - end of algorithm
        if len(fingerprints) == len(users):
            return fingerprints, cost, k
        
    return fingerprints, cost, k

In [51]:
start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_max_match(data100a)
print('Favorite songs from 100 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_max_match(data100b)
print('Favorite songs from 100 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_max_match(data1000a)
print('Favorite songs from 1000 users (dataset 1).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_max_match(data1000b)
print('Favorite songs from 1000 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

Sum of fingerprints cardinals: 100 k: 1
Favorite songs from 100 users (dataset 1).
Sum of fingerprints cardinals: 100 Max cardinal: 1
Time: 0.4384908676147461 

Sum of fingerprints cardinals: 100 k: 1
Favorite songs from 100 users (dataset 2).
Sum of fingerprints cardinals: 100 Max cardinal: 1
Time: 0.41098594665527344 

Sum of fingerprints cardinals: 1000 k: 1
Favorite songs from 1000 users (dataset 1).
Sum of fingerprints cardinals: 1000 Max cardinal: 1
Time: 2.1697616577148438 

Sum of fingerprints cardinals: 997 k: 1
[11444, 7715, 7875]
Sum of fingerprints cardinals: 999 k: 2
[11444, 7875]


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [54]:
start_time = time.time()
fingerprints, cost, k = assignment_fingerprints_max_match(data1000b)
print('Favorite songs from 1000 users (dataset 2).')
print('Sum of fingerprints cardinals:', cost, 'Max cardinal:', k)
print('Time:', time.time() - start_time, '\n')

for f in fingerprints:
    print(f)

Sum of fingerprints cardinals: 997 k: 1
Favorite songs from 1000 users (dataset 2).
Sum of fingerprints cardinals: 997 Max cardinal: 1
Time: 1.407940149307251 

[1064, '102360936']
[9733, '624823']
[5239, '13788854']
[8221, '100723036']
[1755, '117241658']
[2885, '78662915']
[6246, '1067731']
[3211, '100816236']
[10641, '108546612']
[2570, '3287768']
[9808, '80648074']
[10002, '1012662']
[8181, '100321020']
[10825, '1022290']
[5400, '1068038']
[7680, '124262674']
[9761, '145526404']
[570, '13040252']
[10375, '108957630']
[3355, '1004481552']
[856, '67234288']
[10667, '97312678']
[3402, '111780412']
[93, '101057358']
[3279, '1177313']
[3451, '1041589']
[8441, '100328650']
[6436, '10107284']
[8926, '125120736']
[2519, '79587582']
[8191, '1003381682']
[4094, '106095118']
[4912, '100816096']
[7242, '108596166']
[7478, '129324294']
[10813, '1024973']
[8884, '1016302']
[3551, '1000415022']
[8477, '104096154']
[3161, '10240178']
[11633, '10023508']
[4082, '100067510']
[4735, '104936008']
[188

This method seems to indeed return the maximal matching in terms of covering a maximum number of users.
The last example confronts us with a problem: the user 11444 has only one liked song - 63017512. Thus at k=1 our function has attributed song id 63017512 to user 10517, even though it is not his unique favorite. As it will cost us too much to calculate all the combinations of all users' favourites, and we cannot in this case calculate combinations of user 11444 as he has only one song, we should find another solution.






-----------------------------------------








In [296]:
""" En cours de modifications. """

# Ensembles de k items
def find_fingerprints_with_k(data):
    
    # Initialisation des tables
    super_likes = np.array(data)
    super_likes = super_likes.astype(np.int)
    super_users = np.array(np.unique(super_likes[:,0]))
    super_items = np.array(np.unique(super_likes[:,1]))
        
    # Initialisation de la liste des fingerprints
    fingerprints = []
    
    # Faire une copie des tables
    likes = copy.deepcopy(super_likes)
    users = copy.deepcopy(super_users)
    items = copy.deepcopy(super_items)
    users_weights = np.zeros(np.size(users))
    items_weights = np.zeros(np.size(items))
    
    # Pour k allant de 1 à n?
    for k in range (1, 2):
    
        # Calcul des poids des users
        for l in likes:
            users_weights[np.where(users==l[0])] += 1
            
        
        # Calcul des poids des items
        for l in likes:
            items_weights[np.where(items==l[1])] += 1
    
        # Pour chaque user
        for num_u, u in enumerate(users):
            
            # Si le poids du user = 1 son fingerprint est l'unique item qui lui est propre
            if users_weights[num_u] == 1:
                F = [u, likes[np.where(likes[:,0] == u)[0][0]][1]]
                fingerprints.append(F)
              
        # Pour chaque item
        for num_i, i in enumerate(items):
            
            # Si le poids de l'item = 1 c'est le fingerprint de l'unique user qui lui est propre
            if items_weights[num_i] == 1:
                user = likes[np.where(likes[:,1] == i)[0][0]][0]
                if user not in np.array(fingerprints)[:,0]:
                    F = [user, i]
                    fingerprints.append(F)
                
        # Réinitialisation des tables
        # Supprimer tous les users dont on a trouvé le fingerprint
        users = [u for u in users if u not in np.array(fingerprints)[:,0]]
        # Re
        items = itertools.combinations(items, k+1)

In [None]:
find_fingerprints_with_k(data100)