In [1]:
import numpy as np
import itertools
import pandas

***1) Read the initial dictionary: out.json***

In [2]:
import json

with open('out.json') as json_data:
    data_dict = json.load(json_data)
    print(data_dict)

{'Jacob Gomez': ['jazz', 'gabber', 'blues', 'trance', 'disco'], 'Jackson Smith': ['jazz', 'eurodance', 'country', 'trance'], 'Megan Grimes': ['rock', 'trance'], 'Michael Wilson': ['jazz', 'hiphop', 'pop', 'rock', 'disco'], 'Mrs. Stephanie Shaw': ['hiphop', 'country', 'disco', 'gabber'], 'Kathryn Thornton': ['hiphop', 'rock', 'classical'], 'David Griffin': ['pop', 'metal', 'eurodance', 'jazz'], 'Teresa Jacobs DVM': ['eurodance', 'classical', 'hiphop', 'pop', 'disco'], 'Ashley Wilson': ['pop', 'metal', 'disco'], 'Cole Bryan': ['jazz', 'gabber', 'metal', 'trance', 'rock'], 'Nathan Ibarra': ['eurodance', 'country', 'metal', 'rock'], 'Kyle Velez': ['jazz', 'eurodance', 'metal', 'classical', 'pop'], 'Megan Rodriguez': ['jazz', 'metal', 'gabber'], 'Cynthia Barrera': ['metal', 'rock', 'disco', 'gabber'], 'Michael Thomas': ['gabber', 'trance', 'hiphop', 'pop', 'rock'], 'Vanessa Haley': ['eurodance', 'country', 'hiphop', 'disco'], 'Julie Alvarado': ['jazz', 'gabber', 'classical', 'trance', 'pop'

***2) Generate a CSV file based on the data_dict dictionary***

In [3]:
import csv

def make_trace_csv(data_dict, file_path):
    """
    :param data_dict: nested dictionnary with user_id as key;
                         and the values are the songs listened by the user_id
    :param file_path: saving path 
    """
    with open(file_path, newline="", mode="w") as csvfile:
         csv_writer = csv.writer(csvfile,
                        delimiter=',',
                        quotechar='|',
                        quoting=csv.QUOTE_MINIMAL)
         
         
         row = ["user_id", "song_id"]
         
         csv_writer.writerow(row)

         for key_ in data_dict:
                vect = data_dict[key_]
                for i in range(len(vect)):
                  row = []
                  row.append(key_)
                  row.append(vect[i])
                  csv_writer.writerow(row)

In [4]:
make_trace_csv(data_dict, "trace.csv")

In [6]:
Song_df = pandas.read_csv("trace.csv")
Song_df.head()

Unnamed: 0,user_id,song_id
0,Jacob Gomez,jazz
1,Jacob Gomez,gabber
2,Jacob Gomez,blues
3,Jacob Gomez,trance
4,Jacob Gomez,disco


***3) Filter items which were clicked by few users***

In [8]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

def filter_triplets(tp, min_uc=0, min_sc=10):
    # Only keep the triplets for songs which were clicked on by at least min_sc users. 
    if min_sc > 10:
        itemcount = get_count(tp, 'song_id')
        tp = tp[tp['user_id'].isin(itemcount.index[itemcount >= min_sc])]
    
    # Only keep the triplets for users who clicked on at least min_uc songs
    if min_uc > 0:
        usercount = get_count(tp, 'user_id')
        tp = tp[tp['user_id'].isin(usercount.index[usercount >= min_uc])]
    
    # Update both usercount and itemcount after filtering
    usercount, itemcount = get_count(tp, 'user_id'), get_count(tp, 'song_id') 

    return tp, usercount, itemcount

raw_data, user_activity, item_popularity = filter_triplets(Song_df)

In [9]:
item_popularity.tail()

Unnamed: 0,song_id,size
8,metal,13539
9,pop,15722
10,reggae,9280
11,rock,16696
12,trance,8728


In [10]:
user_activity.head()

Unnamed: 0,user_id,size
0,Aaron Abbott,4
1,Aaron Alexander,3
2,Aaron Allen,4
3,Aaron Alvarez,5
4,Aaron Anderson,3


In [11]:
raw_data.tail()

Unnamed: 0,user_id,song_id
155987,Kristen Paul,jazz
155988,Kristen Paul,blues
155989,Kristen Paul,classical
155990,Kristen Paul,trance
155991,Kristen Paul,rock


***4) Reindex user_id and song_id***




In [12]:
movie_id_dict = {u_id: i for i, u_id in enumerate(np.unique(raw_data["song_id"]))}

raw_data["song_id"] = raw_data["song_id"].apply(lambda x: movie_id_dict[x])

n_items = len(np.unique(raw_data["song_id"]))
n_users = len(np.unique(raw_data["user_id"]))

In [13]:
raw_data.head()

Unnamed: 0,user_id,song_id
0,Jacob Gomez,7
1,Jacob Gomez,5
2,Jacob Gomez,0
3,Jacob Gomez,12
4,Jacob Gomez,3


**5) Splitting data into train_data and test_data to start training the song recommender system ***

In [14]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(raw_data, test_size = 0.20, random_state=0)

In [15]:
test_data.head()

Unnamed: 0,user_id,song_id
148755,Luke Henson,6
117247,Paul Padilla,9
64125,Shawn Little,12
10835,Jeremy Mendoza,2
2124,Jordan Lewis,8


***6) The class user_similarity_recommender_py developed below mesure the similarity between users to recommend to each song his potential listeners: collaborative based system predict what a particular user like based on what other similar users like. User-user filtering approach involves defining a sparce matrix based on a user who liked a special song. I'm seeking to answer a question, for each user, what a song listened by a user, will also be listened by another set of users. To further simplify this, based on what are the users who liked this song in the past, what other similar users will like it in the futur.***

In [20]:
class user_similarity_recommender_py():
    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.song_id = None
        self.sparce_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.user_similarity_recommendations = None
        
    #Get unique items (songs) corresponding to a given user
    def get_user_items(self, user):
        user_data = self.train_data[self.train_data[self.user_id] == user]
        user_items = list(user_data[self.song_id].unique())
        
        return user_items
        
    #Get unique users for a given item (song)
    def get_item_users(self, item):
        item_data = self.train_data[self.train_data[self.song_id] == item]
        item_users = np.array(item_data[self.user_id].unique())
            
        return item_users
        
    #Get unique users in the training data
    def get_all_users_train_data(self):
        all_users = list(self.train_data[self.user_id].unique())
            
        return all_users
        
    #Construct sparce matrix
    def construct_sparce_matrix(self, song_users, all_users):
            
        ####################################
        #Get songs for all users in song_users.
        ####################################
        song_users_songs = []        
        for i in range(0, len(song_users)):
            song_users_songs.append(self.get_user_items(song_users[i]))
            
        ###############################################
        #Initialize the user sparce matrix of size 
        #len(songs_users) X len(song)
        ###############################################
        sparce_matrix = np.matrix(np.zeros(shape=(len(song_users), len(all_users))), float)
           
        #############################################################
        #Calculate similarity between song_users and all unique users
        #in the training data
        #############################################################
        for i in range(0,len(all_users)):
            #Calculate unique songs  of user i
            users_i_data = self.train_data[self.train_data[self.user_id] == all_users[i]]
            songs_i = set(users_i_data[self.song_id].unique())
            
            for j in range(0,len(song_users)):       
                    
                #Get unique songs of listener j
                songs_j = song_users_songs[j]
                    
                #Calculate intersection of songs of listeners i and j 
                items_intersection = songs_i.intersection(songs_j)
                
                #Calculate cooccurence_matrix[i,j] as Jaccard Index
                if len(items_intersection) != 0:
                    #Calculate union of songs of listeners i and j
                    songs_union = songs_i.union(songs_j)
                    
                    sparce_matrix[j,i] = float(len(items_intersection))/float(len(songs_union))
                else:
                    sparce_matrix[j,i] = 0
                    
        
        return sparce_matrix

    
    #Use the sparce matrix to make the 100 top recommendations
    def generate_top_recommendations(self, song, sparce_matrix, all_users, song_users):
        print("Non zero values in sparce_matrix :%d" % np.count_nonzero(sparce_matrix))
        
        #Calculate a weighted average of the scores in sparce matrix for all song users.
        item_sim_scores = sparce_matrix.sum(axis=0)/float(sparce_matrix.shape[0])
        item_sim_scores = np.array(item_sim_scores)[0].tolist()
 
        #Sort the indices of user_sim_scores based upon their value
        #Also maintain the corresponding score
        sort_index = sorted(((e,i) for i,e in enumerate(list(item_sim_scores))), reverse=True)
    
        #Create a dataframe from the following
        columns = ['song', 'user', 'score', 'rank']
        #index = np.arange(1) # array of numbers for the number of samples
        df = pandas.DataFrame(columns=columns)
         
        #Fill the dataframe with top 100 user based recommendations
        rank = 1 
        for i in range(0,len(sort_index)):
            if ~np.isnan(sort_index[i][0]) and all_users[sort_index[i][1]] not in song_users and rank <= 100:
                df.loc[len(df)]=[song,all_users[sort_index[i][1]],sort_index[i][0],rank]
                rank = rank+1
        
        #Handle the case where there are no recommendations
        if df.shape[0] == 0:
            print("The current user has no songs for training the user similarity based recommendation model.")
            return -1
        else:
            return df
 
    #Create the user similarity based recommender system model
    def create(self, train_data, user_id, song_id):
        self.train_data = train_data
        self.user_id = user_id
        self.song_id = song_id

    #Use the user similarity based recommender system model to
    #make recommendations
    def recommend(self, song):
        
        ########################################
        #A. Get all unique users for this song
        ########################################
        song_users = self.get_item_users(song)    
            
        print("No. of unique users for the song: %d" % len(song_users))
        
        ######################################################
        #B. Get all unique users (users) in the training data
        ######################################################
        all_users = self.get_all_users_train_data()
        
        print("no. of unique users in the training set: %d" % len(all_users))
         
        ###############################################
        #C. Construct item sparce matrix of size 
        #len(song_users) X len(song)
        ###############################################
        sparce_matrix = self.construct_sparce_matrix(song_users, all_users)
        
        #######################################################
        #D. Use the sparce matrix to make recommendations
        #######################################################
        df_recommendations = self.generate_top_recommendations(song, sparce_matrix, all_users, song_users)
                
        return df_recommendations
    
    #Get similar users to given users
    def get_similar_users(self, user_list):
        
        song_users = user_list
        
        ######################################################
        #B. Get all unique users in the training data
        ######################################################
        all_users = self.get_all_users_train_data()
        
        print("no. of unique songs in the training set: %d" % len(all_users))
         
        ###############################################
        #C. Construct user sparce matrix of size 
        #len(song_users) X len(all_users)
        ###############################################
        sparce_matrix = self.construct_sparce_matrix(song_users, all_users)
        
        #######################################################
        #D. Use the sparce matrix to make recommendations
        #######################################################
        item = ""
        df_recommendations = self.generate_top_recommendations(item, sparce_matrix, all_users, song_users)
         
        return df_recommendations

In [21]:
is_model = user_similarity_recommender_py()
is_model.create(train_data, 'user_id', 'song_id')

In [22]:
songs = test_data['song_id'].unique()

***7) The generate_top_recommendations function calculate a weighted average of the scores in sparce matrix for all song users. Using my model, I will be able to predict the list of listeners of a song.***

In [23]:
print("the songs for the user in training data")
item_id = songs[5]
item_users = is_model.get_item_users(item_id)
#
print("------------------------------------------------------------------------------------")
print("Training data songs for the song itemid: %s:" % item_id)
print("------------------------------------------------------------------------------------")

for item_user in item_users:
    print(item_user)

print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

#Recommend songs for the user using personalized model
is_model.recommend(item_id)

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Danielle Hines
Justin Rojas
Douglas Snyder
Angela Dickerson
Barry Taylor
Madeline Rodriguez
Karen Pacheco
Jesus Hoffman
Kathleen Kennedy
Joel Keith
Angela Guzman
Tammy Burns
Carol Holmes
Rebecca Burns
Alan Stone
Charles Peterson
Stephanie Osborn
Elizabeth Jensen
Jimmy Flores
Adam Brandt
Connie Neal
Maureen Gordon
Christina Silva
Duane Wong
Dr. Matthew Baker
Krystal Kane
Alisha Johnson MD
Sara Gardner
Jason Perez
Jennifer Horn
Jennifer Robbins
Ariana Cooper
Charles Barton
Jason Powell
David Mcdonald
Peter Graham
Michele Howard
Franklin Villa
Nicole Thompson
Margaret Clark
Cheryl Vaughn
Amy Quinn
James Rodriguez MD
Taylor Bryant
Colleen Matthews
Angelica Sparks
Cody Burnett
Stephen Mcfarland
Jennifer Nichols
Jasmine Butler
Jerry Ortiz
Ruben Matthews
Angela Ware
Dr. Jason Crawford
Sophia Castro
Kendra Green
Sabrina Mcintyre
Amy Murray
John Simpson
James Perkins
Molly Webb
Gabriel Ellis
Christine Gr

Unnamed: 0,song,user,score,rank
0,7,Karen Bray,0.212436,1
1,7,Angela Huerta,0.212436,2
2,7,Anthony Wade,0.212436,3
3,7,Thomas Parks,0.212436,4
4,7,Doris Weaver,0.212436,5
...,...,...,...,...
95,7,Patricia Summers,0.198129,96
96,7,Larry Thompson,0.198129,97
97,7,Ann Williamson,0.198129,98
98,7,Patrick Costa,0.198129,99


***7) I can also use my user similarity based collaborative filtering model to find similar users to any users in the dataset***

In [24]:
is_model.get_similar_users(["Jacob Gomez"])

no. of unique songs in the training set: 40020
Non zero values in sparce_matrix :27124


Unnamed: 0,song,user,score,rank
0,,Christopher Sutton,1.00,1
1,,Joseph Price,1.00,2
2,,Nicole Delgado,1.00,3
3,,Daniel Harris,1.00,4
4,,Anthony Stone,1.00,5
...,...,...,...,...
95,,Renee Chase,0.75,96
96,,Ashley Osborn,0.75,97
97,,Rebecca Frederick,0.75,98
98,,William Mitchell,0.75,99
