# Collaborative Based Filtering

In [1]:
## importing all necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


In [2]:
##read text file.
data = pd.read_table('kaggle_visible_evaluation_triplets.txt', header = None)
## reanming column names.
data.columns = ['user_id', 'song_id', 'freq'] 

## reading text file
data_2 = pd.read_table('unique_tracks.txt', sep = '<SEP>',header = None)
## changing column names.
data_2.columns = ['track_id', 'song_id', 'artist_name','release'] 

## reading text file.
data_3 = pd.read_csv('taste_profile_song_to_tracks.txt',sep = '<\t>',header = None)
## renaming column names.
data_3.columns = ['song_to_tracks'] 
data_3['Songs'],data_3['tracks'] = data_3['song_to_tracks'].str.split('\t',1).str
del data_3['song_to_tracks']

## reading text file.
data_4 = pd.read_table('kaggle_songs.txt',header = None)
## renaming column name
data_4.columns = ['songs'] 
data_4['Songs'],data_4['id'] = data_4['songs'].str.split(' ',1).str
del data_4['songs']

## reading text file.
data_5 = pd.read_table('kaggle_users.txt', sep = '<SEP>',header = None)
## renaming column names.
data_5.columns = ['user_id'] 


data_6 = pd.merge(data,data_5,on = 'user_id')
### merging two dataframes(data_3 and data_4)

data_7 = pd.merge(data_3,data_4,on = 'Songs')

dataframe = pd.merge(data_6,data_2,how ='left',on ='song_id')



In [3]:
dataframe['song'] = dataframe['release'].map(str) + " - " + dataframe['artist_name']


In [4]:
dataframe.head()



Unnamed: 0,user_id,song_id,freq,track_id,artist_name,release,song
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1,TRAEHHJ12903CF492F,Dwight Yoakam,You're The One,You're The One - Dwight Yoakam
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1,TRLGMFJ128F4217DBE,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,Horn Concerto No. 4 in E flat K495: II. Romanc...
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1,TRTNDNE128F1486812,Cartola,Tive Sim,Tive Sim - Cartola
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1,TRASTUE128F930D488,Lonnie Gordon,Catch You Baby (Steve Pitron & Max Sanna Radio...,Catch You Baby (Steve Pitron & Max Sanna Radio...
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1,TRFPLWO128F1486B9E,Miguel Calo,El Cuatrero,El Cuatrero - Miguel Calo


## Collabarative Filtering Model

**Item Similarity Based Recommendation Engine**
To provide a more personal recommendation to the user we need apply a recommendation engine that consider some kid of similarities between users and theirs items. In other words, it is a recommendation engine based on calculating similarities between a user's items and the other items in our dataset. Usually to define similarity among a set of items, we need a feature set on the basis of which both items can be described. In our case it will mean features of the songs on the basis of which one song can be differentiated from another.

Since our dataset don't have this data, we can use the the Jaccard index to do an implicit similarity, based on common users, in terms of the users who listen to these songs. The Jaccard coefficient measures similarity between finite sample sets, and is defined as the size of the intersection divided by the size of the union of the sample sets:imageThe basic idea remains that if two songs are being listened to by a large fraction of common users out of the total listeners, the two songs can be said to be similar to each other.

So, we need calculate the similarity of each song in the user's list to those in our dataset, using the similarity metric defined previously, but we need attention to it can become a computation-intensive step when we have a large number of songs. To make the computation more feasible,probably you use a cluster or, in our case, we limited ours items to the most popular 5,000 songs so it is quite unlikely that we would miss out on any important recommendations.

Source : Kaggle

In [5]:
## First we will look for item based collabarative modeling
"""Based on what you like in the past,what other similar song that you will 
like based on what other similar user have liked."""
#Class for Item similarity based Recommender System model
class item_similarity_recommender_py():
    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.cooccurence_matrix = None
        self.songs_dict = None
        self.rev_songs_dict = None
        self.item_similarity_recommendations = None
        
    #Get unique items (songs) corresponding to a given user
    def get_user_items(self, user):
        user_data = self.train_data[self.train_data[self.user_id] == user]
        user_items = list(user_data[self.item_id].unique())
        
        return user_items
        
    #Get unique users for a given item (song)
    def get_item_users(self, item):
        item_data = self.train_data[self.train_data[self.item_id] == item]
        item_users = set(item_data[self.user_id].unique())
            
        return item_users
        
    #Get unique items (songs) in the training data
    def get_all_items_train_data(self):
        all_items = list(self.train_data[self.item_id].unique())
            
        return all_items
        
    #Construct cooccurence matrix
    def construct_cooccurence_matrix(self, user_songs, all_songs):
            
        ####################################
        #Get users for all songs in user_songs.
        ####################################
        user_songs_users = []        
        for i in range(0, len(user_songs)):
            user_songs_users.append(self.get_item_users(user_songs[i]))
            
        ###############################################
        #Initialize the item cooccurence matrix of size 
        #len(user_songs) X len(songs)
        ###############################################
        cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)
           
        #############################################################
        #Calculate similarity between user songs and all unique songs
        #in the training data
        #############################################################
        for i in range(0,len(all_songs)):
            #Calculate unique listeners (users) of song (item) i
            songs_i_data = self.train_data[self.train_data[self.item_id] == all_songs[i]]
            users_i = set(songs_i_data[self.user_id].unique())
            
            for j in range(0,len(user_songs)):       
                    
                #Get unique listeners (users) of song (item) j
                users_j = user_songs_users[j]
                    
                #Calculate intersection of listeners of songs i and j
                users_intersection = users_i.intersection(users_j)
                
                #Calculate cooccurence_matrix[i,j] as Jaccard Index
                if len(users_intersection) != 0:
                    #Calculate union of listeners of songs i and j
                    users_union = users_i.union(users_j)
                    
                    cooccurence_matrix[j,i] = float(len(users_intersection))/float(len(users_union))
                else:
                    cooccurence_matrix[j,i] = 0
                    
        
        return cooccurence_matrix

    
    #Use the cooccurence matrix to make top recommendations
    def generate_top_recommendations(self, user, cooccurence_matrix, all_songs, user_songs):
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))
        
        #Calculate a weighted average of the scores in cooccurence matrix for all user songs.
        user_sim_scores = cooccurence_matrix.sum(axis=0)/float(cooccurence_matrix.shape[0])
        user_sim_scores = np.array(user_sim_scores)[0].tolist()
 
        #Sort the indices of user_sim_scores based upon their value
        #Also maintain the corresponding score
        sort_index = sorted(((e,i) for i,e in enumerate(list(user_sim_scores))), reverse=True)
    
        #Create a dataframe from the following
        columns = ['user_id', 'song', 'score', 'rank']
        #index = np.arange(1) # array of numbers for the number of samples
        df = pd.DataFrame(columns=columns)
         
        #Fill the dataframe with top 10 item based recommendations
        rank = 1 
        for i in range(0,len(sort_index)):
            if ~np.isnan(sort_index[i][0]) and all_songs[sort_index[i][1]] not in user_songs and rank <= 10:
                df.loc[len(df)]=[user,all_songs[sort_index[i][1]],sort_index[i][0],rank]
                rank = rank+1
        
        #Handle the case where there are no recommendations
        if df.shape[0] == 0:
            print("The current user has no songs for training the item similarity based recommendation model.")
            return -1
        else:
            return df
 
    #Create the item similarity based recommender system model
    def create(self, train_data, user_id, item_id):
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    #Use the item similarity based recommender system model to
    #make recommendations
    def recommend(self, user):
        
        ########################################
        #A. Get all unique songs for this user
        ########################################
        user_songs = self.get_user_items(user)    
            
        print("No. of unique songs for the user: %d" % len(user_songs))
        
        ######################################################
        #B. Get all unique items (songs) in the training data
        ######################################################
        all_songs = self.get_all_items_train_data()
        
        print("no. of unique songs in the training set: %d" % len(all_songs))
         
        ###############################################
        #C. Construct item cooccurence matrix of size 
        #len(user_songs) X len(songs)
        ###############################################
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)
        
        #######################################################
        #D. Use the cooccurence matrix to make recommendations
        #######################################################
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)
                
        return df_recommendations
    
    #Get similar items to given items
    def get_similar_items(self, item_list):
        
        user_songs = item_list
        
        ######################################################
        #B. Get all unique items (songs) in the training data
        ######################################################
        all_songs = self.get_all_items_train_data()
        
        print("no. of unique songs in the training set: %d" % len(all_songs))
         
        ###############################################
        #C. Construct item cooccurence matrix of size 
        #len(user_songs) X len(songs)
        ###############################################
        cooccurence_matrix = self.construct_cooccurence_matrix(user_songs, all_songs)
        
        #######################################################
        #D. Use the cooccurence matrix to make recommendations
        #######################################################
        user = ""
        df_recommendations = self.generate_top_recommendations(user, cooccurence_matrix, all_songs, user_songs)
         
        return df_recommendations

      
    

# Sample testing on how collaborative work

Step by step recommendation : <br>
    1) Getting the information about our user. (Like in our case we get the songs he/she listened)<br>
    2) Storing the user_ids of all those people who also listened to the songs listened by our user.<br>
    3) Creating our cooccurence_matrix having dimension as (user_song, all_song) and calculating          the Jaccard index (formula given below).<br>
   4) Sorting the index and adding in dataframe.<br>
    5) Top 10 highest score are recommended.

In [6]:
# #Step by step preparation for the recommendation
# #Calling our class
# is_model = item_similarity_recommender_py()
# is_model.create(dataframe, 'user_id', 'song')

In [7]:
#Storing every unique user_id
users = dataframe['user_id'].unique()

In [8]:
#STEP-1
#storing any random user id in user for simple test
#Lets recommend to this user using item-item similarity
user = users[5]
user

'91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62'

In [9]:
#storing the data of the following user in user_Data
user_data = dataframe[dataframe['user_id'] == user]
user_data.head()

Unnamed: 0,user_id,song_id,freq,track_id,artist_name,release,song
53,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,SOLLNTU12A6701CFDC,1,TRHTCXG12903CC2F60,3 Doors Down,Kryptonite,Kryptonite - 3 Doors Down
54,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,SOBXMJC12AB018293E,1,TRYJGXE128F934D097,Thievery Corporation,Un Simple Histoire,Un Simple Histoire - Thievery Corporation
55,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,SODBXDO12A6D4FCD4F,14,TRYLBYO128F422BC6E,Dario Marianelli,Love Letters,Love Letters - Dario Marianelli
56,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,SOVIJFB12A6D4F98E2,3,TRYFGHH128F148D301,Secret Garden,Passacaglia,Passacaglia - Secret Garden
57,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,SOYPQYA12A6D4FB8F4,4,TRYQCNK128F4215383,Secret Garden,Pastorale,Pastorale - Secret Garden


In [11]:
#storing the songs listened by that user
user_songs = list(user_data['song'].unique())
user_songs

['Kryptonite - 3 Doors Down',
 'Un Simple Histoire - Thievery Corporation',
 'Love Letters - Dario Marianelli',
 'Passacaglia - Secret Garden',
 'Pastorale - Secret Garden',
 'Until The Morning - Thievery Corporation',
 'The Richest Man In Babylon - Thievery Corporation',
 'In The Waiting Line - Zero 7',
 'Illumination - Secret Garden',
 'Big Yellow Taxi - Counting Crows / Vanessa Carlton']

In [12]:
#Here what I did is using really small sample because matrix creation takes a very large time.
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(dataframe, train_size = 0.005, random_state=0)
train_data.head(5)

Unnamed: 0,user_id,song_id,freq,track_id,artist_name,release,song
974539,d17396a872ede1c2ac879cba1dbf193e705a0bc6,SONUXNQ12A8C13DF0D,1,TRYIEJY128F42A1B89,Three Drives,Greece 2000,Greece 2000 - Three Drives
79089,f40218fc78555380ae180fc16d9e4317424d5396,SOGWSEW12A8C1344E0,1,TRNEGYK128F42720D5,Deerhunter,Never Stops,Never Stops - Deerhunter
892746,5a9688b82e6c3472ac61bf43a368a54d020cbd77,SONZKXN12A58A7D454,5,TRUFXCE12903CA3882,Jessica Lea Mayfield,I Can't Lie to You_ Love,I Can't Lie to You_ Love - Jessica Lea Mayfield
588696,ab953babd652e5081ba3cc5daabc2311f73ffadb,SOUMSSX12AB0182F29,1,TRNGROG128F92E9C20,Sin Bandera,A Ti,A Ti - Sin Bandera
1189015,b12afc09cbb25a6d5fe1a53198ca750d147a569d,SOXWJMH12A8151CA83,1,TRUVZLX128F424EF58,Shaggy,Woman A Pressure Me,Woman A Pressure Me - Shaggy


In [13]:
train_data['song'].nunique()

5751

In [14]:
# This function will store all the data of user who litened to the songs,
#listened by our particular user.
"""For eg. if you listened to song 'A' and I also listened to song'A' then it will 
store my user_id."""
 
def get_item_users(item):
    item_data = train_data[train_data['song'] == item]
    item_users = set(item_data['user_id'].unique())
            
    return item_users

In [15]:
#Using loop to first get song id and then storing user id to user_songs_users
user_songs_users = []        
for i in range(0, len(user_songs)):
    user_songs_users.append(get_item_users(user_songs[i]))

In [16]:
user_songs_users #This list contains the id's of all the user who listened to the song 
#listened by our user, we can see that it contains no id because our data is small

[{'22819926ec2d9a7906689d75b1c426d77ed5a731',
  '2abcf9f1e6203d669613f5b1d6d956e1cb06f8a6',
  '5896625c103e55f39c62cf577c3eea00c759d505',
  'fe3f3e74718f95db37f0a4a4db45d503686ca4c4'},
 set(),
 set(),
 set(),
 set(),
 {'5393c182c520caea8b876e5348d1c1fff639fb58'},
 set(),
 {'5ddd3977b8ce3587733fe704aef169a486447a55'},
 set(),
 {'4fd7030d9b7d61b7a947d30f8afa3938a20c7d22',
  '7b9dc7e88fe337c33727ca7b13486f68d37f8fa0'}]

In [17]:
all_songs = list(train_data['song'].unique())
# This will store every unique song from our train dataset

In [19]:
#creating our matrix which is nothing but shape as given
cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_songs), len(all_songs))), float)
cooccurence_matrix

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
train_data['song'].nunique()

5751

{'135295188afc027cafbd28de292ce94fd5a71e4c',
 '61d5ba8c7cf67f5d744893385195fef5aec1545e',
 'a51785282156c11e6495a60e1f5ed55dfe83393c',
 'd17396a872ede1c2ac879cba1dbf193e705a0bc6'}

In [22]:
#Here we are doing nothing but just filling up our cooccurence matrix
for i in range(0,len(all_songs)):
        #Calculate unique listeners (users) of song (item) i
        songs_i_data = train_data[train_data['song'] == all_songs[i]]
        users_i = set(songs_i_data['user_id'].unique())

        for j in range(0,len(user_songs)):       

            #Get unique listeners (users) of song (item) j from our user
            users_j = user_songs_users[j]

            #Calculate intersection of listeners of songs i and j
            users_intersection = users_i.intersection(users_j)

            #Calculate cooccurence_matrix[i,j] as Jaccard Index
            if len(users_intersection) != 0:
                #Calculate union of listeners of songs i and j
                users_union = users_i.union(users_j)

                cooccurence_matrix[j,i] = float(len(users_intersection))/float(len(users_union))
            else:
                cooccurence_matrix[j,i] = 0

In [23]:
cooccurence_matrix

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))

Non zero values in cooccurence_matrix :4


In [25]:
cooccurence_matrix.sum(axis=0)

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [28]:
user_sim_scores = cooccurence_matrix.sum(axis=0)/float(cooccurence_matrix.shape[0])
user_sim_scores = np.array(user_sim_scores)[0].tolist()
user_sim_scores

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [29]:
sort_index = sorted(((e,i) for i,e in enumerate(list(user_sim_scores))), reverse=True)
    

In [30]:
#Here we just sort index according to the one having top similarity song.
sort_index

[(0.1, 4191),
 (0.1, 1197),
 (0.1, 1187),
 (0.1, 475),
 (0.0, 5750),
 (0.0, 5749),
 (0.0, 5748),
 (0.0, 5747),
 (0.0, 5746),
 (0.0, 5745),
 (0.0, 5744),
 (0.0, 5743),
 (0.0, 5742),
 (0.0, 5741),
 (0.0, 5740),
 (0.0, 5739),
 (0.0, 5738),
 (0.0, 5737),
 (0.0, 5736),
 (0.0, 5735),
 (0.0, 5734),
 (0.0, 5733),
 (0.0, 5732),
 (0.0, 5731),
 (0.0, 5730),
 (0.0, 5729),
 (0.0, 5728),
 (0.0, 5727),
 (0.0, 5726),
 (0.0, 5725),
 (0.0, 5724),
 (0.0, 5723),
 (0.0, 5722),
 (0.0, 5721),
 (0.0, 5720),
 (0.0, 5719),
 (0.0, 5718),
 (0.0, 5717),
 (0.0, 5716),
 (0.0, 5715),
 (0.0, 5714),
 (0.0, 5713),
 (0.0, 5712),
 (0.0, 5711),
 (0.0, 5710),
 (0.0, 5709),
 (0.0, 5708),
 (0.0, 5707),
 (0.0, 5706),
 (0.0, 5705),
 (0.0, 5704),
 (0.0, 5703),
 (0.0, 5702),
 (0.0, 5701),
 (0.0, 5700),
 (0.0, 5699),
 (0.0, 5698),
 (0.0, 5697),
 (0.0, 5696),
 (0.0, 5695),
 (0.0, 5694),
 (0.0, 5693),
 (0.0, 5692),
 (0.0, 5691),
 (0.0, 5690),
 (0.0, 5689),
 (0.0, 5688),
 (0.0, 5687),
 (0.0, 5686),
 (0.0, 5685),
 (0.0, 5684),
 (0.0, 

In [31]:
#Create a dataframe from the following
columns = ['user_id', 'song', 'score', 'rank']
#index = np.arange(1) # array of numbers for the number of samples
df = pd.DataFrame(columns=columns)

In [32]:
#Can someone help me with these code, i didn't get these code properly
rank = 1 
for i in range(0,len(sort_index)):
    if ~np.isnan(sort_index[i][0]) and all_songs[sort_index[i][1]] not in user_songs and rank <= 10:
        df.loc[len(df)]=[user,all_songs[sort_index[i][1]],sort_index[i][0],rank]
        rank = rank+1

In [33]:
df

Unnamed: 0,user_id,song,score,rank
0,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Diamond On A Landmine - Billy Talent,0.0,1
1,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Fade To Grey - Visage,0.0,2
2,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Room 409 - Bullet For My Valentine,0.0,3
3,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Climax - Slum Village,0.0,4
4,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Dame Un Besito - Andy & Lucas,0.0,5
5,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Yes_ I Don't Want This - Digitalism,0.0,6
6,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Dance Hall Days - Wang Chung,0.0,7
7,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Can't Smile Without You - Barry Manilow,0.0,8
8,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Vegetable Car - Joshua Radin,0.0,9
9,91b8fac7dc5e03f6cfaf6e2aa7171f14a8354d62,Can't Have You - Jonas Brothers,0.0,10


In [None]:
#in this way we get recommended

# Collaborative Filtering 

In [5]:
#total play_Count
total_play_count = sum(dataframe.freq)

#only using data of top 300000 user
play_count = dataframe[['song', 'freq']].groupby('song').sum().\
             sort_values(by='freq',ascending=False).head(30000)


print('30,000 most popular songs represents {:3.2%} of total listen.'.format(float(play_count.sum())/total_play_count))
song_subset = list(play_count.index[:30000])
user_subset = list(dataframe.loc[dataframe.song.isin(song_subset), 'user_id'].unique())

dataframe_sub = dataframe[dataframe.song.isin(song_subset)]

30,000 most popular songs represents 85.24% of total listen.


In [6]:
dataframe_sub.head()

Unnamed: 0,user_id,song_id,freq,track_id,artist_name,release,song
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1,TRAEHHJ12903CF492F,Dwight Yoakam,You're The One,You're The One - Dwight Yoakam
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1,TRLGMFJ128F4217DBE,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,Horn Concerto No. 4 in E flat K495: II. Romanc...
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1,TRTNDNE128F1486812,Cartola,Tive Sim,Tive Sim - Cartola
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1,TRASTUE128F930D488,Lonnie Gordon,Catch You Baby (Steve Pitron & Max Sanna Radio...,Catch You Baby (Steve Pitron & Max Sanna Radio...
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1,TRFPLWO128F1486B9E,Miguel Calo,El Cuatrero,El Cuatrero - Miguel Calo


In [41]:
is_model = item_similarity_recommender_py()
is_model.create(dataframe_sub, 'user_id', 'song')

#recommending to random user
user_id = users[26]

user_items = is_model.get_user_items(user_id)
#Recommend songs for the user using personalized model
is_model.recommend(user_id)

No. of unique songs for the user: 13
no. of unique songs in the training set: 30000
Non zero values in cooccurence_matrix :55952


Unnamed: 0,user_id,song,score,rank
0,6530c4fc41b9110de5d39fe0355fa103c66385f0,Clocks - Coldplay,0.028875,1
1,6530c4fc41b9110de5d39fe0355fa103c66385f0,Uprising - Muse,0.019479,2
2,6530c4fc41b9110de5d39fe0355fa103c66385f0,Use Somebody - Kings Of Leon,0.018976,3
3,6530c4fc41b9110de5d39fe0355fa103c66385f0,Supermassive Black Hole (Album Version) - Muse,0.018175,4
4,6530c4fc41b9110de5d39fe0355fa103c66385f0,Supermassive Black Hole (Twilight Soundtrack V...,0.018175,5
5,6530c4fc41b9110de5d39fe0355fa103c66385f0,Bubble Toes - Jack Johnson,0.017151,6
6,6530c4fc41b9110de5d39fe0355fa103c66385f0,In My Place - Coldplay,0.017022,7
7,6530c4fc41b9110de5d39fe0355fa103c66385f0,The Only Exception (Album Version) - Paramore,0.016152,8
8,6530c4fc41b9110de5d39fe0355fa103c66385f0,Secrets - OneRepublic,0.016135,9
9,6530c4fc41b9110de5d39fe0355fa103c66385f0,Bleed It Out [Live At Milton Keynes] - Linkin ...,0.016027,10
