In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from joblib import Parallel, delayed



In [2]:
# Load the two raw inputs from the working directory: the listen triplets
# (song_df_1) and the per-song metadata (song_df_2), in that order.
song_df_1, song_df_2 = (
    pd.read_csv(csv_name) for csv_name in ('triplets_file.csv', 'song_data.csv')
)

In [3]:
# Give the triplets frame explicit column names. This overwrites whatever
# header read_csv produced and raises ValueError if the file does not have
# exactly three columns.
# NOTE(review): assumes the CSV column order is user, song, listen count — confirm against the file.
song_df_1.columns = ['user_id', 'song_id', 'listen_count']

In [4]:
# Attach song metadata to every listen record. The metadata is de-duplicated
# on song_id first so the left join cannot multiply triplet rows.
song_df = song_df_1.merge(
    song_df_2.drop_duplicates(subset=['song_id']),
    how='left',
    on='song_id',
)

In [5]:
# Expose the song title under the column name 'song' used by the rest of the notebook.
song_df = song_df.rename(columns={'title': 'song'})

In [6]:
# Replace the raw user identifiers with integer labels. The fitted encoder is
# kept (the original discarded it) so the raw ids can be recovered later via
# user_id_encoder.inverse_transform(...).
# NOTE: re-running this cell refits the encoder on the already-encoded values,
# so the mapping back to the raw ids is only valid after a fresh run.
user_id_encoder = preprocessing.LabelEncoder()
song_df['user_id'] = user_id_encoder.fit_transform(song_df['user_id'])

In [7]:
# Replace the raw song identifiers with integer labels. The fitted encoder is
# kept (the original discarded it) so the raw ids can be recovered later via
# song_id_encoder.inverse_transform(...).
# NOTE: re-running this cell refits the encoder on the already-encoded values,
# so the mapping back to the raw ids is only valid after a fresh run.
song_id_encoder = preprocessing.LabelEncoder()
song_df['song_id'] = song_id_encoder.fit_transform(song_df['song_id'])

In [8]:
# Preview the first ten rows of the merged, encoded frame.
song_df.head(n=10)

Unnamed: 0,user_id,song_id,listen_count,song,release,artist_name,year
0,54961,153,1,The Cove,Thicker Than Water,Jack Johnson,0
1,54961,413,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,54961,736,1,Stronger,Graduation,Kanye West,2007
3,54961,750,1,Constellations,In Between Dreams,Jack Johnson,2005
4,54961,1188,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999
5,54961,1239,5,Apuesta Por El Rock 'N' Roll,Antología Audiovisual,Héroes del Silencio,2007
6,54961,1536,1,Paper Gangsta,The Fame Monster,Lady GaGa,2008
7,54961,2056,1,Stacked Actors,There Is Nothing Left To Lose,Foo Fighters,1999
8,54961,2220,1,Sehr kosmisch,Musik von Harmonia,Harmonia,0
9,54961,3046,1,Heaven's gonna burn your eyes,Hôtel Costes 7 by Stéphane Pompougnac,Thievery Corporation feat. Emiliana Torrini,2002


In [9]:
# Per-song popularity. 'count' tallies the number of listen records (rows)
# for each song title; it does not sum the listen_count values.
song_grouped = (
    song_df.groupby('song')['listen_count']
    .count()
    .reset_index()
)

In [10]:
# Total number of counted listen records across all songs (denominator for the share below).
grouped_sum = song_grouped.loc[:, 'listen_count'].sum()

In [11]:
# Each song's share of all counted listen records, expressed in percent.
song_grouped['percentage'] = song_grouped['listen_count'] / grouped_sum * 100

In [12]:
# Show songs from most to least counted; ties are broken alphabetically by title.
song_grouped.sort_values(by=['listen_count', 'song'], ascending=[False, True])

Unnamed: 0,song,listen_count,percentage
6836,Sehr kosmisch,8277,0.41385
8725,Undo,7032,0.35160
1964,Dog Days Are Over (Radio Edit),6949,0.34745
9496,You're The One,6729,0.33645
6498,Revelry,6145,0.30725
...,...,...,...
3363,Historia Del Portero,51,0.00255
6782,Scared,51,0.00255
2041,Don´t Leave Me Now,50,0.00250
2860,Ghosts (Toxic Avenger Mix),48,0.00240


In [13]:
# Distinct (encoded) user ids, in order of first appearance, and how many there are.
users = pd.unique(song_df['user_id'])
len(users)

76353

In [14]:
# Distinct song titles, in order of first appearance, and how many there are.
songs = pd.unique(song_df['song'])
len(songs)

9567

In [15]:
# Hold out 20% of the listen records for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    song_df,
    random_state=0,
    test_size=0.20,
)

In [16]:
class popularity_recommender_py():
    """Non-personalised recommender that suggests the most popular items.

    Popularity of an item is the number of rows in the training data that
    reference it (counted through the user column). Every user receives the
    same ranked list; the requested user id is only attached to the output.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None

    def create(self, train_data, user_id, item_id, top_n=10):
        """Build and cache the top-N popularity table.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Interaction records containing at least the two columns below.
        user_id : str
            Name of the column identifying users.
        item_id : str
            Name of the column identifying items.
        top_n : int, default 10
            Number of most popular items to keep.
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        # Count interaction rows per item. The count column is renamed via
        # self.user_id (not a hard-coded 'user_id') so that any user column
        # name works.
        item_scores = (
            train_data.groupby([self.item_id])
            .agg({self.user_id: 'count'})
            .reset_index()
            .rename(columns={self.user_id: 'score'})
        )

        # Highest score first; ties are broken by item id in ascending order.
        ranked = item_scores.sort_values(
            ['score', self.item_id], ascending=[False, True]
        )
        # method='first' ranks ties in their sorted order, giving ranks 1..n.
        ranked['Rank'] = ranked['score'].rank(ascending=False, method='first')

        # Store an independent copy so later use never touches a slice of `ranked`.
        self.popularity_recommendations = ranked.head(top_n).copy()

    def recommend(self, user_id):
        """Return the cached popularity table with the user id as first column.

        The cached table is not modified, so repeated calls with different
        users are independent of each other.

        Raises
        ------
        RuntimeError
            If ``create`` has not been called yet.
        """
        if self.popularity_recommendations is None:
            raise RuntimeError("create() must be called before recommend()")

        user_recommendations = self.popularity_recommendations.copy()
        # Put the user column in front, named after the configured user column.
        user_recommendations.insert(0, self.user_id, user_id)
        return user_recommendations

In [17]:
# Fit the popularity model on the training split, counting users per song title.
pm = popularity_recommender_py()
pm.create(train_data, user_id='user_id', item_id='song')

# Request recommendations for the sixth distinct user in the data set.
user_id = users[5]
pm.recommend(user_id)

Unnamed: 0,user_id,song,score,Rank
6836,22570,Sehr kosmisch,6630,1.0
8725,22570,Undo,5639,2.0
1964,22570,Dog Days Are Over (Radio Edit),5592,3.0
9496,22570,You're The One,5396,4.0
6498,22570,Revelry,4938,5.0
6825,22570,Secrets,4627,6.0
3437,22570,Horn Concerto No. 4 in E flat K495: II. Romanc...,4368,7.0
2595,22570,Fireflies,3835,8.0
3322,22570,Hey_ Soul Sister,3819,9.0
8494,22570,Tive Sim,3707,10.0


In [18]:
pm = popularity_recommender_py()
pm.create(train_data, 'user_id', 'song')
# Use the popularity model to make a prediction for an explicitly chosen user.
# The id is an integer because user_id was label-encoded to integers above;
# passing the string '22570' would produce a user column of a different type.
user_id = 22570
pm.recommend(user_id)

Unnamed: 0,user_id,song,score,Rank
6836,22570,Sehr kosmisch,6630,1.0
8725,22570,Undo,5639,2.0
1964,22570,Dog Days Are Over (Radio Edit),5592,3.0
9496,22570,You're The One,5396,4.0
6498,22570,Revelry,4938,5.0
6825,22570,Secrets,4627,6.0
3437,22570,Horn Concerto No. 4 in E flat K495: II. Romanc...,4368,7.0
2595,22570,Fireflies,3835,8.0
3322,22570,Hey_ Soul Sister,3819,9.0
8494,22570,Tive Sim,3707,10.0
