In [43]:
import pandas as pd
import random
from surprise import Dataset
from surprise import Reader
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import GridSearchCV
from collections import defaultdict
from surprise import dataset

In [460]:
DEFAULT_COUNT=50
SEED=12345

def readSongData(top):
    """
    Read song data from database

    Parameters
    ----------
    top: random sample n users from song_df

    Returns
    -------
    a pandas dataframe with columns 'user_id', 'song_id', 'listen_count', 'title', 'release', 'artist_name',
   'year', 'song'

    """

    song_df = pd.read_pickle('../../data/song.pkl')
    # random sample n users from song_df
    user_list= list(song_df.user_id.unique())
    random.seed(SEED)
    random.shuffle(user_list)
    song_df = song_df[song_df.user_id.isin(user_list[0:top])]

    return song_df

def createNewObs(songidList):
    """
    Append a new row with userId Johnny that is interested in some selected songs

    Parameters
    ----------
    songidList: the user selected song_ids with format like `SOAKIMP12A8C130995`

    Returns
    -------
    a pandas dataframe with columns 'user_id', 'song_id', 'listen_count'

    """

    ratings_dict = {'user_id': ['johnny']*len(songidList),
                    'song_id': songidList,
                    'listen_count': [DEFAULT_COUNT]*len(songidList)}
    newObs = pd.DataFrame(ratings_dict)
    newObs = newObs[['user_id', 'song_id', 'listen_count']]

    return newObs

def readSurpriseFormat(newObs, song_df):
    """
    combine newObs dataframe with song dataframe and transform it into Surprise data format

    Parameters
    ----------
    newObs: the dataframe obtain from the function createNewObs
    song_df: a dataframe containing the song information

    Returns
    -------
    a surprise.dataset

    """

    # A reader is still needed but only the rating_scale param is requiered.
    reader = Reader(rating_scale=(1, 100))

    # get train data
    train = song_df[['user_id', 'song_id', 'listen_count']]

    # combine together
    full = pd.concat([train, newObs]).reset_index(drop=True)

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(full, reader)

    return data

def get_top_n(predictions, targetSongidList, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        if iid not in targetSongidList:
            top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [572]:
n_users = 2000
song_df = readSongData(n_users)
song_df.shape

(53035, 8)

In [573]:
len(song_df.song_id.unique())

9476

In [574]:
song_df.song_id.value_counts().head()

SOFRQTD12A81C233C0    224
SOAXGDH12A8C13F8A1    196
SOAUWYT12A81C206F1    191
SOSXLTC12AF72A7F54    174
SOBONKR12A58A7A7E0    165
Name: song_id, dtype: int64

In [575]:
songidList=list(song_df['song_id'].unique())

In [576]:
newObs2 = pd.DataFrame({'user_id':['johnny']*len(songidList),
                      'song_id':songidList,
                      'listen_count':0})[['user_id','song_id','listen_count']]
newObs2.head()

Unnamed: 0,user_id,song_id,listen_count
0,johnny,SOBDRND12A8C13FD08,0
1,johnny,SOCHBAJ12AAF3B3A4F,0
2,johnny,SOCZTMT12AF72A078E,0
3,johnny,SOHRQZQ12A6D4F81D2,0
4,johnny,SOJGMYY12AB01809BE,0


In [577]:
targetSongidList = list(song_df['song_id'].unique())[0:50]

In [578]:
mask.value_counts()

False    6178
True       50
Name: song_id, dtype: int64

In [579]:
mask = newObs2['song_id'].isin(targetSongidList)
column_name = 'listen_count'
newObs2.loc[mask, column_name] = 50 #[50 + randint(0, 9) for i in range(0,50)]

In [580]:
newObs2

Unnamed: 0,user_id,song_id,listen_count
0,johnny,SOBDRND12A8C13FD08,50
1,johnny,SOCHBAJ12AAF3B3A4F,50
2,johnny,SOCZTMT12AF72A078E,50
3,johnny,SOHRQZQ12A6D4F81D2,50
4,johnny,SOJGMYY12AB01809BE,50
5,johnny,SOQFEDG12AB018DD24,50
6,johnny,SOVRZIX12AAF3B2A32,50
7,johnny,SOZMJFG12AB017BDAF,50
8,johnny,SOAFTRR12AF72A8D4D,50
9,johnny,SOALEQA12A58A77839,50


In [581]:
# newObs = createNewObs(list(song_df['song_id'].unique())[1:50])
# newObs.head()

# newObs = createNewObs(['SOAKIMP12A8C130995','SOBBMDR12A8C13253B','SOBXHDL12A81C204C0','SOBYHAJ12A6701BF1D','SODACBL12A8C13C273'])
# newObs

In [582]:
# get train data
train = song_df[['user_id', 'song_id', 'listen_count']]

In [583]:
train.shape

(53035, 3)

In [584]:
len(train.user_id.unique())

2000

In [585]:
# A reader is still needed but only the rating_scale param is requiered.
# reader = Reader(line_format='user_id song_id listen_count', rating_scale=(1, 100))
reader = Reader(line_format='user item rating', rating_scale=(1, 50))
trainset_autofold = Dataset.load_from_df(pd.concat([train,newObs2]), reader)
trainset = trainset_autofold.build_full_trainset()
trainset

<surprise.trainset.Trainset at 0x137141e48>

In [562]:
def load_test(df):
    
    return [(uid, iid, r) for (uid, iid, r) in zip(df['user_id'], df['song_id'], df['listen_count'])]

In [563]:
testset = load_test(newObs2)

In [564]:
testset

[('johnny', 'SOAFTRR12AF72A8D4D', 50),
 ('johnny', 'SOALEQA12A58A77839', 50),
 ('johnny', 'SOATNYF12AF72A8D48', 50),
 ('johnny', 'SOBDKVR12A8C13E705', 50),
 ('johnny', 'SOBDMNP12AF72AB1E1', 50),
 ('johnny', 'SOBDWET12A6701F114', 50),
 ('johnny', 'SOBFMHC12A6D4F9401', 50),
 ('johnny', 'SOBIQWH12A8C13BCDE', 50),
 ('johnny', 'SOBJIZY12A6701F11A', 50),
 ('johnny', 'SODAQMD12A8C131D57', 50),
 ('johnny', 'SODMASJ12AF729E6FA', 50),
 ('johnny', 'SOEWYLX12A6D4F8E5F', 50),
 ('johnny', 'SOFNCRW12A6D4F727B', 50),
 ('johnny', 'SOFWANS12AF72A12E6', 50),
 ('johnny', 'SOGTMYT12A6D4F98D9', 50),
 ('johnny', 'SOHCLPF12AB017FD26', 50),
 ('johnny', 'SOHDSVJ12A6D4F93FF', 50),
 ('johnny', 'SOHZUAA12A6701F116', 50),
 ('johnny', 'SOIDSDT12A6D4F98DB', 50),
 ('johnny', 'SOIJLDG12A8C135B96', 50),
 ('johnny', 'SOIMCDE12A6D4F8383', 50),
 ('johnny', 'SOKIKWC12A6D4F98D8', 50),
 ('johnny', 'SOKJQGO12AF72ACA9F', 50),
 ('johnny', 'SOKOXWU12AF72AD1BC', 50),
 ('johnny', 'SOKQMNB12A6D4F74FD', 50),
 ('johnny', 'SOLJSEJ12A8C

In [565]:
# algo = SVD(n_factors=50, lr_all=0.005, reg_all=0.04, random_state=12345)
algo = SVD(n_factors=50, lr_all=0.005, reg_all=0.04, random_state=12345)

In [566]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x148ec9e48>

In [567]:
algo

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x148ec9e48>

In [568]:
algo.test(testset)

[Prediction(uid='johnny', iid='SOAFTRR12AF72A8D4D', r_ui=50, est=26.833151201523101, details={'was_impossible': False}),
 Prediction(uid='johnny', iid='SOALEQA12A58A77839', r_ui=50, est=27.496369349172372, details={'was_impossible': False}),
 Prediction(uid='johnny', iid='SOATNYF12AF72A8D48', r_ui=50, est=26.523931895219466, details={'was_impossible': False}),
 Prediction(uid='johnny', iid='SOBDKVR12A8C13E705', r_ui=50, est=25.620443561587518, details={'was_impossible': False}),
 Prediction(uid='johnny', iid='SOBDMNP12AF72AB1E1', r_ui=50, est=24.753370752330948, details={'was_impossible': False}),
 Prediction(uid='johnny', iid='SOBDWET12A6701F114', r_ui=50, est=23.966885860822536, details={'was_impossible': False}),
 Prediction(uid='johnny', iid='SOBFMHC12A6D4F9401', r_ui=50, est=23.328717395641501, details={'was_impossible': False}),
 Prediction(uid='johnny', iid='SOBIQWH12A8C13BCDE', r_ui=50, est=22.699961344722638, details={'was_impossible': False}),
 Prediction(uid='johnny', iid='S

In [569]:
top_n_svd2 = get_top_n(algo.test(testset), targetSongidList, 10)

In [570]:
top_n_svd2

defaultdict(list,
            {'johnny': [('SOMGIYR12AB0187973', 1.9632773993697927),
              ('SOUYDLS12A6D4F6C0B', 1.9073669097967207),
              ('SOKBXLS12A6702111C', 1.8651027929076249),
              ('SOPUCYA12A8C13A694', 1.6987710538953436),
              ('SOIZFTE12AB0186842', 1.672970597881577),
              ('SOKZRDM12A58A7F99C', 1.6086231212177222),
              ('SOEJVTY12A81C22428', 1.4873175545634298),
              ('SOXMTCY12A67AD8139', 1.4399892836910064),
              ('SOFJCCE12AB0183F96', 1.4063365438128688),
              ('SOPSOHT12A67AE0235', 1.3998483230520233)]})

In [571]:
top_n_svd2['johnny']

[('SOMGIYR12AB0187973', 1.9632773993697927),
 ('SOUYDLS12A6D4F6C0B', 1.9073669097967207),
 ('SOKBXLS12A6702111C', 1.8651027929076249),
 ('SOPUCYA12A8C13A694', 1.6987710538953436),
 ('SOIZFTE12AB0186842', 1.672970597881577),
 ('SOKZRDM12A58A7F99C', 1.6086231212177222),
 ('SOEJVTY12A81C22428', 1.4873175545634298),
 ('SOXMTCY12A67AD8139', 1.4399892836910064),
 ('SOFJCCE12AB0183F96', 1.4063365438128688),
 ('SOPSOHT12A67AE0235', 1.3998483230520233)]

In [489]:
top_n_svd['johnny']

[('SOIZFTE12AB0186842', 6.6755399626332128),
 ('SOKAESA12A8C1410A1', 6.5551639452970125),
 ('SOMGIYR12AB0187973', 6.4850266603334887),
 ('SOJWFSS12A8C1365FA', 6.2074298406329325),
 ('SOJYISZ12A8C135F5A', 5.9436794688824826),
 ('SOPUCYA12A8C13A694', 5.8560888937158744),
 ('SOLWZVR12AB01849C6', 5.8383444096839181),
 ('SOAAFYH12A8C13717A', 5.7540136333451164),
 ('SORJICW12A8C13640D', 5.7239626850027241),
 ('SOFMTUK12A8C13577E', 5.6746391292221485)]

In [73]:
testset_autofold = Dataset.load_from_df(newObs, reader)
testset = testset_autofold.build_full_trainset()
testset

<surprise.trainset.Trainset at 0x12b1c3898>

In [69]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1302e7550>

In [72]:
predictions = algo.test(testset)

TypeError: 'Trainset' object is not iterable

In [59]:
data = Dataset.load_builtin('ml-100k')

In [60]:
data

<surprise.dataset.DatasetAutoFolds at 0x12bbaeef0>

In [35]:
# A reader is still needed but only the rating_scale param is requiered.
# reader = Reader(line_format='user_id song_id listen_count', rating_scale=(1, 100))
reader = Reader(line_format='user item rating', rating_scale=(1, 5))

In [37]:
trainset = Dataset.load_from_df(train, reader)

In [76]:
trainset

<surprise.trainset.Trainset at 0x10be53400>

In [77]:
testset = Dataset.load_from_df(newObs, reader)

In [78]:
testset

<surprise.dataset.DatasetAutoFolds at 0x10be53a20>

In [79]:
predictions = algo.test(testset)

TypeError: 'DatasetAutoFolds' object is not iterable

In [None]:
# combine together
full = pd.concat([train, newObs]).reset_index(drop=True)

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(full, reader)

In [74]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.9434


0.94335706598785929

In [19]:
trainset

<surprise.trainset.Trainset at 0x10be530f0>

In [75]:
testset

[('796', '258', 4.0),
 ('790', '566', 3.0),
 ('336', '405', 3.0),
 ('276', '8', 4.0),
 ('655', '528', 5.0),
 ('234', '195', 2.0),
 ('342', '197', 4.0),
 ('491', '273', 5.0),
 ('13', '56', 5.0),
 ('903', '196', 4.0),
 ('526', '313', 5.0),
 ('58', '692', 2.0),
 ('664', '449', 2.0),
 ('505', '742', 4.0),
 ('336', '90', 5.0),
 ('508', '151', 5.0),
 ('450', '223', 3.0),
 ('312', '432', 5.0),
 ('673', '286', 4.0),
 ('194', '117', 3.0),
 ('716', '190', 5.0),
 ('215', '182', 3.0),
 ('279', '1496', 3.0),
 ('464', '300', 4.0),
 ('151', '265', 5.0),
 ('570', '271', 4.0),
 ('451', '874', 4.0),
 ('817', '24', 4.0),
 ('104', '293', 3.0),
 ('87', '372', 3.0),
 ('361', '531', 5.0),
 ('207', '521', 4.0),
 ('308', '778', 3.0),
 ('430', '7', 3.0),
 ('393', '83', 4.0),
 ('110', '33', 4.0),
 ('301', '357', 5.0),
 ('749', '975', 4.0),
 ('399', '139', 3.0),
 ('7', '238', 5.0),
 ('719', '378', 4.0),
 ('382', '546', 2.0),
 ('200', '230', 5.0),
 ('601', '210', 4.0),
 ('655', '171', 2.0),
 ('504', '1442', 3.0),


https://github.com/NicolasHug/Surprise/issues/20