In [1]:
#saa 2021
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
#data = pd.read_csv('data/dataset.csv')
data = pd.read_csv('dataset.csv')
data = data.sort_values(['timestamp'])

In [3]:
train = data[:80000]
test = data[80000:]

In [4]:
train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
217,259,255,4,874724710
83968,259,286,4,874724727
43030,259,298,4,874724754
21399,259,185,4,874724781
82658,259,173,4,874724843


In [5]:
test.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1346,3,245,1,889237247
27978,3,355,3,889237247
1260,3,335,1,889237269
38673,3,322,3,889237269
3761,3,323,2,889237269


In [6]:
def average_precision(actual, recommended, k=30):
    ap_sum = 0
    hits = 0
    for i in range(k): #Пробегаем по k
        product_id = recommended[i] if i < len(recommended) else None #Получаем список продуктов из recomend
        if product_id is not None and product_id in actual: #Если есть попадание в реальную
            hits += 1
            ap_sum += hits / (i + 1) #Чем больше неверных в начале, тем хуже
    return ap_sum / k


def normalized_average_precision(actual, recommended, k=30): #Тупо считаем долю 
    actual = set(actual)
    if len(actual) == 0:
        return 0.0

    ap = average_precision(actual, recommended, k=k)
    ap_ideal = average_precision(actual, list(actual)[:k], k=k)
    return ap / ap_ideal

In [13]:
#! You can run this block just once !

from  sklearn.metrics.pairwise import pairwise_distances

items = data['item_id'].unique() # unique items
users = data['user_id'].unique() # unique users
width = items.shape[0] #width of the matrix
height = users.shape[0] #height of the matrix

#Create special matrix: rows - users, columns - items, value - user's grade for item (Nan = 0)
matrix = np.empty([height,width])
for i, user in enumerate(users):
    data_user = train[train['user_id']==user].iloc[:,1:3]
    
    if data_user.shape[0]==0:
        matrix[i,:] = 0
    else:
        items_user = list(train[train['user_id']==user]['item_id'].values)
        for k, item in enumerate(items):
            if item not in items_user:
                matrix[i,k] = 0
            else:
                matrix[i,k] = data_user[data_user['item_id']==item]['rating'].values[0]


#Create DataFrame with the matrix
DF_matrix = pd.DataFrame(matrix, index=users, columns=items)
                
#Create matrix cosine metrics for users
user_similarity = pairwise_distances(matrix, metric='cosine')

In [18]:
def recommend(user):
    
    #Find row with index=user
    row = list(DF_matrix.index).index(user)
    
    #Select row from similarity matrix
    user_sim = user_similarity[row]
    
    #Select the first 100 users in top without the first place (it's our user too)
    indexes = user_sim.argsort()[1:101]
    
    #Select row from similarity matrix with first 100 users
    user_sim_100 = user_similarity[row][indexes]
    
    #Count the mean grade for each user
    user_mean = matrix[row].mean()
    other_users_mean = matrix[indexes].mean(axis=1)
    
    #Initialise the matrix
    pred = np.empty([1,width])
    
    #Count the denominator
    denominator = np.abs(user_sim_100).sum()
    
    for i in range(width):
        #Count the numerator for each item
        numerator = user_sim_100.dot(matrix[indexes][:,i] - other_users_mean)
        #Add value to prediction for each item
        pred[0,i] = user_mean + numerator / denominator
    
    #Transform prediction to DataFrame
    pred = pd.DataFrame(pred[0,:], index=list(items), columns=['Pred'])
    
    #Select items from user's items in train
    train_items = train[train['user_id']==user]['item_id'].unique()
    
    #Drop train items (because thees items can't be in test)
    pred.drop(train_items, axis='index', inplace=True)
    
    #Sort prediction by grade and take the first 30
    pred_user = pred.sort_values(by=['Pred'], ascending=False).index[:30]
     
    return list(pred_user)

In [19]:
scores = []
for user in tqdm(test['user_id'].unique()):
    actual = list(test[test['user_id'] == user]['item_id'])
    recommended = recommend(user)
    
    scores.append(normalized_average_precision(actual, recommended))

np.mean(scores)

100%|██████████| 301/301 [01:38<00:00,  3.07it/s]


0.18077461904070333