In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy import sparse as sps
import statistics
from math import sqrt
from scipy import sparse

In [2]:
data_path = 'datasets/'
ratings_filename = 'ratings.csv'

ratings_df = pd.read_csv(os.path.join(data_path, ratings_filename), usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [3]:
ratings_pivot_df = ratings_df.pivot(index='userId',columns='movieId',values='rating').fillna(0)
ratings_pivot_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
ratings_csr_matrix = sps.csr_matrix(ratings_pivot_df)
ratings_pivot_df.shape, print(ratings_csr_matrix.todense()) #toarray returns an ndarray; todense returns a matrix.

[[4.  0.  4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 2.  2.  ... 0.  0.  0. ]
 [3.  0.  0.  ... 0.  0.  0. ]
 [5.  0.  0.  ... 0.  0.  0. ]]


((610, 9724), None)

### Definicije funkcija za podelu skupa, sortiranje, predvidjanje ocena i filtriranje matrice

In [8]:
# Podela datog skupa podataka na skup za trening i skup za test se u narednoj funkciji
# vrsi tako sto se iz datog skupa "uzima" percentage % ocena koje se upisuju u test skup,
# a uklanjaju iz trening skupa

# parametar ratings_csr_matrix je skup podataka koji treba podeliti na trening i test skup
# percentage je broj koji predstavlja procenat ocena koje ulaze u test skup

def split_train_test(ratings_csr_matrix, percentage):
    
    test_ratings_number = percentage / 100
    print("Odnos ocena u skupu za testiranje: ", percentage, "%")
    print("Odnos ocena u skupu za treniranje: ", 100-percentage, "%")

    total_ratings = ratings_csr_matrix.toarray()

    dimensions_of_total_ratings = total_ratings.shape
    print("Ukupan broj korisnika: ", dimensions_of_total_ratings[0])
    print("Ukupan broj filmova: ", dimensions_of_total_ratings[1])

    test = np.zeros(dimensions_of_total_ratings)
    train = total_ratings.copy()

    nonzero_ratings_per_row = (total_ratings != 0).sum(1)
    print("Ukupan broj ne-nula ocena u svim redovima: \n", nonzero_ratings_per_row)
    
    for user in range(dimensions_of_total_ratings[0]):
        
        nonzero_test_ratings_per_user = int(np.ceil(test_ratings_number*nonzero_ratings_per_row[user])) # ispod - indeksi ne-nula elemenata
        # print("Ukupan broj ne-nula ocena u test skupu po korisniku: ", nonzero_test_ratings_per_user)
        
        test_ratings = np.random.choice(total_ratings[user, :].nonzero()[0], size = nonzero_test_ratings_per_user, replace = False)

        train[user, test_ratings] = 0 # izbacuju se iz trening skupa (upisuju se nule na odabranim pozicijama)

        test[user, test_ratings] = total_ratings[user, test_ratings] # ubacuju se u test skup (upisuju se ocene iz polaznog skupa, koje su uklonjene iz trening skupa)
        
    if (not(np.all((train * test) == 0))):
        print("Greska!")
    else:    
        return sps.csr_matrix(train),sps.csr_matrix(test)

In [9]:
def sort_descending(li):
    # reverse (Optional) - If True, the sorted list is reversed (or sorted in descending order).
    # reverse = None (Sorts in Ascending order)
    # key is set to sort using second element of 
    # sublist lambda has been used
    li.sort(key = lambda x: x[0], reverse=True)
    return li

In [10]:
def item_based_ratings_prediction(u, i, movies_similarity, ratings, k = 5):        

    neighbors = []
    # print("Movies similarity shape: ", movies_similarity.shape)
    similarities = list(zip(movies_similarity[i][:],range(movies_similarity.shape[0])))
    
    similarities_sorted = sort_descending(similarities)
    
    for i in range(1,k+1):
        neighbors.append(similarities_sorted[i][1])
        
    rated_i = ratings[:, i].nonzero()[0]
        
    item_i_mean = 0
    item_i_mean = (ratings[:, i].toarray()[0]).sum()
    
    if len(rated_i) != 0:
        item_i_mean = item_i_mean / len(rated_i)
        
    numerator, denominator = 0.0, 0.0
        
    for j in neighbors:
        rated_j = ratings[:, j].nonzero()[0]
        item_j_mean = 0
        
        if len(rated_j) != 0:
            arr2 = ratings[:, j].toarray()[0]
            item_j_mean = arr2.sum()
            item_j_mean = item_j_mean / len(rated_j)
        
        r_uj = ratings[u,j]
        numerator += movies_similarity[i][j]*(r_uj - item_j_mean)
        denominator += movies_similarity[i][j]
        
    return item_i_mean + numerator/denominator

In [11]:
def filter_matrix(matrix, n): # Filtriranje podataka (ostaju samo redovi korisnika koji su glasali vise od n puta)
    ratings_array = matrix.toarray()
    ratings_matrix_filtered = []
    for row in ratings_array:
        sum = 0
        nonzero_count = np.count_nonzero(row)
        if (nonzero_count > n):
            ratings_matrix_filtered.append(row)
    ratings_matrix_filtered_shape = np.vstack(ratings_matrix_filtered).shape
    print("Dimenzije filtrirane matrice ocena su: ", ratings_matrix_filtered_shape)
    return np.vstack(ratings_matrix_filtered)

### Redukcija, filtriranje i podela skupa podataka (matrice ocena) na skupove za trening i test

In [12]:
# Redukcija matrice ocena
ratings_csr_reduced = ratings_csr_matrix[:83, :900]
print("Dimenzije redukovane matrice ocena su: ", ratings_csr_reduced.shape)

# Filtriranje matrice ocena tako da ostanu korisnici koji su ocenili preko 100 filmova
filtered_matrix = filter_matrix(ratings_csr_reduced, n = 100)
print(filtered_matrix)

# Podela skupa na train_and_validation i test
filtered_ratings_csr_matrix = sps.csr_matrix(filtered_matrix)
train_and_validation, test = split_train_test(filtered_ratings_csr_matrix, 20)

Dimenzije redukovane matrice ocena su:  (83, 900)
Dimenzije filtrirane matrice ocena su:  (10, 900)
[[0.  4.  5.  ... 0.  0.  0. ]
 [4.  3.  3.  ... 4.  5.  5. ]
 [5.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  3.  ... 0.  0.  0. ]
 [4.  0.  3.5 ... 3.5 4.5 0. ]
 [2.5 2.5 2.  ... 5.  5.  5. ]]
Odnos ocena u skupu za testiranje:  20 %
Odnos ocena u skupu za treniranje:  80 %
Ukupan broj korisnika:  10
Ukupan broj filmova:  900
Ukupan broj ne-nula ocena u svim redovima: 
 [314 212 103 136 113 114 167 112 119 189]


In [13]:
# Podela skupa train_and_validation na skupove train i validation
train, validation = split_train_test(train_and_validation, 30)

Odnos ocena u skupu za testiranje:  30 %
Odnos ocena u skupu za treniranje:  70 %
Ukupan broj korisnika:  10
Ukupan broj filmova:  900
Ukupan broj ne-nula ocena u svim redovima: 
 [251 169  82 108  90  91 133  89  95 151]


In [14]:
print('Dimenzije train skupa: ', train.shape)

Dimenzije train skupa:  (10, 900)


### Racunanje matrice slicnosti

In [15]:
# Kreiranje matrice slicnosti za filmove
similarity = cosine_similarity(train.T) + 0.000000003

### Racunanje odziva

In [16]:
# broj ocena razlicitih od 0 u izdvojenom train skupu za usera ciji je id = 1
user_id = 1
sum = 0
for i in range(train.shape[1]):
    if train[user_id,i] != 0.0:
        sum += 1
print('Broj ocena razlicitih od 0 za usera ciji je id = {} je {}.'.format(user_id, sum))

# Kraci zapis
# np.count_nonzero(train[1].toarray())

Broj ocena razlicitih od 0 za usera ciji je id = 1 je 118.


In [17]:
# Primer: ocena za user_id = 1 i movie_id = 686
user_id = 1
movie_id = 686
score = item_based_ratings_prediction(user_id, movie_id, similarity , train, k=1)
print('Ocena korisnika ciji je id = {} za film sa id-jem {} je {}.'.format(user_id, movie_id, score))

Ocena korisnika ciji je id = 1 za film sa id-jem 686 je 1.3333333333333333.


In [18]:
def get_predicted_movies_ids_and_scores(ratings_m, similarity, k, u_id, limit_score):
    # Recnik oblika {id_filma_1: ocena_1, id_filma_2: ocena_2, ...} za korisnika ciji je id = user_id
    dicts = {}

    keys = range(ratings_m.shape[1])

    for i in keys:
        dicts[i] = item_based_ratings_prediction(u_id, i, similarity, ratings_m, k)

    dicts_asc = {k: v for k, v in sorted(dicts.items(), key=lambda item: item[1])}

    # Predvidjene ocene vece od m
    movies_ids= []
    predicted_scores_values = []
    print('Predvidjene ocene vece od ', limit_score)
    for c in reversed(dicts_asc):
        if (dicts_asc[c] >= limit_score):
            tmp = np.round(dicts_asc[c])
            print(c, tmp)
            movies_ids.append(c)
            predicted_scores_values.append(tmp)

    return movies_ids, predicted_scores_values, dicts_asc

In [19]:
def get_all_actual_scores_for_user(ratings_m, u_id):
    dict_actual = {}

    keys = range(ratings_m.shape[1])
    for i in keys:
        dict_actual[i] = ratings_m[1, i]

    dict_actual_asc = {k: v for k, v in sorted(dict_actual.items(), key=lambda item: item[1])}
    return dict_actual_asc

In [20]:
def get_atual_scores_list(movies_indices, ratings_m, u_id):
    actual_for_predicted_scores = ratings_m[1, movies_indices].toarray()
    actual_for_predicted_scores_lst = list(actual_for_predicted_scores[0])
    return actual_for_predicted_scores_lst

In [21]:
def calculate_recall(ratings_m, similarity, k = 5, u_id = 1, limit_score = 4.5):
    movies_indices, predicted_scores_values, dicts_asc = get_predicted_movies_ids_and_scores(ratings_m, similarity, k, u_id, limit_score)
    print('Indeksi filmova cije su predvidjene ocene vece od {} su {}.'.format(limit_score, movies_indices))

    actual_ratings = get_all_actual_scores_for_user(ratings_m, u_id)

    actual_scores_lst = get_atual_scores_list(movies_indices, ratings_m, u_id)
    print('Prave ocene na datim indeksima: ', actual_scores_lst)

    movies_filtered_indices = []
    movies_filtered_ratings = []
    for i in movies_indices:
        if ratings_m[1, i] > 0:
            movies_filtered_indices.append(i)
            movies_filtered_ratings.append(ratings_m[1, i])
    print('Filtrirani indeksi (ocene vece od 0): ', movies_filtered_indices)
    print('Filtrirane ocene (vece od 0): ', movies_filtered_ratings)
    
    predicted_filtered_scores = []
    for key in dicts_asc:
        if key in movies_filtered_indices:
            predicted_filtered_scores.append(np.round(dicts_asc[key]))
    
    print('Filtrirane predvidjene ocene: ', predicted_filtered_scores)
    
    num_of_relevant_recommended_items = np.count_nonzero((np.array(movies_filtered_ratings) == np.array(predicted_filtered_scores))*1)
    recall = num_of_relevant_recommended_items/len(predicted_filtered_scores)
    return recall

In [22]:
recall_value = calculate_recall(train, similarity, k = 5, u_id = 1, limit_score = 3)
print('Recall: ', recall_value)

Predvidjene ocene vece od  3
472 5.0
422 4.0
254 4.0
116 4.0
334 4.0
606 3.0
862 3.0
507 3.0
224 3.0
857 3.0
695 3.0
560 3.0
559 3.0
558 3.0
217 3.0
618 3.0
Indeksi filmova cije su predvidjene ocene vece od 3 su [472, 422, 254, 116, 334, 606, 862, 507, 224, 857, 695, 560, 559, 558, 217, 618].
Prave ocene na datim indeksima:  [3.0, 2.0, 4.0, 1.0, 4.0, 3.0, 3.0, 5.0, 4.0, 3.0, 5.0, 4.0, 1.0, 2.0, 3.0, 4.0]
Filtrirani indeksi (ocene vece od 0):  [472, 422, 254, 116, 334, 606, 862, 507, 224, 857, 695, 560, 559, 558, 217, 618]
Filtrirane ocene (vece od 0):  [3.0, 2.0, 4.0, 1.0, 4.0, 3.0, 3.0, 5.0, 4.0, 3.0, 5.0, 4.0, 1.0, 2.0, 3.0, 4.0]
Filtrirane predvidjene ocene:  [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 5.0]
Recall:  0.3125


In [23]:
# 3.5 = 4 a treba da ostane 3.5 jer i ocene tog oblika postoje