In [8]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy import sparse as sps
from scipy import sparse
import statistics
from math import sqrt
from math import modf

In [9]:
EPS = 0.000000003

### Učitavanje i prikaz podataka, kreiranje retke matrice

In [10]:
data_path = 'datasets/'
ratings_filename = 'ratings.csv'

ratings_df = pd.read_csv(os.path.join(data_path, ratings_filename), usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [11]:
ratings_pivot_df = ratings_df.pivot(index='userId',columns='movieId',values='rating').fillna(0)
ratings_pivot_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
ratings_csr_matrix = sps.csr_matrix(ratings_pivot_df)
ratings_pivot_df.shape, print(ratings_csr_matrix.todense()) # toarray vraća an ndarray; todense vraća matrix

[[4.  0.  4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 2.  2.  ... 0.  0.  0. ]
 [3.  0.  0.  ... 0.  0.  0. ]
 [5.  0.  0.  ... 0.  0.  0. ]]


((610, 9724), None)

### Implementacije funkcija za podelu skupa, sortiranje, predvidjanje ocena i filtriranje matrice

In [13]:
# Podela datog skupa podataka na skup za trening i skup za test se u narednoj funkciji
# vrsi tako sto se iz datog skupa "uzima" procenat ocena koje se upisuju u test skup,
# a uklanjaju iz trening skupa

# parametar ratings_csr_matrix je skup podataka koji treba podeliti na trening i test skup
# percentage je broj koji predstavlja procenat ocena koje ulaze u test skup

def split_train_test(ratings_csr_matrix, percentage):
    
    test_ratings_number = percentage / 100
    print("Odnos ocena u skupu za testiranje: ", percentage, "%")
    print("Odnos ocena u skupu za treniranje: ", 100-percentage, "%")

    total_ratings = ratings_csr_matrix.toarray()

    dimensions_of_total_ratings = total_ratings.shape
    print("Ukupan broj korisnika: ", dimensions_of_total_ratings[0])
    print("Ukupan broj filmova: ", dimensions_of_total_ratings[1])

    test = np.zeros(dimensions_of_total_ratings)
    train = total_ratings.copy()

    nonzero_ratings_per_row = (total_ratings != 0).sum(1)
    print("Ukupan broj ne-nula ocena u svim redovima: \n", nonzero_ratings_per_row)
    
    for user in range(dimensions_of_total_ratings[0]):
        
        # indeksi ne-nula elemenata
        nonzero_test_ratings_per_user = int(np.ceil(test_ratings_number*nonzero_ratings_per_row[user]))
        
        test_ratings = np.random.choice(total_ratings[user, :].nonzero()[0], size = nonzero_test_ratings_per_user, replace = False)

        # ocene se izbacuju iz trening skupa (upisuju se nule na odabranim pozicijama)
        train[user, test_ratings] = 0

        # ocene se ubacuju u test skup (upisuju se ocene iz polaznog skupa, koje su uklonjene iz trening skupa)
        test[user, test_ratings] = total_ratings[user, test_ratings]
        
    if (not(np.all((train * test) == 0))):
        print("Greska!")
    else:    
        return sps.csr_matrix(train),sps.csr_matrix(test)

In [14]:
# Funkcija za sortiranje liste opadajuće

def sort_descending(li):
    # reverse (opcioni parametar)
    # reverse = True - sortirana lista se preokreće (postaje sortirana opadajuće)
    # reverse = None - lista ostaje sortirana rastuće
    li.sort(key = lambda x: x[0], reverse=True)
    return li

In [15]:
# Funkcija računa ocenu korisnika u za film i na osnovu matrice sličnosti filmova (item-based pristup)

def item_based_ratings_prediction(u, i, movies_similarity, ratings, k = 5):        

    neighbors = []
    similarities = list(zip(movies_similarity[i][:],range(movies_similarity.shape[0])))
    similarities_sorted = sort_descending(similarities)
    
    for i in range(1,k+1):
        neighbors.append(similarities_sorted[i][1])
        
    rated_i = ratings[:, i].nonzero()[0]
        
    item_i_mean = 0
    item_i_mean = (ratings[:, i].toarray()[0]).sum()
    
    if len(rated_i) != 0:
        item_i_mean = item_i_mean / len(rated_i)
        
    numerator, denominator = 0.0, 0.0
        
    for j in neighbors:
        rated_j = ratings[:, j].nonzero()[0]
        item_j_mean = 0
        
        if len(rated_j) != 0:
            arr2 = ratings[:, j].toarray()[0]
            item_j_mean = arr2.sum()
            item_j_mean = item_j_mean / len(rated_j)
        
        r_uj = ratings[u,j]
        numerator += movies_similarity[i][j]*(r_uj - item_j_mean)
        denominator += movies_similarity[i][j]
        
    return item_i_mean + numerator/denominator

In [16]:
# Funkcija filtrira datu matricu ocena tako da ostaju samo redovi korisnika koji su glasali više od n puta

def filter_matrix(matrix, n):
    ratings_array = matrix.toarray()
    ratings_matrix_filtered = []
    for row in ratings_array:
        sum = 0
        nonzero_count = np.count_nonzero(row)
        if (nonzero_count > n):
            ratings_matrix_filtered.append(row)
    ratings_matrix_filtered_shape = np.vstack(ratings_matrix_filtered).shape
    print("Dimenzije filtrirane matrice ocena su: ", ratings_matrix_filtered_shape)
    return np.vstack(ratings_matrix_filtered)

### Redukcija, filtriranje i podela skupa podataka (matrice ocena) na skupove za trening i test

In [17]:
# Redukcija matrice ocena
ratings_csr_reduced = ratings_csr_matrix[:83, :900]
print("Dimenzije redukovane matrice ocena su: ", ratings_csr_reduced.shape)

# Filtriranje matrice ocena tako da ostanu korisnici koji su ocenili preko 100 filmova
filtered_matrix = filter_matrix(ratings_csr_reduced, n = 100)
print('Filtrirana matrica: \n', filtered_matrix)

# Podela skupa na train_and_validation i test
filtered_ratings_csr_matrix = sps.csr_matrix(filtered_matrix)
train_and_validation, test = split_train_test(filtered_ratings_csr_matrix, 20)

Dimenzije redukovane matrice ocena su:  (83, 900)
Dimenzije filtrirane matrice ocena su:  (10, 900)
Filtrirana matrica: 
 [[0.  4.  5.  ... 0.  0.  0. ]
 [4.  3.  3.  ... 4.  5.  5. ]
 [5.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  3.  ... 0.  0.  0. ]
 [4.  0.  3.5 ... 3.5 4.5 0. ]
 [2.5 2.5 2.  ... 5.  5.  5. ]]
Odnos ocena u skupu za testiranje:  20 %
Odnos ocena u skupu za treniranje:  80 %
Ukupan broj korisnika:  10
Ukupan broj filmova:  900
Ukupan broj ne-nula ocena u svim redovima: 
 [314 212 103 136 113 114 167 112 119 189]


In [18]:
# Podela skupa train_and_validation na skupove train i validation
train, validation = split_train_test(train_and_validation, 30)

Odnos ocena u skupu za testiranje:  30 %
Odnos ocena u skupu za treniranje:  70 %
Ukupan broj korisnika:  10
Ukupan broj filmova:  900
Ukupan broj ne-nula ocena u svim redovima: 
 [251 169  82 108  90  91 133  89  95 151]


In [19]:
print('Dimenzije train skupa: ', train.shape)

Dimenzije train skupa:  (10, 900)


### Računanje matrice sličnosti

In [20]:
# Kreiranje matrice sličnosti za filmove
similarity = cosine_similarity(train.T) + EPS
print('Matrica sličnosti:\n', similarity)
print('\nDimenzije matrice sličnosti: ', similarity.shape)

Matrica sličnosti:
 [[1.0000000e+00 2.9706702e-01 3.4179297e-01 ... 2.2175480e-01
  2.0761439e-01 3.0000000e-09]
 [2.9706702e-01 1.0000000e+00 3.0000000e-09 ... 3.0000000e-09
  3.8190201e-01 3.0000000e-09]
 [3.4179297e-01 3.0000000e-09 9.9999994e-01 ... 3.0000000e-09
  3.0000000e-09 3.0000000e-09]
 ...
 [2.2175480e-01 3.0000000e-09 3.0000000e-09 ... 1.0000000e+00
  7.9660189e-01 3.0000000e-09]
 [2.0761439e-01 3.8190201e-01 3.0000000e-09 ... 7.9660189e-01
  1.0000000e+00 3.0000000e-09]
 [3.0000000e-09 3.0000000e-09 3.0000000e-09 ... 3.0000000e-09
  3.0000000e-09 3.0000000e-09]]

Dimenzije matrice sličnosti:  (900, 900)


### Računanje preciznosti

In [21]:
# Broj ocena različitih od 0 u izdvojenom train skupu za korisnika čiji je id = 1
user_id = 1
sum = 0
for i in range(train.shape[1]):
    if train[user_id,i] != 0.0:
        sum += 1
print('Broj ocena različitih od 0 za usera čiji je id = {} je {}.'.format(user_id, sum))

# Kraći zapis:
# np.count_nonzero(train[1].toarray())

Broj ocena različitih od 0 za usera čiji je id = 1 je 118.


In [22]:
# Primer: ocena za user_id = 1 i movie_id = 686
user_id = 1
movie_id = 686
score = item_based_ratings_prediction(user_id, movie_id, similarity , train, k=1)
print('Ocena korisnika čiji je id = {} za film sa id-jem {} je {}.'.format(user_id, movie_id, score))

Ocena korisnika čiji je id = 1 za film sa id-jem 686 je 4.0.


In [23]:
# Funkcija koja razlomljeni deo broja, ukoliko je "blizu" 0.5 "zaokružuje" na 0.5,
# a inače broj zaokružuje pozivom funkcije round.
# Ovakav način zaokruživanja je bitan zato što postoje ocene oblika n.m
# i ukoliko bi se koristila samo funkcija round, takve ocene bi bile izostavljene

def round_score(score_value):
    frac, whole = modf(score_value)

    if 0.46 < frac < 0.56:
        frac = 0.5
        new_score_value = whole + frac
    else:
        new_score_value = np.round(whole + frac)

    return new_score_value

In [24]:
# Funkcija kreira rečnik sa svim ocenama korisnika čiji je id = u_id,
# izdvaja indekse i ocene filmova koji su od korisika čiji je id = u_id dobili ocene veće od datog praga

def get_predicted_movies_ids_and_all_scores_for_user(ratings_m, similarity, k, u_id, limit_score):
    
    # Rečnik oblika {id_filma_1: ocena_1, id_filma_2: ocena_2, ...} za korisnika čiji je id = user_id
    dicts = {}

    keys = range(ratings_m.shape[1])

    # Ocene korisnika čiji je id = u_id i sve filmove (do id-a ratings_m.shape[1])
    for i in keys:
        dicts[i] = item_based_ratings_prediction(u_id, i, similarity, ratings_m, k)

    dicts_asc = {k: v for k, v in sorted(dicts.items(), key=lambda item: item[1])}

    # Predviđene ocene veće od m
    movies_ids= []
    predicted_scores_values = []
    print('Predvidjene ocene vece od ', limit_score)
    for c in reversed(dicts_asc):
        if (dicts_asc[c] >= limit_score):
            score_tmp = round_score(dicts_asc[c])
            print('Id filma: {} - Ocena filma: {}'.format(c, score_tmp))
            movies_ids.append(c)
            predicted_scores_values.append(score_tmp)

    # Povratne vrednosti su:
    # indeksi filmova koji su od korisnika čiji je id = u_id dobili ocene veće od m,
    # rečnik svih ocena korisnika čiji je id = u_id
    return movies_ids, dicts_asc, predicted_scores_values

In [35]:
# Funkcija koja ispisuje informacije o filmovima za koje su predviđene ocene veće od nekog praga
# i pravim ocenama za filmove za koje su predviđene ocene veće od praga

def get_atual_scores_list(ratings_m, u_id, similarity, k = 5, limit_score = 4.5):
    movies_indices, dicts_asc, predicted_scores_values = get_predicted_movies_ids_and_all_scores_for_user(ratings_m, similarity, k, u_id, limit_score)
    print('Indeksi filmova čije su predviđene ocene veće od {} su {}'.format(limit_score, movies_indices))
    
    # prave ocene u trening skupu za korisnika čiji je id = u_id
    actual_for_predicted_scores = ratings_m[u_id, movies_indices].toarray()
    actual_for_predicted_scores_lst = list(actual_for_predicted_scores[0])
    print('Prave ocene na indeksima čije su predviđene ocene veće od {} su {}'.format(limit_score, actual_for_predicted_scores_lst))
    
    return actual_for_predicted_scores_lst

In [26]:
# Funkcija koja računa preciznost na osnovu broja relevantnih (tačno predviđenih) ocena i broja predviđenih ocena

def calculate_precision(ratings_m, similarity, u_id, k = 5, limit_score = 4.5):
    movies_indices, dicts_asc = get_predicted_movies_ids_and_all_scores_for_user(ratings_m, similarity, k, u_id, limit_score)

    movies_filtered_indices = []
    movies_filtered_ratings = []
    for i in movies_indices:
        if ratings_m[1, i] > 0:
            movies_filtered_indices.append(i)
            movies_filtered_ratings.append(ratings_m[1, i])
    print('Filtrirani indeksi (ocene veće od 0): ', movies_filtered_indices)
    print('Filtrirane stvarne ocene (veće od 0): ', movies_filtered_ratings)
    
    predicted_filtered_scores = []
    for key in dicts_asc:
        if key in movies_filtered_indices:
            predicted_filtered_scores.append(round_score(dicts_asc[key]))
    print('Filtrirane predviđene ocene: ', predicted_filtered_scores)
    
    num_of_relevant_recommended_items = np.count_nonzero((np.array(movies_filtered_ratings) == np.array(predicted_filtered_scores))*1)
    
    if len(predicted_filtered_scores) != 0:
        precision = num_of_relevant_recommended_items/len(predicted_filtered_scores)
        return precision
    else:
        print('Greška zbog deljenja nulom.')
        return

In [58]:
movies_ids, dicts_asc, predicted_scores_values = get_predicted_movies_ids_and_all_scores_for_user(train, similarity, k = 5, u_id = 1, limit_score = 3.5)

Predvidjene ocene vece od  3.5
Id filma: 862 - Ocena filma: 4.0
Id filma: 406 - Ocena filma: 4.0
Id filma: 883 - Ocena filma: 4.0
Id filma: 714 - Ocena filma: 4.0
Id filma: 686 - Ocena filma: 4.0
Id filma: 560 - Ocena filma: 4.0
Id filma: 559 - Ocena filma: 4.0


In [59]:
actual_for_predicted_scores_lst = get_atual_scores_list(train, u_id = 1, similarity = similarity, k = 5, limit_score = 3.5)

Predvidjene ocene vece od  3.5
Id filma: 862 - Ocena filma: 4.0
Id filma: 406 - Ocena filma: 4.0
Id filma: 883 - Ocena filma: 4.0
Id filma: 714 - Ocena filma: 4.0
Id filma: 686 - Ocena filma: 4.0
Id filma: 560 - Ocena filma: 4.0
Id filma: 559 - Ocena filma: 4.0
Indeksi filmova čije su predviđene ocene veće od 3.5 su [862, 406, 883, 714, 686, 560, 559]
Prave ocene na indeksima čije su predviđene ocene veće od 3.5 su [3.0, 3.0, 2.0, 4.0, 5.0, 4.0, 1.0]


In [60]:
predicted_scores_values, actual_for_predicted_scores_lst

([4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0], [3.0, 3.0, 2.0, 4.0, 5.0, 4.0, 1.0])

In [62]:
def get_recall(limit, predicted_scores_values, actual_for_predicted_scores_lst):
    # limit = 3.5
    i = 0
    tp = 0
    fn = 0

    for predicted_score in predicted_scores_values:
        if actual_for_predicted_scores_lst[i] >= limit:
            tp += 1
        else:
            fn +=1
        i += 1

    recall = tp / (tp + fn)
    return recall

In [63]:
# Odziv
recall = get_recall(limit, predicted_scores_values, actual_for_predicted_scores_lst)
print('Odziv je: ', recall)

Odziv je:  0.42857142857142855


In [26]:
# Ispis predviđenih ocena, odgovarajućih indeksa filmova i pravih ocena
get_atual_scores_list(train, u_id = 1, similarity = similarity, k = 5, limit_score = 3)

Predvidjene ocene vece od  3
Id filma: 622 - Ocena filma: 5.0
Id filma: 336 - Ocena filma: 5.0
Id filma: 113 - Ocena filma: 5.0
Id filma: 483 - Ocena filma: 4.0
Id filma: 434 - Ocena filma: 4.0
Id filma: 592 - Ocena filma: 4.0
Id filma: 779 - Ocena filma: 4.0
Id filma: 191 - Ocena filma: 3.5
Id filma: 505 - Ocena filma: 3.5
Id filma: 857 - Ocena filma: 3.0
Id filma: 820 - Ocena filma: 3.0
Id filma: 816 - Ocena filma: 3.0
Id filma: 808 - Ocena filma: 3.0
Id filma: 750 - Ocena filma: 3.0
Id filma: 714 - Ocena filma: 3.0
Id filma: 711 - Ocena filma: 3.0
Id filma: 703 - Ocena filma: 3.0
Id filma: 697 - Ocena filma: 3.0
Id filma: 695 - Ocena filma: 3.0
Id filma: 686 - Ocena filma: 3.0
Id filma: 685 - Ocena filma: 3.0
Id filma: 669 - Ocena filma: 3.0
Id filma: 667 - Ocena filma: 3.0
Id filma: 649 - Ocena filma: 3.0
Id filma: 643 - Ocena filma: 3.0
Id filma: 598 - Ocena filma: 3.0
Id filma: 587 - Ocena filma: 3.0
Id filma: 585 - Ocena filma: 3.0
Id filma: 578 - Ocena filma: 3.0
Id filma: 560 

In [30]:
# Preciznost
precision_value = calculate_precision(train, similarity, u_id = 1, k = 6, limit_score = 3.5)
print('Precision = ', precision_value)

Predvidjene ocene vece od  3.5
Id filma: 296 - Ocena filma: 5.0
Id filma: 622 - Ocena filma: 5.0
Id filma: 592 - Ocena filma: 4.0
Id filma: 191 - Ocena filma: 4.0
Id filma: 779 - Ocena filma: 4.0
Id filma: 483 - Ocena filma: 4.0
Id filma: 58 - Ocena filma: 4.0
Id filma: 334 - Ocena filma: 4.0
Id filma: 830 - Ocena filma: 4.0
Id filma: 558 - Ocena filma: 4.0
Id filma: 95 - Ocena filma: 4.0
Id filma: 883 - Ocena filma: 4.0
Id filma: 637 - Ocena filma: 4.0
Id filma: 599 - Ocena filma: 4.0
Id filma: 559 - Ocena filma: 4.0
Id filma: 857 - Ocena filma: 4.0
Id filma: 820 - Ocena filma: 4.0
Id filma: 816 - Ocena filma: 4.0
Id filma: 808 - Ocena filma: 4.0
Id filma: 750 - Ocena filma: 4.0
Id filma: 714 - Ocena filma: 4.0
Id filma: 711 - Ocena filma: 4.0
Id filma: 703 - Ocena filma: 4.0
Id filma: 697 - Ocena filma: 4.0
Id filma: 695 - Ocena filma: 4.0
Id filma: 686 - Ocena filma: 4.0
Id filma: 685 - Ocena filma: 4.0
Id filma: 669 - Ocena filma: 4.0
Id filma: 667 - Ocena filma: 4.0
Id filma: 649 