In [42]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy import sparse as sps
from scipy import sparse
import statistics
from math import sqrt

# from ipynb.fs.full.knn_cf_movies_dataset import split_train_test

In [43]:
EPS = 0.000000003

### Učitavanje i prikaz skupa podataka, kreiranje retke matrice

In [44]:
data_path = 'datasets/'
ratings_filename = 'ratings.csv'
ratings_df = pd.read_csv(os.path.join(data_path, ratings_filename), usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
ratings_pivot_df = ratings_df.pivot(index='userId',columns='movieId',values='rating').fillna(0)
ratings_pivot_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
ratings_csr_matrix = sps.csr_matrix(ratings_pivot_df)
ratings_pivot_df.shape, print(ratings_csr_matrix.todense()) # toarray vraća ndarray; todense vraća matrix

[[4.  0.  4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 2.  2.  ... 0.  0.  0. ]
 [3.  0.  0.  ... 0.  0.  0. ]
 [5.  0.  0.  ... 0.  0.  0. ]]


((610, 9724), None)

### Implementacije funkcija za podelu skupa podataka, sortiranje i filtriranje

In [46]:
# Podela datog skupa podataka na skup za trening i skup za test se u narednoj funkciji
# vrši tako što se iz datog skupa "uzima" procenat ocena koje se upisuju u test skup,
# a uklanjaju iz trening skupa

# parametar ratings_csr_matrix je skup podataka koji treba podeliti na trening i test skup
# percentage je broj koji predstavlja procenat ocena koje ulaze u test skup

def split_train_test(ratings_csr_matrix, percentage):
    
    test_ratings_number = percentage / 100
    print("Odnos ocena u skupu za testiranje: ", percentage, "%")
    print("Odnos ocena u skupu za treniranje: ", 100-percentage, "%")

    total_ratings = ratings_csr_matrix.toarray()

    dimensions_of_total_ratings = total_ratings.shape
    print("Ukupan broj korisnika: ", dimensions_of_total_ratings[0])
    print("Ukupan broj filmova: ", dimensions_of_total_ratings[1])

    test = np.zeros(dimensions_of_total_ratings)
    train = total_ratings.copy()

    nonzero_ratings_per_row = (total_ratings != 0).sum(1)
    print("Ukupan broj ne-nula ocena u svim redovima: \n", nonzero_ratings_per_row)
    
    for user in range(dimensions_of_total_ratings[0]):
        
        # indeksi ne-nula elemenata
        nonzero_test_ratings_per_user = int(np.ceil(test_ratings_number*nonzero_ratings_per_row[user]))
        
        test_ratings = np.random.choice(total_ratings[user, :].nonzero()[0], size = nonzero_test_ratings_per_user, replace = False)
        
        # ocene se izbacuju iz trening skupa (upisuju se nule na odabranim pozicijama)
        train[user, test_ratings] = 0

        # ubacuju se u test skup (upisuju se ocene iz polaznog skupa, koje su uklonjene iz trening skupa)
        test[user, test_ratings] = total_ratings[user, test_ratings]
        
    if (not(np.all((train * test) == 0))):
        print("Greska!")
    else:    
        return sps.csr_matrix(train),sps.csr_matrix(test)

In [47]:
# Funkcija za sortiranje liste opadajuće

def sort_descending(li):
    # reverse (opcioni parametar)
    # reverse = True - sortirana lista se preokreće (postaje sortirana opadajuće)
    # reverse = None - lista ostaje sortirana rastuće
    li.sort(key = lambda x: x[0], reverse=True)
    return li

In [48]:
# Funkcija filtrira datu matricu ocena tako da ostaju samo redovi korisnika koji su glasali vise od n puta

def filter_matrix(matrix, n):
    ratings_array = matrix.toarray()
    ratings_matrix_filtered = []
    for row in ratings_array:
        sum = 0
        nonzero_count = np.count_nonzero(row)
        if (nonzero_count > n):
            ratings_matrix_filtered.append(row)
    ratings_matrix_filtered_shape = np.vstack(ratings_matrix_filtered).shape
    print("Dimenzije filtrirane matrice ocena su: ", ratings_matrix_filtered_shape)
    return np.vstack(ratings_matrix_filtered)

### Predvidjanje ocene na osnovu suseda iznad praga sličnosti

In [49]:
# Funkcija računa ocenu korisnika u za film i na osnovu suseda čija je sličnost sa korisnikom u veća od postavljenog praga

def user_based_similarity_limit_ratings_prediction(u, i, users_similarity, ratings, similarity_limit):
    
    neighbors = []
    
    similarities = list(zip(users_similarity[u][:], range(users_similarity.shape[0])))  # id-evi korisnika
    similarities_sorted = sort_descending(similarities)
    similarities_sorted_dict = dict(similarities_sorted)
    
    most_similar_users_dict = dict(filter(lambda elem: similarity_limit < elem[0] < 0.999, similarities_sorted_dict.items()))

    print('Korisnici cija je slicnost sa ciljnim korisnikom veca od {} su:'.format(similarity_limit))
    for key in most_similar_users_dict:
        print('Vrednost: {:.3f} - slicnost ciljnog korisnika id={} i korisnika id={}'.format(key, u, most_similar_users_dict[key]))
    
    for key in most_similar_users_dict:
        neighbors.append(most_similar_users_dict[key])
        
    print('Id-evi najslicnijih korisnika: ', neighbors)
    
    rated_by_u = np.count_nonzero(ratings[u].toarray()[0])
    print('Broj ocena koje je dao korisnik u: ', rated_by_u)

    user_u_mean = 0
    user_u_mean = np.sum(ratings[u])
        
    if rated_by_u != 0:
        user_u_mean = user_u_mean / rated_by_u 
    else:
        print('Korisnik {} nije dao ni jednu ocenu'.format(v))
    
    print('Prosecna ocena korisnika {} je: {}'.format(u, user_u_mean))
    numerator, denominator = 0.0, 0.0

    for v in neighbors:
        rated_by_v = np.count_nonzero(ratings[v].toarray()[0])
        user_v_mean = 0
        user_v_mean = np.sum(ratings[v])
        
        if rated_by_v != 0:
            user_v_mean = user_v_mean / rated_by_v
            print('Prosecna ocena korisnika {} je: {}'.format(v, user_v_mean))
        else:
            print('Korisnik {} nije dao ni jednu ocenu'.format(v))
        
        r_vi = ratings[v,i]
        numerator += float((users_similarity[u][v]))*float((r_vi - user_v_mean))
        denominator += float(users_similarity[u][v])
        
    print('-------------------------------------')
    print("Ocena korisnika u za film i: {:.3f}".format(user_u_mean + numerator/denominator))    
    print('-------------------------------------')

    return user_u_mean + numerator/denominator

### Izvrsavanje implementiranih funkcija - redukcija, filtriranje, podela skupa, racunanje matrice slicnosti i predvidjanje ocene

In [50]:
# Redukcija matrice ocena
ratings_csr_reduced = ratings_csr_matrix[:83, :900]
print("Dimenzije redukovane matrice ocena su: ", ratings_csr_reduced.shape)

# Filtriranje matrice ocena tako da ostanu korisnici koji su ocenili preko 100 filmova
filtered_matrix = filter_matrix(ratings_csr_reduced, n = 100)
print('Filtrirana matrica: \n', filtered_matrix)

# Podela skupa na train_and_validation i test
filtered_ratings_csr_matrix = sps.csr_matrix(filtered_matrix)
train_and_validation, test = split_train_test(filtered_ratings_csr_matrix, 20)

Dimenzije redukovane matrice ocena su:  (83, 900)
Dimenzije filtrirane matrice ocena su:  (10, 900)
Filtrirana matrica: 
 [[0.  4.  5.  ... 0.  0.  0. ]
 [4.  3.  3.  ... 4.  5.  5. ]
 [5.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  3.  ... 0.  0.  0. ]
 [4.  0.  3.5 ... 3.5 4.5 0. ]
 [2.5 2.5 2.  ... 5.  5.  5. ]]
Odnos ocena u skupu za testiranje:  20 %
Odnos ocena u skupu za treniranje:  80 %
Ukupan broj korisnika:  10
Ukupan broj filmova:  900
Ukupan broj ne-nula ocena u svim redovima: 
 [314 212 103 136 113 114 167 112 119 189]


In [51]:
# Podela skupa train_and_validation na skupove train i validation
train, validation = split_train_test(train_and_validation, 30)

Odnos ocena u skupu za testiranje:  30 %
Odnos ocena u skupu za treniranje:  70 %
Ukupan broj korisnika:  10
Ukupan broj filmova:  900
Ukupan broj ne-nula ocena u svim redovima: 
 [251 169  82 108  90  91 133  89  95 151]


In [52]:
# Kreiranje matrice sličnosti
similarity = cosine_similarity(train) + EPS
print(similarity)

[[0.99999875 0.17779072 0.25143448 0.16403002 0.29164058 0.23025678
  0.17998832 0.3528828  0.22164895 0.31254184]
 [0.17779072 0.9999994  0.16645023 0.20104311 0.16795905 0.27514765
  0.26195833 0.19785067 0.17109714 0.2526943 ]
 [0.25143448 0.16645023 1.0000002  0.10553927 0.1489402  0.14967065
  0.1237955  0.21546124 0.10633915 0.17537309]
 [0.16403002 0.20104311 0.10553927 1.         0.13625441 0.27447438
  0.18131205 0.15169543 0.197896   0.3012104 ]
 [0.29164058 0.16795905 0.1489402  0.13625441 1.0000004  0.20909394
  0.19659519 0.2739267  0.17234749 0.25583524]
 [0.23025678 0.27514765 0.14967065 0.27447438 0.20909394 0.9999999
  0.2808071  0.30172524 0.2643537  0.2769641 ]
 [0.17998832 0.26195833 0.1237955  0.18131205 0.19659519 0.2808071
  1.0000001  0.20880452 0.23676188 0.30804324]
 [0.3528828  0.19785067 0.21546124 0.15169543 0.2739267  0.30172524
  0.20880452 0.99999964 0.22941273 0.34913746]
 [0.22164895 0.17109714 0.10633915 0.197896   0.17234749 0.2643537
  0.23676188 0.

In [53]:
# Provera dimenzija
print("Dimenzije train skupa: ", train.shape)
print("Dimenzije matrice sličnosti: ", similarity.shape)

Dimenzije train skupa:  (10, 900)
Dimenzije matrice sličnosti:  (10, 10)


In [54]:
# Ocena korisnika čiji je id = 9 za film čiji je id = 10
result = user_based_similarity_limit_ratings_prediction(9, 10, similarity, train, 0.25)

Korisnici cija je slicnost sa ciljnim korisnikom veca od 0.25 su:
Vrednost: 0.349 - slicnost ciljnog korisnika id=9 i korisnika id=7
Vrednost: 0.313 - slicnost ciljnog korisnika id=9 i korisnika id=0
Vrednost: 0.308 - slicnost ciljnog korisnika id=9 i korisnika id=6
Vrednost: 0.301 - slicnost ciljnog korisnika id=9 i korisnika id=3
Vrednost: 0.288 - slicnost ciljnog korisnika id=9 i korisnika id=8
Vrednost: 0.277 - slicnost ciljnog korisnika id=9 i korisnika id=5
Vrednost: 0.256 - slicnost ciljnog korisnika id=9 i korisnika id=4
Vrednost: 0.253 - slicnost ciljnog korisnika id=9 i korisnika id=1
Id-evi najslicnijih korisnika:  [7, 0, 6, 3, 8, 5, 4, 1]
Broj ocena koje je dao korisnik u:  105
Prosecna ocena korisnika 9 je: 2.9047619047619047
Prosecna ocena korisnika 7 je: 3.8870967741935485
Prosecna ocena korisnika 0 je: 3.48
Prosecna ocena korisnika 6 je: 3.2580645161290325
Prosecna ocena korisnika 3 je: 3.5866666666666664
Prosecna ocena korisnika 8 je: 3.757575757575758
Prosecna ocena k