In [116]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy import sparse as sps
import statistics
from math import sqrt
from scipy import sparse

# from ipynb.fs.full.knn_cf_movies_dataset import split_train_test

### Ucitavanje i prikaz skupa podataka i kreiranje retke matrice

In [55]:
# Directory and file name of the ratings CSV.
data_path = 'datasets/'
ratings_filename = 'ratings.csv'

# Read only the three columns that are used, with compact 32-bit dtypes.
ratings_df = pd.read_csv(
    os.path.join(data_path, ratings_filename),
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)

# One row per user, one column per movie; a missing rating is stored as 0.
ratings_pivot_df = ratings_df.pivot(
    index='userId',
    columns='movieId',
    values='rating',
).fillna(0)
ratings_pivot_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# Store the user x movie pivot table as a SciPy CSR sparse matrix.
ratings_csr_matrix = sps.csr_matrix(ratings_pivot_df)
# NOTE(review): the cell value is a tuple whose second element is the return
# value of print(), which is why the displayed result ends with `None`.
ratings_pivot_df.shape, print(ratings_csr_matrix.todense()) # toarray() returns an ndarray; todense() returns a matrix.

[[4.  0.  4.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 2.  2.  ... 0.  0.  0. ]
 [3.  0.  0.  ... 0.  0.  0. ]
 [5.  0.  0.  ... 0.  0.  0. ]]


((610, 9724), None)

### Podela skupa podataka, funkcije za sortiranje i filtriranje

In [57]:
def split_train_test(ratings_csr_matrix, percentage, random_state=None):
    """Split a user x item rating matrix into disjoint train and test matrices.

    For every user (row) with ``k`` non-zero ratings, ``ceil(percentage * k / 100)``
    of them are drawn at random without replacement. The drawn ratings are
    copied into the test matrix and overwritten with zeros in the train matrix.
    A zero entry is treated as "no rating".

    Parameters
    ----------
    ratings_csr_matrix : scipy.sparse matrix
        The ratings to split (anything exposing ``toarray()``).
    percentage : int or float
        Percentage (0-100) of each user's ratings that goes to the test set.
    random_state : int or None, optional
        Seed for a private random generator, making the split reproducible.
        With the default ``None`` the global NumPy random state is used.

    Returns
    -------
    tuple of scipy.sparse.csr_matrix
        ``(train, test)``, both with the shape of the input.

    Raises
    ------
    ValueError
        If ``percentage`` is outside ``[0, 100]``.
    RuntimeError
        If a position ends up non-zero in both train and test.
    """
    if not 0 <= percentage <= 100:
        raise ValueError(
            "Parametar percentage mora biti u intervalu [0, 100], a dobijeno je: {}".format(percentage)
        )

    print("Odnos ocena u skupu za testiranje: ", percentage, "%")
    print("Odnos ocena u skupu za treniranje: ", 100-percentage, "%")

    total_ratings = ratings_csr_matrix.toarray()

    dimensions_of_total_ratings = total_ratings.shape
    print("Ukupan broj korisnika: ", dimensions_of_total_ratings[0])
    print("Ukupan broj filmova: ", dimensions_of_total_ratings[1])

    test = np.zeros(dimensions_of_total_ratings)
    train = total_ratings.copy()

    nonzero_ratings_per_row = (total_ratings != 0).sum(1)
    print("Ukupan broj ne-nula ocena u svim redovima: \n", nonzero_ratings_per_row)

    # Both np.random and RandomState expose the same .choice() API.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for user in range(dimensions_of_total_ratings[0]):
        # Multiply before dividing: (percentage / 100) * k can land just above
        # an integer (0.07 * 100 == 7.000000000000001) and ceil() would then
        # move one rating too many into the test set.
        nonzero_test_ratings_per_user = int(
            np.ceil(percentage * nonzero_ratings_per_row[user] / 100)
        )
        if nonzero_test_ratings_per_user == 0:
            # Nothing to move for this user (no ratings, or percentage == 0).
            continue

        rated_columns = total_ratings[user, :].nonzero()[0]  # indices of non-zero ratings
        test_ratings = rng.choice(rated_columns, size=nonzero_test_ratings_per_user, replace=False)

        train[user, test_ratings] = 0  # removed from the train set
        test[user, test_ratings] = total_ratings[user, test_ratings]  # copied into the test set

    # Fail loudly instead of printing and implicitly returning None, which
    # would make the caller's tuple unpacking crash with an unrelated TypeError.
    if not np.all((train * test) == 0):
        raise RuntimeError("Greska! Skupovi za trening i test se preklapaju.")

    return sps.csr_matrix(train), sps.csr_matrix(test)

In [58]:
def sort_descending(li):
    """Sort ``li`` in place, largest first, by the first element of each item.

    The sort is stable, so items with an equal first element keep their
    original relative order. The same list object is returned for convenience.
    """
    def first_element(item):
        return item[0]

    li.sort(reverse=True, key=first_element)
    return li

In [59]:
def filter_matrix(matrix, n):
    """Keep only the rows (users) that hold more than ``n`` non-zero ratings.

    Parameters
    ----------
    matrix : scipy.sparse matrix or 2-D array-like
        User x item ratings; a zero entry counts as "not rated".
    n : int
        Threshold; a row is kept when its number of non-zero entries is
        strictly greater than ``n``.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array with the kept rows in their original order. When no
        row qualifies, the result has shape ``(0, n_columns)``.
    """
    # Sparse matrices expose toarray(); anything else is treated as dense.
    ratings_array = matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)

    # A boolean mask over the rows handles the "nothing passes" case without
    # np.vstack([]) raising.
    rows_to_keep = np.count_nonzero(ratings_array, axis=1) > n
    ratings_matrix_filtered = ratings_array[rows_to_keep]

    print("Dimenzije filtrirane matrice ocena su: ", ratings_matrix_filtered.shape)
    return ratings_matrix_filtered

### Predvidjanje ocene na osnovu suseda iznad praga slicnosti
Naredna funkcija racuna ocenu korisnika $u$ za film $i$ na osnovu suseda cija je slicnost sa korisnikom $u$ veca od postavljenog praga.

In [127]:
def _rated_mean(ratings, user):
    """Return ``(mean, count)`` over the non-zero ratings in row ``user``."""
    row = ratings[user]
    row = row.toarray() if hasattr(row, "toarray") else np.asarray(row)
    row = row.ravel()
    count = int(np.count_nonzero(row))
    mean = float(row.sum()) / count if count else 0.0
    return mean, count


def user_based_similarity_limit_ratings_prediction(u, i, users_similarity, ratings, similarity_limit):
    """Predict the rating of user ``u`` for item ``i`` from sufficiently similar users.

    The neighbourhood is every user other than ``u`` whose similarity to ``u``
    is strictly greater than ``similarity_limit``. The prediction is the
    mean-centred weighted average

        mean_u + sum_v sim(u, v) * (r_vi - mean_v) / sum_v |sim(u, v)|

    where the sums run over neighbours ``v`` that actually rated item ``i``
    (a zero entry means "not rated") and each mean is taken over that user's
    non-zero ratings. When no neighbour contributes, the mean rating of ``u``
    is returned.

    Parameters
    ----------
    u : int
        Row index of the target user.
    i : int
        Column index of the target item.
    users_similarity : 2-D numpy.ndarray
        Square user x user similarity matrix.
    ratings : scipy.sparse matrix or 2-D numpy.ndarray
        User x item ratings, indexed like ``users_similarity``.
    similarity_limit : float
        Minimum (exclusive) similarity for a user to count as a neighbour.

    Returns
    -------
    float
        The predicted rating.
    """
    # (similarity, user index) pairs, most similar first. A list of pairs is
    # used rather than a dict keyed by similarity, because two users with the
    # same similarity would otherwise overwrite each other.
    similarities_sorted = sorted(
        zip(users_similarity[u][:], range(users_similarity.shape[0])),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # The target user is excluded by index, not by a "similarity < 0.999"
    # cut-off that would also discard genuinely near-identical users.
    most_similar_users = [
        (similarity, v)
        for similarity, v in similarities_sorted
        if v != u and similarity > similarity_limit
    ]

    print('Korisnici cija je slicnost sa ciljnim korisnikom veca od {} su:'.format(similarity_limit))
    for similarity, v in most_similar_users:
        print('Vrednost: {:.3f} - slicnost ciljnog korisnika id={} i korisnika id={}'.format(similarity, u, v))

    neighbors = [v for _, v in most_similar_users]
    print('Id-evi najslicnijih korisnika: ', neighbors)

    user_u_mean, rated_by_u = _rated_mean(ratings, u)
    print('Broj ocena koje je dao korisnik u: ', rated_by_u)
    if rated_by_u == 0:
        # Report the target user here; the neighbour variable is not bound yet.
        print('Korisnik {} nije dao ni jednu ocenu'.format(u))
    print('Prosecna ocena korisnika {} je: {}'.format(u, user_u_mean))

    numerator, denominator = 0.0, 0.0
    for v in neighbors:
        user_v_mean, rated_by_v = _rated_mean(ratings, v)
        if rated_by_v == 0:
            print('Korisnik {} nije dao ni jednu ocenu'.format(v))
            continue
        print('Prosecna ocena korisnika {} je: {}'.format(v, user_v_mean))

        r_vi = float(ratings[v, i])
        if r_vi == 0:
            # Neighbour has not rated item i: it carries no information about
            # the item and must not be counted as a rating of 0.
            continue

        similarity_uv = float(users_similarity[u][v])
        numerator += similarity_uv * (r_vi - user_v_mean)
        denominator += abs(similarity_uv)

    # Without contributing neighbours, fall back to the user's own mean
    # instead of dividing by zero.
    prediction = user_u_mean
    if denominator != 0:
        prediction += numerator / denominator

    print('-------------------------------------')
    print("Ocena korisnika u za film i: {:.3f}".format(prediction))
    print('-------------------------------------')

    return prediction

### Izvrsavanje funkcija - redukcija, filtriranje, podela skupa, racunanje matrice slicnosti i predvidjanje ocene

In [80]:
# Seed NumPy's global random state so the random train/test splits are
# reproducible on "Restart & Run All".
np.random.seed(42)

# Reduce the rating matrix to its first 83 users and first 900 movies
ratings_csr_reduced = ratings_csr_matrix[:83, :900]
print("Dimenzije redukovane matrice ocena su: ", ratings_csr_reduced.shape)

# Keep only the users who rated more than 100 of the remaining movies
filtered_matrix = filter_matrix(ratings_csr_reduced, n = 100)
print(filtered_matrix)

# Split the filtered ratings into train_and_validation (80 %) and test (20 %)
filtered_ratings_csr_matrix = sps.csr_matrix(filtered_matrix)
train_and_validation, test = split_train_test(filtered_ratings_csr_matrix, 20)

Dimenzije redukovane matrice ocena su:  (83, 900)
Dimenzije filtrirane matrice ocena su:  (10, 900)
[[0.  4.  5.  ... 0.  0.  0. ]
 [4.  3.  3.  ... 4.  5.  5. ]
 [5.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  3.  ... 0.  0.  0. ]
 [4.  0.  3.5 ... 3.5 4.5 0. ]
 [2.5 2.5 2.  ... 5.  5.  5. ]]
Odnos ocena u skupu za testiranje:  20 %
Odnos ocena u skupu za treniranje:  80 %
Ukupan broj korisnika:  10
Ukupan broj filmova:  900
Ukupan broj ne-nula ocena u svim redovima: 
 [314 212 103 136 113 114 167 112 119 189]


In [81]:
# Split train_and_validation further: 30 % of each user's remaining ratings
# go to validation, the rest stay in train
train, validation = split_train_test(train_and_validation, 30)

Odnos ocena u skupu za testiranje:  30 %
Odnos ocena u skupu za treniranje:  70 %
Ukupan broj korisnika:  10
Ukupan broj filmova:  900
Ukupan broj ne-nula ocena u svim redovima: 
 [251 169  82 108  90  91 133  89  95 151]


In [86]:
# Pairwise cosine similarity between the rows (users) of the train matrix.
# NOTE(review): a tiny constant is added to every entry, presumably so that a
# sum of similarities used as a divisor is never exactly zero. TODO confirm.
similarity = cosine_similarity(train) + 0.000000003
similarity

array([[1.0000002 , 0.16277936, 0.27327317, 0.19374663, 0.2555583 ,
        0.21333008, 0.23916195, 0.32696146, 0.21519339, 0.2503244 ],
       [0.16277936, 0.99999994, 0.16807643, 0.13646555, 0.16734196,
        0.19811103, 0.2525869 , 0.15948555, 0.23196897, 0.26004037],
       [0.27327317, 0.16807643, 1.0000004 , 0.14022163, 0.23449685,
        0.15672866, 0.116445  , 0.14406376, 0.20123106, 0.13845986],
       [0.19374663, 0.13646555, 0.14022163, 1.        , 0.19320029,
        0.24567717, 0.17143135, 0.17294814, 0.34652174, 0.29451516],
       [0.2555583 , 0.16734196, 0.23449685, 0.19320029, 0.9999998 ,
        0.13902248, 0.09823661, 0.16648668, 0.14491849, 0.17497712],
       [0.21333008, 0.19811103, 0.15672866, 0.24567717, 0.13902248,
        1.        , 0.30451655, 0.22773357, 0.3348273 , 0.28336543],
       [0.23916195, 0.2525869 , 0.116445  , 0.17143135, 0.09823661,
        0.30451655, 0.99999976, 0.18227284, 0.26072818, 0.2980686 ],
       [0.32696146, 0.15948555, 0.1440637

In [120]:
# Sanity check: the similarity matrix must be square, with one row/column per
# row (user) of the train matrix
train.shape, similarity.shape

((10, 900), (10, 10))

In [128]:
# Predict the rating of the user in row 9 for the movie in column 10, using
# neighbours whose similarity exceeds 0.25. The indices are row/column
# positions in the filtered train matrix, not original userId/movieId values.
result = user_based_similarity_limit_ratings_prediction(9, 10, similarity, train, 0.25)

Korisnici cija je slicnost sa ciljnim korisnikom veca od 0.25 su:
Vrednost: 0.324 - slicnost ciljnog korisnika id=9 i korisnika id=8
Vrednost: 0.298 - slicnost ciljnog korisnika id=9 i korisnika id=6
Vrednost: 0.295 - slicnost ciljnog korisnika id=9 i korisnika id=3
Vrednost: 0.283 - slicnost ciljnog korisnika id=9 i korisnika id=5
Vrednost: 0.282 - slicnost ciljnog korisnika id=9 i korisnika id=7
Vrednost: 0.260 - slicnost ciljnog korisnika id=9 i korisnika id=1
Vrednost: 0.250 - slicnost ciljnog korisnika id=9 i korisnika id=0
Id-evi najslicnijih korisnika:  [8, 6, 3, 5, 7, 1, 0]
Broj ocena koje je dao korisnik u:  105
Prosecna ocena korisnika 9 je: 2.933333333333333
Prosecna ocena korisnika 8 je: 3.696969696969697
Prosecna ocena korisnika 6 je: 3.3225806451612905
Prosecna ocena korisnika 3 je: 3.56
Prosecna ocena korisnika 5 je: 3.865079365079365
Prosecna ocena korisnika 7 je: 3.9516129032258065
Prosecna ocena korisnika 1 je: 2.610169491525424
Prosecna ocena korisnika 0 je: 3.537142