In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from scipy import sparse as sps
from scipy import sparse
import statistics
from math import sqrt

In [None]:
# Small positive offset that is added to every entry of the cosine-similarity matrix
EPS = 3e-9

In [None]:
# Loading the datasets

def loaddata(filename):
    """Read ``<filename>.csv`` into a DataFrame, silently skipping malformed rows.

    Parameters:
        filename (str): Path of the CSV file without the ``.csv`` extension.

    Returns:
        pd.DataFrame: The parsed file (comma separated, latin-1 encoded).
    """
    path = f'{filename}.csv'
    read_options = dict(sep=',', encoding='latin-1')
    try:
        # pandas >= 1.3: error_bad_lines / warn_bad_lines were deprecated in 1.3
        # and removed in 2.0 in favour of on_bad_lines.
        return pd.read_csv(path, on_bad_lines='skip', **read_options)
    except TypeError:
        # Older pandas does not know on_bad_lines; fall back to the legacy flags.
        return pd.read_csv(path, error_bad_lines=False, warn_bad_lines=False, **read_options)

# Product metadata first, then the review/rating records (evaluated left to right)
amazon_data, data = loaddata('skup_amazon/fashionMeta'), loaddata('skup_amazon/fashion')

In [23]:
# Drop the attributes that are not relevant for the rest of the implementation

rating = data.drop(
    columns=[
        'verified', 'reviewTime', 'reviewerName', 'reviewText', 'summary',
        'unixReviewTime', 'vote', 'style/Color:', 'style/Size:', 'image/0',
        'style/Metal Type:', 'image/1', 'image/2', 'image/3', 'image/4',
        'image/5', 'image/6', 'image/7', 'image/8', 'image/9', 'image/10',
        'image/11', 'image/12', 'image/13', 'style/Size Name:', 'style/Style:',
    ]
)
rating

NameError: ignored

In [24]:
def encode_user_item(df, user_col, item_col, rating_col):
    """Encode users and items as consecutive integer indices.

    Parameters:
        df (pd.DataFrame): Table of interactions; it is not modified.
        user_col (string): Name of the column that identifies the user.
        item_col (string): Name of the column that identifies the item.
        rating_col (string): Name of the column that holds the rating.

    Returns:
        encoded_df (pd.DataFrame): Copy of ``df`` with two added columns, "USER" and
            "ITEM", holding the integer codes, and with ``rating_col`` renamed to "RATING".
        user_encoder (LabelEncoder): Encoder fitted on the values of ``user_col``.
        item_encoder (LabelEncoder): Encoder fitted on the values of ``item_col``.
    """
    # Imported here because the notebook's import cell never brings LabelEncoder
    # into scope, which made this function fail with a NameError.
    from sklearn.preprocessing import LabelEncoder

    encoded_df = df.copy()

    # LabelEncoder maps each distinct value to an integer between 0 and n_classes-1.
    user_encoder = LabelEncoder()
    encoded_users = user_encoder.fit_transform(encoded_df[user_col].values)
    n_users = len(user_encoder.classes_)

    item_encoder = LabelEncoder()
    encoded_items = item_encoder.fit_transform(encoded_df[item_col].values)
    n_items = len(item_encoder.classes_)

    encoded_df["USER"] = encoded_users
    encoded_df["ITEM"] = encoded_items

    # rename() returns a new frame, so no in-place mutation is needed.
    encoded_df = encoded_df.rename(columns={rating_col: "RATING"})

    print("Broj korisnika: ", n_users)
    print("Broj stavki: ", n_items)

    return encoded_df, user_encoder, item_encoder

In [None]:
# `rating` is the reduced review table built above; the previous argument `ratings`
# is not defined anywhere in the notebook and raised a NameError.
DATA, user_encoder, item_encoder = encode_user_item(rating, "reviewerID", "asin", "overall")

In [None]:
# User x item table of ratings; combinations without a rating are filled with 0
ratings_pivot_df = (
    DATA
    .pivot(index='USER', columns='ITEM', values='RATING')
    .fillna(0)
)
ratings_pivot_df.head()

In [None]:
ratings_csr_matrix = sps.csr_matrix(ratings_pivot_df)

# toarray() returns an ndarray; todense() returns a matrix
print(ratings_csr_matrix.todense())

# Shown as the cell result on its own; the former tuple `shape, print(...)`
# rendered as `((rows, cols), None)` because print() returns None.
ratings_pivot_df.shape

In [None]:
# The function below splits a rating matrix into a training set and a test set by
# "taking" a percentage of every user's ratings: the chosen ratings are written to
# the test set and removed (zeroed) in the training set.

# ratings_csr_matrix is the data set that is split into the training and the test set
# percentage is the share (in percent) of the ratings that go into the test set

def split_train_test(ratings_csr_matrix, percentage, random_state=None):
    """Split a user x item rating matrix into disjoint train and test matrices.

    For every user (row), ``ceil(percentage % of that user's non-zero ratings)``
    randomly chosen ratings are moved to the test matrix and set to 0 in the
    training matrix.

    Parameters:
        ratings_csr_matrix: Sparse rating matrix (anything offering ``toarray()``);
            0 means "not rated".
        percentage: Percentage of each user's ratings that goes to the test set.
        random_state (int, optional): Seed for a reproducible split. When None
            (default) numpy's global random state is used, as before.

    Returns:
        (train, test): Two ``scipy.sparse.csr_matrix`` objects with the shape of the input.

    Raises:
        ValueError: If a rating ends up in both matrices (this used to print
            "Greska!" and return None, which broke tuple unpacking in callers).
    """
    print("Odnos ocena u skupu za testiranje: ", percentage, "%")
    print("Odnos ocena u skupu za treniranje: ", 100-percentage, "%")

    total_ratings = ratings_csr_matrix.toarray()

    dimensions_of_total_ratings = total_ratings.shape
    print("Ukupan broj korisnika: ", dimensions_of_total_ratings[0])
    print("Ukupan broj stavki: ", dimensions_of_total_ratings[1])

    # Both np.random and RandomState expose choice(); only the seeding differs.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    test = np.zeros(dimensions_of_total_ratings)
    train = total_ratings.copy()

    nonzero_ratings_per_row = (total_ratings != 0).sum(1)
    print("Ukupan broj ne-nula ocena u svim redovima: \n", nonzero_ratings_per_row)

    for user in range(dimensions_of_total_ratings[0]):

        # Multiply before dividing: (percentage / 100) * count picks up float error
        # (0.07 * 100 == 7.000000000000001) that ceil() would round up to 8.
        nonzero_test_ratings_per_user = int(np.ceil(percentage * nonzero_ratings_per_row[user] / 100))
        if nonzero_test_ratings_per_user == 0:
            continue  # nothing to move for this user

        # column indices of this user's non-zero ratings
        rated_items = total_ratings[user, :].nonzero()[0]
        test_ratings = rng.choice(rated_items, size=nonzero_test_ratings_per_user, replace=False)

        # the ratings are removed from the training set (zeros at the chosen positions)
        train[user, test_ratings] = 0

        # ... and written to the test set (values taken from the original matrix)
        test[user, test_ratings] = total_ratings[user, test_ratings]

    # train and test must never hold a rating at the same position
    if not np.all((train * test) == 0):
        raise ValueError("Greska! Trening i test skup se preklapaju.")

    return sps.csr_matrix(train), sps.csr_matrix(test)

In [None]:
# Sorts a list in descending order

def sort_descending(li):
    """Sort ``li`` in place by the first element of each entry, largest first.

    The same list object is returned. Entries with equal first elements keep
    their original relative order.
    """
    # reverse=True yields descending order; the default (False) would be ascending.
    # sorted() is stable just like list.sort(), and the slice assignment writes the
    # result back into the caller's list.
    li[:] = sorted(li, key=lambda entry: entry[0], reverse=True)
    return li

In [None]:
# Filters the rating matrix so that only the rows of users who rated more than n items remain

def filter_matrix(matrix, n):
    """Keep only the rows (users) of ``matrix`` that hold more than ``n`` non-zero ratings.

    Parameters:
        matrix: Sparse rating matrix (anything offering ``toarray()``).
        n (int): A row is kept only when its number of non-zero entries is > n.

    Returns:
        np.ndarray: Dense 2-D array with the kept rows in their original order.
            When no row qualifies, an array with 0 rows is returned (the previous
            implementation raised ValueError from np.vstack on an empty list).
    """
    ratings_array = matrix.toarray()
    # One boolean per row: did this user rate more than n items?
    enough_ratings = np.count_nonzero(ratings_array, axis=1) > n
    ratings_matrix_filtered = ratings_array[enough_ratings]
    print("Dimenzije filtrirane matrice ocena su: ", ratings_matrix_filtered.shape)
    return ratings_matrix_filtered

In [None]:
# Predicts the rating of user u for item i from the neighbours whose similarity to u exceeds a given threshold

def user_based_similarity_limit_ratings_prediction(u, i, users_similarity, ratings, similarity_limit):
    """Predict the rating of user ``u`` for item ``i`` from sufficiently similar users.

    The prediction is ``mean(u) + sum(sim(u, v) * (r[v, i] - mean(v))) / sum(sim(u, v))``
    over all neighbours ``v`` with ``similarity_limit < sim(u, v) < 0.999``. A user's
    mean is taken over that user's non-zero ratings only.

    Parameters:
        u (int): Row index of the target user.
        i (int): Column index of the item.
        users_similarity: Square array; ``users_similarity[u][v]`` is the similarity of u and v.
        ratings: Sparse rating matrix supporting ``ratings[u].toarray()`` and ``ratings[v, i]``.
        similarity_limit (float): Exclusive lower bound on a neighbour's similarity.

    Returns:
        float: The predicted rating. When no neighbour qualifies, the mean rating
            of ``u`` is returned (0.0 if ``u`` has not rated anything).

    NOTE(review): a neighbour that has not rated item i contributes r[v, i] = 0,
    exactly as in the original code — confirm this is intended.
    """

    def mean_rating(user):
        """Return (number of non-zero ratings, mean over them) for ``user``."""
        row = ratings[user].toarray()[0]
        rated = np.count_nonzero(row)
        return rated, (row.sum() / rated if rated else 0.0)

    # (similarity, user id) pairs, most similar first. A list of pairs is used
    # instead of a dict keyed by similarity, which silently dropped neighbours
    # that share the same similarity value.
    ranked = sorted(
        zip(users_similarity[u], range(users_similarity.shape[0])),
        key=lambda pair: pair[0],
        reverse=True,
    )
    # The 0.999 upper bound is kept from the original; it presumably served to
    # exclude the user themself, which is now also done explicitly.
    most_similar = [(sim, v) for sim, v in ranked
                    if v != u and similarity_limit < sim < 0.999]

    print('Korisnici cija je slicnost sa ciljnim korisnikom veca od {} su:'.format(similarity_limit))
    for sim, v in most_similar:
        print('Vrednost: {:.3f} - slicnost ciljnog korisnika id={} i korisnika id={}'.format(sim, u, v))

    neighbors = [v for _, v in most_similar]
    print('Id-evi najslicnijih korisnika: ', neighbors)

    rated_by_u, user_u_mean = mean_rating(u)
    print('Broj ocena koje je dao korisnik u: ', rated_by_u)
    if rated_by_u == 0:
        # The original formatted the not-yet-defined `v` here (UnboundLocalError).
        print('Korisnik {} nije dao ni jednu ocenu'.format(u))

    print('Prosecna ocena korisnika {} je: {}'.format(u, user_u_mean))
    numerator, denominator = 0.0, 0.0

    for sim, v in most_similar:
        rated_by_v, user_v_mean = mean_rating(v)
        if rated_by_v != 0:
            print('Prosecna ocena korisnika {} je: {}'.format(v, user_v_mean))
        else:
            print('Korisnik {} nije dao ni jednu ocenu'.format(v))

        r_vi = ratings[v, i]
        numerator += float(sim) * float(r_vi - user_v_mean)
        denominator += float(sim)

    if denominator != 0:
        prediction = user_u_mean + numerator / denominator
    else:
        # No usable neighbour: fall back to the user's own mean instead of dividing by zero.
        print('Nema suseda cija je slicnost veca od {}; vraca se prosecna ocena korisnika.'.format(similarity_limit))
        prediction = user_u_mean

    print('-------------------------------------')
    print("Ocena korisnika u za stavku i: {:.3f}".format(prediction))
    print('-------------------------------------')

    return prediction

In [None]:
# Predicts the rating of user u for item i from the ratings of randomly chosen neighbours

import random

def user_based_random_ratings_prediction(u, i, users_similarity, ratings, similarity_limit, rand_number):
    """Predict the rating of user ``u`` for item ``i`` from randomly chosen other users.

    The prediction is ``mean(u) + sum(sim(u, v) * (r[v, i] - mean(v))) / sum(sim(u, v))``
    over ``rand_number`` users ``v`` drawn without replacement from all users except
    ``u``. A user's mean is taken over that user's non-zero ratings only.

    Parameters:
        u (int): Row index of the target user.
        i (int): Column index of the item.
        users_similarity: Square array; ``users_similarity[u][v]`` is the similarity of u and v.
        ratings: Sparse rating matrix supporting ``ratings[u].toarray()`` and ``ratings[v, i]``.
        similarity_limit: Unused; kept so existing calls keep working.
        rand_number (int): Number of random neighbours to draw.

    Returns:
        float: The predicted rating. When the similarities of the drawn neighbours
            sum to 0 (e.g. ``rand_number == 0``), the mean rating of ``u`` is returned.

    Raises:
        ValueError: From ``random.sample`` when ``rand_number`` is negative or larger
            than the number of other users.

    NOTE(review): a neighbour that has not rated item i contributes r[v, i] = 0,
    exactly as in the original code — confirm this is intended.
    """

    def mean_rating(user):
        """Return (number of non-zero ratings, mean over them) for ``user``."""
        row = ratings[user].toarray()[0]
        rated = np.count_nonzero(row)
        return rated, (row.sum() / rated if rated else 0.0)

    # Draw from user ids, never from similarity values: keying a dict by similarity
    # merged users that share a value and could pick the target user as neighbour.
    candidates = [v for v in range(users_similarity.shape[0]) if v != u]
    neighbors = random.sample(candidates, rand_number)

    # The messages no longer claim that a similarity threshold was applied.
    print('Nasumicno odabrani susedi ciljnog korisnika su:')
    for v in neighbors:
        print('Vrednost: {:.3f} - slicnost ciljnog korisnika id={} i korisnika id={}'.format(users_similarity[u][v], u, v))

    print('Id-evi nasumicno odabranih korisnika: ', neighbors)

    rated_by_u, user_u_mean = mean_rating(u)
    print('Broj ocena koje je dao korisnik u: ', rated_by_u)
    if rated_by_u == 0:
        # The original formatted the not-yet-defined `v` here (UnboundLocalError).
        print('Korisnik {} nije dao ni jednu ocenu'.format(u))

    print('Prosecna ocena korisnika {} je: {}'.format(u, user_u_mean))
    numerator, denominator = 0.0, 0.0

    for v in neighbors:
        rated_by_v, user_v_mean = mean_rating(v)
        if rated_by_v != 0:
            print('Prosecna ocena korisnika {} je: {}'.format(v, user_v_mean))
        else:
            print('Korisnik {} nije dao ni jednu ocenu'.format(v))

        r_vi = ratings[v, i]
        numerator += float(users_similarity[u][v]) * float(r_vi - user_v_mean)
        denominator += float(users_similarity[u][v])

    if denominator != 0:
        prediction = user_u_mean + numerator / denominator
    else:
        # Nothing to weight by: fall back to the user's own mean instead of dividing by zero.
        print('Nema upotrebljivih suseda; vraca se prosecna ocena korisnika.')
        prediction = user_u_mean

    print('-------------------------------------')
    print("Ocena korisnika u za stavku i: {:.3f}".format(prediction))
    print('-------------------------------------')

    return prediction

In [None]:
# Reduce the rating matrix to its first 83 rows (users) and first 900 columns (items)
ratings_csr_reduced = ratings_csr_matrix[0:83, 0:900]
print("Dimenzije redukovane matrice ocena su: ", ratings_csr_reduced.shape)

# Filter the rating matrix so that only users who rated more than 100 items remain
filtered_matrix = filter_matrix(ratings_csr_reduced, 100)
print('Filtrirana matrica: \n', filtered_matrix)

# Split the data into train_and_validation and test (20 % of each user's ratings, rounded up)
filtered_ratings_csr_matrix = sps.csr_matrix(filtered_matrix)
train_and_validation, test = split_train_test(filtered_ratings_csr_matrix, percentage=20)

In [None]:
# Split train_and_validation further into train and validation
# (30 % of each user's remaining ratings, rounded up, go to validation)
train, validation = split_train_test(ratings_csr_matrix=train_and_validation, percentage=30)

In [None]:
# Build the user-user similarity matrix: cosine similarity of the train rows, shifted by EPS
similarity = EPS + cosine_similarity(train)
print(similarity)

In [None]:
# Check the dimensions: print the shape of the training matrix and of the
# similarity matrix computed from it so they can be compared by eye
print("Dimenzije train skupa: ", train.shape)
print("Dimenzije matrice sličnosti: ", similarity.shape)

In [None]:
# Rating of the user with id = 9 for the item with id = 10, based on 2 randomly chosen neighbours

result_random = user_based_random_ratings_prediction(
    u=9,
    i=10,
    users_similarity=similarity,
    ratings=train,
    similarity_limit=0.25,
    rand_number=2,
)

In [None]:
# Rating of the user with id = 9 for the item with id = 10, based on neighbours above the similarity threshold
result = user_based_similarity_limit_ratings_prediction(
    u=9,
    i=10,
    users_similarity=similarity,
    ratings=train,
    similarity_limit=0.25,
)