In [309]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('dark')
%matplotlib inline
from  sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

In [310]:
import data_preparation

In [311]:
(user_item_train, n_users_train, n_items_train), (user_item_test, n_users_test, n_items_test), movie_data = data_preparation_pipeline()

In [312]:
def get_similarity(data, ui='user'):
    if ui == 'user':
        user_similarity = pairwise_distances(np.nan_to_num(data), metric='cosine')
        return user_similarity
    else:
        item_similarity = pairwise_distances(np.nan_to_num(data).T, metric='cosine')
        return item_similarity

In [313]:
def baseline_predict(n_users, n_movies):
    pred = np.full((n_users, n_movies), 3)
    return pred

In [317]:
def naive_predict(train_data_matrix, n_users, n_movies, top):
    train_data_matrix = train_data_matrix.to_numpy()
    # Структура для хранения для каждого пользователя оценки фильмов top наиболее похожих на него пользователей:
    # top_similar_ratings[0][1] - оценки всех фильмов одного из наиболее похожих пользователей на пользователя с ид 0.
    # Здесь 1 - это не ид пользователя, а просто порядковый номер.
    top_similar_ratings = np.zeros((n_users, top, n_movies))

    # user_similarity = get_similarity(train_data_matrix)
    user_similarity = cosine_similarity(np.nan_to_num(train_data_matrix))

    for i in range(n_users):
        # Для каждого пользователя необходимо получить наиболее похожих пользователей:
        # Нулевой элемент не подходит, т.к. на этом месте находится похожесть пользователя самого на себя
        top_sim_users = np.flip(user_similarity[i].argsort())[1:top + 1]
        
        # берём только оценки из "обучающей" выборки 
        top_similar_ratings[i] = train_data_matrix[top_sim_users]

    pred = np.zeros((n_users, n_movies))
    for i in range(n_users):
        pred[i] = top_similar_ratings[i].sum(axis=0) / top
    
    return pred


def naive_predict_item(train_data_matrix, n_users, n_movies, top):
    train_data_matrix = train_data_matrix.to_numpy()

    top_similar_ratings = np.zeros((n_movies, top, n_users))

    # item_similarity = get_similarity(train_data_matrix, ui='item')
    item_similarity = cosine_similarity(np.nan_to_num(train_data_matrix.T))

    for i in range(n_movies):
        top_sim_movies = np.flip(item_similarity[i].argsort())[1:top + 1]
        top_similar_ratings[i] = train_data_matrix.T[top_sim_movies]
        
    pred = np.zeros((n_movies, n_users))
    for i in range(n_movies):
        pred[i] = top_similar_ratings[i].sum(axis=0) / top
    
    return pred.T

In [318]:
baseline_pred = baseline_predict(n_users_train, n_items_train)
print('Baseline RMSE: ', rmse(baseline_pred, user_item_test.to_numpy()))

naive_pred = naive_predict(user_item_train, n_users_train, n_items_train, 7)
print('User-based CF RMSE: ', rmse(naive_pred, user_item_test.to_numpy()))

naive_pred_item = naive_predict_item(user_item_train, n_users_train, n_items_train, 7)
print('Item-based CF RMSE: ', rmse(naive_pred_item, user_item_test.to_numpy()))

20000 20000
1.400475
Baseline RMSE:  1.183416663732601
14 14
1.8432944606413995
User-based CF RMSE:  1.3576798078491847
181 181
0.8066862103957605
Item-based CF RMSE:  0.8981571189918613


In [319]:
def k_fract_predict(train_data_matrix, n_users, n_movies, top):
    train_data_matrix = train_data_matrix.to_numpy()
    
    top_similar = np.zeros((n_users, top))
    
    # user_similarity = get_similarity(train_data_matrix)
    user_similarity = cosine_similarity(np.nan_to_num(train_data_matrix))
    
    for i in range(n_users):
        user_sim = user_similarity[i]
        top_sim_users = np.flip(user_sim.argsort())[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_users[j]
            
    abs_sim = np.abs(user_similarity)
    pred = np.zeros((n_users, n_movies))
    
    for i in range(n_users):
        indexes = top_similar[i].astype(np.int64)
        numerator = user_similarity[i][indexes]
        
        product = numerator.dot(train_data_matrix[indexes])
        
        denominator = abs_sim[i][top_similar[i].astype(np.int64)].sum()
        
        pred[i] = product / denominator
    
    return pred


def k_fract_predict_item(train_data_matrix, n_users, n_movies, top):
    train_data_matrix = train_data_matrix.to_numpy()
    
    top_similar = np.zeros((n_movies, top))
    
    # item_similarity = get_similarity(train_data_matrix, ui='item')
    item_similarity = cosine_similarity(np.nan_to_num(train_data_matrix.T))
    
    for i in range(n_movies):
        movies_sim = item_similarity[i]
        top_sim_movies = np.flip(movies_sim.argsort())[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_movies.T[j]
            
    abs_sim = np.abs(item_similarity)
    pred = np.zeros((n_movies, n_users))
    
    
    for i in range(n_users):
        indexes = top_similar[i].astype(np.int64)
        numerator = item_similarity[i][indexes]
        
        product = numerator.dot(train_data_matrix.T[indexes])
        
        denominator = abs_sim[i][indexes].sum()
        denominator = denominator if denominator != 0 else 1
        
        pred[i] = product / denominator
        
    return pred.T


In [320]:
k_predict = k_fract_predict(user_item_train, n_users_train, n_items_train, 7)
print('User-based CF RMSE: ', rmse(k_predict, user_item_test.to_numpy()))

k_predict_item = k_fract_predict_item(user_item_train, n_users_train, n_items_train, 7)
print('Item-based CF RMSE: ', rmse(k_predict_item, user_item_test.to_numpy()))

14 14
1.835988552930256
User-based CF RMSE:  1.354986550830028
22 22
1.2715897748292977
Item-based CF RMSE:  1.1276478948808877


In [337]:
def k_fract_mean_predict(train_data_matrix, n_users, n_movies, top):
    train_data_matrix = train_data_matrix.to_numpy()
    
    top_similar = np.zeros((n_users, top))
    
    # user_similarity = get_similarity(train_data_matrix)
    user_similarity = cosine_similarity(np.nan_to_num(train_data_matrix))
    
    for i in range(n_users):
        user_sim = user_similarity[i]
        top_sim_users = np.flip(user_sim.argsort())[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_users[j]
            
    abs_sim = np.abs(user_similarity)
    pred = np.zeros((n_users, n_movies))
    
    for i in range(n_users):
        indexes = top_similar[i].astype(np.int64)
        numerator = user_similarity[i][indexes]
        
        mean_rating = np.array([x for x in train_data_matrix[i] if x > 0]).mean()
        # print(mean_rating)
        diff_ratings = (np.nan_to_num(train_data_matrix[indexes]).T - np.true_divide(np.nan_to_num(train_data_matrix[indexes]).sum(1),(np.nan_to_num(train_data_matrix[indexes])!=0).sum(1))).T
        print(np.true_divide(np.nan_to_num(train_data_matrix[indexes]).sum(1),(np.nan_to_num(train_data_matrix[indexes])!=0).sum(1)))
        numerator = numerator.dot(diff_ratings)
        denominator = abs_sim[i][top_similar[i].astype(np.int64)].sum()
        pred[i] = -mean_rating*0.5 + mean_rating + mean_rating + numerator / denominator
        # print(denominator)
        
    return pred

def k_fract_mean_predict_item(train_data_matrix, n_users, n_movies, top):
    train_data_matrix = train_data_matrix.to_numpy()
    
    top_similar = np.zeros((n_movies, top))
    
    # item_similarity = get_similarity(train_data_matrix, ui='item')
    item_similarity = cosine_similarity(np.nan_to_num(train_data_matrix.T))
    
    for i in range(n_movies):
        movie_sim = item_similarity[i]
        top_sim_movies = np.flip(movie_sim.argsort())[1:top + 1]
        
        for j in range(top):
            top_similar[i, j] = top_sim_movies[j]
    
    abs_sim = np.abs(item_similarity)
    pred = np.zeros((n_movies, n_users))
    
    for i in range(n_movies):
        indexes = top_similar[i].astype(np.int64)
        numerator = item_similarity[i][indexes]
        
        diff_ratings = train_data_matrix.T[indexes] - train_data_matrix.T[indexes].mean()
        numerator = numerator.dot(diff_ratings)
        denominator = abs_sim[i][top_similar[i].astype(np.int64)].sum()
        denominator = denominator if denominator != 0 else 1
        
        mean_rating = np.array([x for x in train_data_matrix.T[i] if x > 0]).mean()
        mean_rating = 0 if np.isnan(mean_rating) else mean_rating
        pred[i] = mean_rating + numerator / denominator
                
    return pred.T

In [338]:
k_predict = k_fract_mean_predict(user_item_train, n_users_train, n_items_train, 7)
print('User-based CF RMSE: ', rmse(k_predict, user_item_test.to_numpy()))

k_predict_item = k_fract_mean_predict_item(user_item_train, n_users_train, n_items_train, 7)
print('Item-based CF RMSE: ', rmse(k_predict_item, user_item_test.to_numpy()))

[3.80514706 3.19911504 3.81531532 3.62941176 3.98842593 3.40310078
 3.77052239]
[4.02469136 3.4375     3.35549133 3.56072874 3.55182927 3.98412698
 3.46827795]
[4.13251366 3.38793103 3.875      3.30585106 2.81368821 4.42307692
 2.5094162 ]
[4.07471264 4.13251366 3.41823899 3.30585106 3.875      3.77160494
 3.31734317]
[4.0125     3.3877551  3.79347826 2.97142857 3.69565217 2.912
 3.65454545]
[2.91712707 3.29141104 3.64285714 4.13218391 3.18050542 3.88976378
 3.56603774]
[3.29141104 4.21100917 3.69565217 3.76404494 4.13218391 3.24691358
 3.44347826]
[3.3877551  4.04444444 3.98412698 3.58695652 3.92857143 3.65454545
 2.912     ]
[4.19883041 3.31838565 3.30272953 4.20206767 3.68113772 2.50517598
 3.46943765]
[3.7991453  3.51785714 3.46943765 3.875      3.55182927 3.35549133
 3.71226415]
[3.39285714 3.95081967 3.2254902  3.55182927 3.56603774 3.35549133
 4.01041667]
[3.35549133 3.88414634 3.46827795 4.15533981 3.45138889 2.50517598
 3.4375    ]
[3.61023622 3.67261905 3.48031496 4.15526316 

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [240]:
a = np.array([[1, 1, 0, 1, 0],
              [1, 1, 0, 1, 0],
              [1, 0, 0, 0, 1],
              [0, 0, 1, 0, 1],
              [-1, -1, 0, -1, 0],
              [-1, -1, 0, -1, 0]])
b = np.array([1, 2, 3, 4, 5, 6])
print(np.true_divide(a.sum(1),(a!=0).sum(1)))
print(get_similarity(a))
print(cosine_similarity(a))

[ 1.  1.  1.  1. -1. -1.]
[[0.         0.         0.59175171 1.         2.         2.        ]
 [0.         0.         0.59175171 1.         2.         2.        ]
 [0.59175171 0.59175171 0.         0.5        1.40824829 1.40824829]
 [1.         1.         0.5        0.         1.         1.        ]
 [2.         2.         1.40824829 1.         0.         0.        ]
 [2.         2.         1.40824829 1.         0.         0.        ]]
[[ 1.          1.          0.40824829  0.         -1.         -1.        ]
 [ 1.          1.          0.40824829  0.         -1.         -1.        ]
 [ 0.40824829  0.40824829  1.          0.5        -0.40824829 -0.40824829]
 [ 0.          0.          0.5         1.          0.          0.        ]
 [-1.         -1.         -0.40824829  0.          1.          1.        ]
 [-1.         -1.         -0.40824829  0.          1.          1.        ]]


In [316]:
a = np.array([[1, 1, 0, 1, 0],
              [1, 1, 0, 1, 0],
              [1, 0, 0, 0, 1],
              [0, 0, 1, 0, 1],
              [-1, -1, 0, -1, 0],
              [-1, -1, 0, -1, 0]])
b = np.array([[1, 1, 0, 1, 0],
              [1, 1, 0, 1, 0],
              [1, 0, 0, 0, 1],
              [0, 0, 1, 0, 1],
              [-1, -1, 0, -1, 0],
              [-1, -1, 0, -1, 0]])
rmse(a, -b)

16 16
4.0


2.0