In [None]:
import os

import numpy as np
import pandas as pd
import torch
from google_drive_downloader import GoogleDriveDownloader as gdd
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

# Google Drive file ids of the two data archives, keyed by local destination.
DRIVE_FILES = {
    './ratings.dat': '1GNllmHs0cH8FQVu68AwrttMzere4KJhG',
    './movies.dat': '1V91TkGCwGFfeL08jpTbzsXBMndD_zWy0',
}

# Fetch the files in the order listed above; unzip=True is passed through to
# the downloader for each of them.
for dest_path, file_id in DRIVE_FILES.items():
    gdd.download_file_from_google_drive(file_id=file_id,
                                        dest_path=dest_path,
                                        unzip=True)

Downloading 1V91TkGCwGFfeL08jpTbzsXBMndD_zWy0 into ./movies.dat... Done.
Unzipping...



In [None]:
# Ratings are stored as '::'-separated records; column names are supplied
# explicitly. A multi-character separator needs the python parsing engine.
rating_data = pd.read_csv(
    'ratings.dat',
    sep='::',
    engine='python',
    names=['user_id', 'movie_id', 'rating', 'time'],
)

In [None]:
# Movie catalogue in the same '::'-separated layout: id, title, '|'-joined genres.
movies_data = pd.read_csv(
    'movies.dat',
    sep='::',
    engine='python',
    names=['movie_id', 'name', 'genre'],
)

movies_data

Unnamed: 0,movie_id,name,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama



# Data Preparation

In [None]:
# Calendar features used to construct the tensors below.
import datetime

# 'time' holds seconds since the Unix epoch.
rating_data['date'] = pd.to_datetime(rating_data['time'], unit='s', origin='unix')
# The .dt accessor extracts the fields in a vectorized way instead of calling
# a Python lambda once per rating row.
rating_data['year'] = rating_data['date'].dt.year
rating_data['month'] = rating_data['date'].dt.month

In [None]:
def data_truncate(key):
    """Return the (user_id, key, movie_id) view of the ratings plus a 'binary' flag.

    Reads the notebook-level ``rating_data`` frame and leaves it untouched.

    Parameters
    ----------
    key : str
        Name of the ``rating_data`` column to keep as the third tensor mode
        (the notebook passes 'month' and 'rating').

    Returns
    -------
    pandas.DataFrame
        Columns ``['user_id', key, 'movie_id', 'binary']`` sorted by
        ``user_id`` then ``movie_id``; ``binary`` is an int8 column of ones
        marking that the interaction was observed.
    """
    # .assign builds a new frame, so the flag is never written onto a slice of
    # rating_data (the original assignment raised SettingWithCopyWarning).
    user_item_key = (
        rating_data[['user_id', key, 'movie_id']]
        .assign(binary=np.ones(len(rating_data), dtype=np.int8))
        .sort_values(by=['user_id', 'movie_id'])
    )

    return user_item_key

In [None]:
def fill_extra_movies(data_user, column, movie_ids=None):
    """Append a ``binary == 0`` row for every movie each user has not interacted with.

    Parameters
    ----------
    data_user : pandas.DataFrame
        Frame with columns ``['user_id', column, 'movie_id', 'binary']``
        (the output of ``data_truncate``).
    column : str
        Name of the third-mode column ('month' or 'rating' in this notebook).
        Added rows get the placeholder value 1 in this column; their
        ``binary`` value is 0.
    movie_ids : array-like, optional
        Full catalogue of movie ids. Defaults to the notebook-level
        ``all_movies`` array, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        ``data_user`` followed by the added rows (index is not reset).
    """
    # Fall back to the global only when no catalogue is passed explicitly.
    catalogue = np.asarray(all_movies if movie_ids is None else movie_ids)

    extra_movies = []
    # One pass over the frame instead of re-filtering it for every user;
    # sort=False keeps users in order of first appearance, like .unique().
    rated_by_user = data_user.groupby('user_id', sort=False)['movie_id']

    for user, rated in tqdm(rated_by_user, total=rated_by_user.ngroups):
        # setdiff1d keeps the catalogue's integer dtype even when nothing is
        # missing, so a user who rated everything cannot upcast the result.
        add_movies = np.setdiff1d(catalogue, rated.to_numpy())
        qty_movies = add_movies.shape[0]

        res_frame = np.column_stack((
            np.full(qty_movies, user),            # user_id
            np.ones(qty_movies, dtype=np.int64),  # placeholder for `column`
            add_movies,                           # movie_id
            np.zeros(qty_movies, dtype=np.int8),  # binary = 0 (not observed)
        ))
        extra_movies.append(res_frame)

    if not extra_movies:
        # No users at all: nothing to add (np.vstack would raise on []).
        return data_user.copy()

    extra_frame = pd.DataFrame(np.vstack(extra_movies),
                               columns=['user_id', column, 'movie_id', 'binary'])
    data_new = pd.concat((data_user, extra_frame))

    return data_new

In [None]:
def create_pivot(data_user, column):
    """Pivot the 'binary' flag into a (user_id, movie_id) x ``column`` table.

    Rows are indexed by the (user_id, movie_id) pair, columns are the distinct
    values of ``column`` and cells hold the aggregated 'binary' value
    (pandas' default aggregation); absent combinations are left as NaN.
    """
    return data_user.pivot_table(
        values='binary',
        index=['user_id', 'movie_id'],
        columns=[column],
    )

## User $\times$ Item $\times$ Month

In [None]:
# (user_id, month, movie_id) rows for every rating, each flagged binary = 1.
user_item_month = data_truncate('month')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Sorted ids of every rated movie. fill_extra_movies reads `all_movies` as a
# global, so it has to be defined before the call below.
all_movies = np.sort(user_item_month.movie_id.unique()) #Calculate only once
# Add a binary = 0 row for each (user, movie) pair the user did not rate.
data_uim = fill_extra_movies(user_item_month, 'month')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




In [None]:
# Rows: (user_id, movie_id) pairs; columns: months; values: the binary flag.
uim_pivot = create_pivot(data_uim, 'month')

In [None]:
# Combinations absent from the pivot are NaN; treat them as "no interaction".
uim_pivot = uim_pivot.fillna(0)

In [None]:
num_of_users = rating_data.user_id.nunique() #Calculate only once
num_of_movies = rating_data.movie_id.nunique() #Calculate only once
num_of_months = rating_data.month.nunique()

# The reshape is only meaningful when the pivot holds exactly one row per
# (user, movie) pair and one column per month; fail loudly otherwise instead
# of silently producing a misaligned tensor.
expected_uim_shape = (num_of_users * num_of_movies, num_of_months)
assert uim_pivot.shape == expected_uim_shape, (
    f"uim_pivot has shape {uim_pivot.shape}, expected {expected_uim_shape}"
)

# Unfold the (user, movie) row index into two tensor modes.
user_movie_month_array = uim_pivot.values.reshape(num_of_users, num_of_movies, num_of_months)
sparse_user_movie_month = torch.tensor(user_movie_month_array).to_sparse()

## User $\times$ Item $\times$ Rate

In [None]:
# (user_id, rating, movie_id) rows for every rating, each flagged binary = 1.
user_item_rate = data_truncate('rating')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Add a binary = 0 row for each (user, movie) pair the user did not rate.
# Relies on the global `all_movies` computed in the month section.
data_uir = fill_extra_movies(user_item_rate, 'rating')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




In [None]:
%%time 

# Rows: (user_id, movie_id) pairs; columns: rating values; values: the binary flag.
uir_pivot = create_pivot(data_uir, 'rating')

CPU times: user 21.7 s, sys: 3.32 s, total: 25 s
Wall time: 25.1 s


In [None]:
# Combinations absent from the pivot are NaN; treat them as "no interaction".
uir_pivot = uir_pivot.fillna(0)

In [None]:
num_of_rates = rating_data.rating.nunique()

# The reshape is only meaningful when the pivot holds exactly one row per
# (user, movie) pair and one column per rating value; fail loudly otherwise.
expected_uir_shape = (num_of_users * num_of_movies, num_of_rates)
assert uir_pivot.shape == expected_uir_shape, (
    f"uir_pivot has shape {uir_pivot.shape}, expected {expected_uir_shape}"
)

# Unfold the (user, movie) row index into two tensor modes.
user_movie_rate_array = uir_pivot.values.reshape(num_of_users, num_of_movies, num_of_rates)
sparse_user_movie_rate = torch.tensor(user_movie_rate_array).to_sparse()

# Part II. Decompose (more decompositions?)

### 1. [Tucker decomposition](http://tensorly.org/stable/user_guide/tensor_decomposition.html)

In [None]:
# Uncomment if tensorly is not installed in the current environment:
# !pip install tensorly

In [None]:
# Reuse the cached sparse tensor when it exists; otherwise fall back to the
# tensor built in Part I and cache it, so the notebook also runs from a fresh
# checkout where the .pt file has never been written.
SPARSE_RATE_PATH = './sparse_user_movie_rate.pt'

if os.path.exists(SPARSE_RATE_PATH):
    # NOTE: torch.load unpickles the file - only load caches produced by this notebook.
    sparse_rate_tensor = torch.load(SPARSE_RATE_PATH)
else:
    sparse_rate_tensor = sparse_user_movie_rate
    torch.save(sparse_rate_tensor, SPARSE_RATE_PATH)

# tensorly is fed a dense numpy array below.
user_movie_rate_array = np.array(sparse_rate_tensor.to_dense())

In [None]:
# Sanity check: expected layout is (users, movies, rating values).
user_movie_rate_array.shape

(6040, 3706, 5)

In [None]:
%%time

from tensorly.decomposition import tucker

# Decompose the user x movie x rating tensor (the month tensor is not decomposed in this cell).
# `rank` sets the core size per mode; U, V, W are the user, movie and rating factors.

core, (U, V, W) = tucker(user_movie_rate_array, rank=(20, 20, 5))

CPU times: user 1min 33s, sys: 3.56 s, total: 1min 37s
Wall time: 51.6 s


In [None]:
# Shapes of the core tensor and of the user (U), movie (V) and rating (W) factors.
core.shape, U.shape, V.shape, W.shape

((20, 20, 5), (6040, 20), (3706, 20), (5, 5))

In [None]:
# (movies x rating values) slice of the first user along axis 0.
user0 = user_movie_rate_array[0]
user0.shape

(3706, 5)

In [None]:
# Reconstruct the user's slice from the movie (V) and rating (W) factors only;
# U and the core are not used here.
# NOTE(review): this is an orthogonal projection only if V and W have
# orthonormal columns - confirm for the tensorly version in use.
R_user0 = V @ V.T @ user0 @ W @ W.T

In [None]:
# Attach movie title and genre to every rating via the shared movie_id key.
merge_frames = pd.merge(rating_data, movies_data, how='inner', on='movie_id')

In [None]:
# Preview the joined ratings / movies table.
merge_frames.head()

Unnamed: 0,user_id,movie_id,rating,time,date,year,month,name,genre
0,1,1193,5,978300760,2000-12-31 22:12:40,2000,12,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,2000-12-31 21:33:33,2000,12,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,2000-12-30 23:49:39,2000,12,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,2000-12-30 18:01:19,2000,12,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,2000-12-30 06:41:11,2000,12,One Flew Over the Cuckoo's Nest (1975),Drama


In [None]:
# The 20 highest-rated movies of user_id 1, for comparison with the scores below.
(
    merge_frames
    .loc[merge_frames['user_id'].eq(1)]
    .sort_values('rating', ascending=False)
    .head(20)
)

Unnamed: 0,user_id,movie_id,rating,time,date,year,month,name,genre
0,1,1193,5,978300760,2000-12-31 22:12:40,2000,12,One Flew Over the Cuckoo's Nest (1975),Drama
50759,1,1029,5,978302205,2000-12-31 22:36:45,2000,12,Dumbo (1941),Animation|Children's|Musical
41626,1,1,5,978824268,2001-01-06 23:37:48,2001,1,Toy Story (1995),Animation|Children's|Comedy
19503,1,3105,5,978301713,2000-12-31 22:28:33,2000,12,Awakenings (1990),Drama
43703,1,1961,5,978301590,2000-12-31 22:26:30,2000,12,Rain Man (1988),Drama
25853,1,527,5,978824195,2001-01-06 23:36:35,2001,1,Schindler's List (1993),Drama|War
37339,1,1022,5,978300055,2000-12-31 22:00:55,2000,12,Cinderella (1950),Animation|Children's|Musical
15859,1,1035,5,978301753,2000-12-31 22:29:13,2000,12,"Sound of Music, The (1965)",Musical
28501,1,48,5,978824351,2001-01-06 23:39:11,2001,1,Pocahontas (1995),Animation|Children's|Musical|Romance
49748,1,1028,5,978301777,2000-12-31 22:29:37,2000,12,Mary Poppins (1964),Children's|Comedy|Musical


In [None]:
# Reconstructed score of every movie in the last rating column
# (presumably rating == 5 - confirm the column order of the rating pivot).
probs = R_user0[:, -1]

# Row i of R_user0 is the i-th movie along the tensor's movie axis, NOT the
# movie with movie_id == i: the axis comes from a pivot indexed by movie_id,
# i.e. the sorted unique ids. Translate positions to real ids before comparing
# them with the ids the user has already rated.
movie_ids = np.sort(rating_data['movie_id'].unique())
assert len(movie_ids) == len(probs), (
    f"{len(movie_ids)} movie ids but {len(probs)} scores - axis mismatch"
)

movies_user0 = merge_frames[merge_frames['user_id'] == 1].movie_id.unique()
seen_movie_ids = set(movies_user0.tolist())

# movie_id -> score, restricted to movies the user has not rated yet.
range_array = {
    int(movie_id): float(score)
    for movie_id, score in zip(movie_ids, probs)
    if int(movie_id) not in seen_movie_ids
}

In [None]:
# Candidates ordered from highest to lowest score (ties keep insertion order).
{
    key: range_array[key]
    for key in sorted(range_array, key=range_array.get, reverse=True)
}

{309: 0.34401964398406654,
 513: 0.3324525978971312,
 0: 0.26977323180040447,
 858: 0.26955931589891424,
 1025: 0.2639847592069071,
 346: 0.255524995252296,
 579: 0.23311197359644711,
 581: 0.20967695548752735,
 2557: 0.20373633057838655,
 144: 0.19455355558415913,
 1178: 0.18975633746619944,
 892: 0.18477012027544262,
 1563: 0.1845001029507634,
 2898: 0.1841601405029983,
 354: 0.18387015914431595,
 574: 0.18010631427190163,
 576: 0.177971739057177,
 963: 0.17763934749165972,
 1107: 0.17718731062621165,
 1848: 0.1765068289472686,
 1108: 0.1743238322666704,
 2599: 0.16816395921428898,
 2203: 0.16120633374057836,
 1781: 0.1576702008633432,
 970: 0.15610434208328094,
 1117: 0.15564983173463345,
 2931: 0.14979124370878752,
 1104: 0.14615968455079761,
 1900: 0.14390564789261526,
 1135: 0.14333150872503095,
 1154: 0.14262281063917928,
 580: 0.14144764540320753,
 1215: 0.1395144434908062,
 2131: 0.1388776151362067,
 106: 0.1383803112103742,
 957: 0.13087155798734387,
 1190: 0.1302359907094089

In [None]:
# Look up the best-scoring candidate instead of hard-coding a key copied from
# a previous run's output.
# NOTE(review): this treats range_array keys as movie_id values - confirm the
# keys are ids and not positional indices into the tensor's movie axis.
top_key = max(range_array, key=range_array.get)
movies_data[movies_data['movie_id'] == top_key]

Unnamed: 0,movie_id,name,genre
306,309,"Red Firecracker, Green Firecracker (1994)",Drama


### 2. [HOSVD decomposition](https://pytorch.org/docs/stable/generated/torch.svd.html) or [Lowrank HOSVD decomposition](https://pytorch.org/docs/stable/generated/torch.svd_lowrank.html)

In [None]:
# decompose both tensors