# Part I. Prepare data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import pandas as pd
import os

# Read files

path_folder = '/content/drive/MyDrive/Sk/NLA/Project/RecommendationSystemsProject/files/'

ratings_path = os.path.join(path_folder, 'ratings.dat')
movies_path = os.path.join(path_folder, 'movies.dat')
users_path = os.path.join(path_folder, 'users.dat')

rating_data = pd.io.parsers.read_csv(ratings_path, 
    names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter='::')

movie_data = pd.io.parsers.read_csv(movies_path,
    names=['movie_id', 'title', 'genre'],
    engine='python', delimiter='::')

### 1. User $\times$ Item $\times$ Month (from 1 to 12)

In [2]:
# construct tensor I here
import datetime

rating_data['date'] = pd.to_datetime(rating_data['time'], unit='s', origin='unix')
rating_data['year'] = rating_data['date'].apply(lambda x: x.year)
rating_data['month'] = rating_data['date'].apply(lambda x: x.month)


rating_data.sort_values(by='date')

Unnamed: 0,user_id,movie_id,rating,time,date,year,month
1000138,6040,858,4,956703932,2000-04-25 23:05:32,2000,4
1000153,6040,2384,4,956703954,2000-04-25 23:05:54,2000,4
999873,6040,593,5,956703954,2000-04-25 23:05:54,2000,4
1000007,6040,1961,4,956703977,2000-04-25 23:06:17,2000,4
1000192,6040,2019,5,956703977,2000-04-25 23:06:17,2000,4
...,...,...,...,...,...,...,...
825793,4958,2399,1,1046454338,2003-02-28 17:45:38,2003,2
825438,4958,1407,5,1046454443,2003-02-28 17:47:23,2003,2
825724,4958,3264,4,1046454548,2003-02-28 17:49:08,2003,2
825731,4958,2634,3,1046454548,2003-02-28 17:49:08,2003,2


In [3]:
user_item_month = rating_data[['user_id', 'month', 'movie_id']]
user_item_month['binary'] = np.ones(user_item_month.shape[0], dtype=np.int8)
user_item_month = user_item_month.sort_values(by=['user_id', 'movie_id'])
user_item_month.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,user_id,month,movie_id,binary
40,1,1,1,1
25,1,1,48,1
39,1,12,150,1
44,1,12,260,1
23,1,1,527,1


In [4]:
all_movies = np.sort(user_item_month.movie_id.unique())
all_movies

array([   1,    2,    3, ..., 3950, 3951, 3952])

In [5]:
from tqdm import tqdm_notebook

extra_movies = []
users = user_item_month.user_id.unique()

for user in tqdm_notebook(users):
    current_user_frame = user_item_month[user_item_month['user_id'] == user]
    movies_user = current_user_frame['movie_id'].unique()

    add_movies = np.array(list(set(all_movies) - set(movies_user)))
    qty_movies = add_movies.shape[0]

    user_rate = np.tile([user, 1], qty_movies).reshape(qty_movies, 2)
    binary_col = np.zeros(qty_movies, dtype=np.int8)[:, None]

    res_frame = np.hstack((user_rate, add_movies[:, None], binary_col))
    extra_movies.append(res_frame)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




In [6]:
extra_frame = pd.DataFrame(np.vstack(extra_movies), columns=['user_id', 'month', 'movie_id', 'binary'])

extra_frame

Unnamed: 0,user_id,month,movie_id,binary
0,1,1,2,0
1,1,1,3,0
2,1,1,4,0
3,1,1,5,0
4,1,1,6,0
...,...,...,...,...
21384026,6040,1,3948,0
21384027,6040,1,3949,0
21384028,6040,1,3950,0
21384029,6040,1,3951,0


In [7]:
data_uim = pd.concat((user_item_month, extra_frame))

In [8]:
data_uim

Unnamed: 0,user_id,month,movie_id,binary
40,1,1,1,1
25,1,1,48,1
39,1,12,150,1
44,1,12,260,1
23,1,1,527,1
...,...,...,...,...
21384026,6040,1,3948,0
21384027,6040,1,3949,0
21384028,6040,1,3950,0
21384029,6040,1,3951,0


In [9]:
%%time

uim_pivot = pd.pivot_table(data_uim, values='binary', index=['user_id', 'movie_id'],
                    columns=['month'])

CPU times: user 28.1 s, sys: 7.09 s, total: 35.1 s
Wall time: 35.2 s


In [10]:
uim_pivot = uim_pivot.fillna(0)

In [14]:
num_of_users = rating_data.user_id.nunique()
num_of_movies = rating_data.movie_id.nunique()
num_of_months = rating_data.month.nunique()

user_movie_month_array = uim_pivot.values.reshape(num_of_users, num_of_movies, num_of_months)

In [16]:
from scipy.sparse import csr_matrix, coo_matrix
import torch

sparse_user_movie_month = torch.tensor(user_movie_month_array).to_sparse()

In [17]:
torch.save(sparse_user_movie_month, 'sparse_user_movie_month.pt')

### 2. User $\times$ Item $\times$ Rating (from 1 to 10)

In [None]:
# contruct tensor II here

print(rating_data.rating.value_counts())
rating_data.rating.max(), rating_data.rating.min()

4    348971
3    261197
5    226310
2    107557
1     56174
Name: rating, dtype: int64


(5, 1)

In [None]:
import datetime

rating_data['date'] = pd.to_datetime(rating_data['time'], unit='s', origin='unix')
rating_data['year'] = rating_data['date'].apply(lambda x: x.year)
rating_data['month'] = rating_data['date'].apply(lambda x: x.month)


rating_data.sort_values(by='date')

Unnamed: 0,user_id,movie_id,rating,time,date,year,month
1000138,6040,858,4,956703932,2000-04-25 23:05:32,2000,4
1000153,6040,2384,4,956703954,2000-04-25 23:05:54,2000,4
999873,6040,593,5,956703954,2000-04-25 23:05:54,2000,4
1000007,6040,1961,4,956703977,2000-04-25 23:06:17,2000,4
1000192,6040,2019,5,956703977,2000-04-25 23:06:17,2000,4
...,...,...,...,...,...,...,...
825793,4958,2399,1,1046454338,2003-02-28 17:45:38,2003,2
825438,4958,1407,5,1046454443,2003-02-28 17:47:23,2003,2
825724,4958,3264,4,1046454548,2003-02-28 17:49:08,2003,2
825731,4958,2634,3,1046454548,2003-02-28 17:49:08,2003,2


In [None]:
rating_data.user_id.nunique(),  rating_data.rating.nunique(), rating_data.movie_id.nunique()

(6040, 5, 3706)

In [None]:
user_item_rat = rating_data[['user_id', 'rating', 'movie_id']]
user_item_rat['binary'] = np.ones(user_item_rat.shape[0], dtype=np.int8)
user_item_rat = user_item_rat.sort_values(by=['user_id', 'movie_id'])
user_item_rat.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,user_id,rating,movie_id,binary
40,1,5,1,1
25,1,5,48,1
39,1,5,150,1
44,1,4,260,1
23,1,5,527,1


In [None]:
user_item_rat[['user_id', 'movie_id']].groupby(['user_id'])['movie_id'].value_counts().sort_values()

user_id  movie_id
1        1           1
4013     3697        1
         3698        1
         3699        1
         3705        1
                    ..
1968     1641        1
         1644        1
         1645        1
         1672        1
6040     3819        1
Name: movie_id, Length: 1000209, dtype: int64

In [None]:
all_movies = np.sort(user_item_rat.movie_id.unique())
all_movies

array([   1,    2,    3, ..., 3950, 3951, 3952])

In [None]:
from tqdm import tqdm_notebook

extra_movies = []
users = user_item_rat.user_id.unique()

for user in tqdm_notebook(users):
    current_user_frame = user_item_rat[user_item_rat['user_id'] == user]
    movies_user = current_user_frame['movie_id'].unique()

    add_movies = np.array(list(set(all_movies) - set(movies_user)))
    qty_movies = add_movies.shape[0]

    user_rate = np.tile([user, 1], qty_movies).reshape(qty_movies, 2)
    binary_col = np.zeros(qty_movies, dtype=np.int8)[:, None]

    res_frame = np.hstack((user_rate, add_movies[:, None], binary_col))
    extra_movies.append(res_frame)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=6040.0), HTML(value='')))




In [None]:
extra_frame = pd.DataFrame(np.vstack(extra_movies), columns=['user_id', 'rating', 'movie_id', 'binary'])

extra_frame

Unnamed: 0,user_id,rating,movie_id,binary
0,1,1,2,0
1,1,1,3,0
2,1,1,4,0
3,1,1,5,0
4,1,1,6,0
...,...,...,...,...
21384026,6040,1,3948,0
21384027,6040,1,3949,0
21384028,6040,1,3950,0
21384029,6040,1,3951,0


In [None]:
data_uir = pd.concat((user_item_rat, extra_frame))

In [None]:
data_uir

Unnamed: 0,user_id,rating,movie_id,binary
40,1,5,1,1
25,1,5,48,1
39,1,5,150,1
44,1,4,260,1
23,1,5,527,1
...,...,...,...,...
21384026,6040,1,3948,0
21384027,6040,1,3949,0
21384028,6040,1,3950,0
21384029,6040,1,3951,0


In [None]:
%%time

uir_pivot = pd.pivot_table(data_uir, values='binary', index=['user_id', 'movie_id'],
                    columns=['rating'])

CPU times: user 22 s, sys: 1.86 s, total: 23.8 s
Wall time: 23.8 s


In [None]:
uir_pivot = uir_pivot.fillna(0)

In [None]:
uir_pivot

Unnamed: 0_level_0,rating,1,2,3,4,5
user_id,movie_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,0.0,0.0,0.0,0.0,1.0
1,2,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0
1,4,0.0,0.0,0.0,0.0,0.0
1,5,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
6040,3948,0.0,0.0,0.0,0.0,0.0
6040,3949,0.0,0.0,0.0,0.0,0.0
6040,3950,0.0,0.0,0.0,0.0,0.0
6040,3951,0.0,0.0,0.0,0.0,0.0


In [None]:
num_of_users = rating_data.user_id.nunique()
num_of_movies = rating_data.rating.nunique()
num_of_rates = rating_data.movie_id.nunique()

user_movie_rate_array = uir_pivot.values.reshape(num_of_users, num_of_rates, num_of_movies)

In [None]:
from scipy.sparse import csr_matrix, coo_matrix
import torch

sparse_user_movie_rate = torch.tensor(user_movie_rate_array).to_sparse()

In [None]:
torch.save(sparse_user_movie_rate, 'sparse_user_movie_rate.pt')

# Part II. Decompose (more decompositions?)

### 1.[ Tucker decomposition](http://tensorly.org/stable/user_guide/tensor_decomposition.html)

In [None]:
# decompose both tensors

### 2. [HOSVD decomposition](https://pytorch.org/docs/stable/generated/torch.svd.html) or [Lowrank HOSVD decomposition](https://pytorch.org/docs/stable/generated/torch.svd_lowrank.html)

In [None]:
# decompose both tensors

### 3. [Tensor train decomposition ](https://github.com/oseledets/TT-Toolbox)

In [None]:
# decompose both tensors

### Bonus: Use [Polara](https://github.com/evfro/polara)

In [None]:
# use Polara

# Part III. Recommend (no clue how to do it yet)

In [None]:
# time_metric
# quality_metric

In [None]:
# Tensor 1: Tucker result metrics

# Tensor 1: HOSVD result metrics

# Tensor 1: Tensor train result metrics

In [None]:
# Tensor 2: Tucker result metrics

# Tensor 2: HOSVD result metrics

# Tensor 2: Tensor train result metrics

# Part IV. Visuals

In [None]:
# performance plot: Tucker vs HOSVD vs Tensor Train

In [None]:
# quality plot: "Tensor 1: HOSVD" vs "Tensor 1: Tucker" vs "Tensor 1: TT" vs "Tensor 2: HOSVD" vs "Tensor 2: Tucker" vs "Tensor 2: TT"