In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw ratings table from CSV into a DataFrame.
# NOTE(review): the absolute '/content/drive/MyDrive' path presumably relies on
# Google Drive being mounted in Colab — confirm before running elsewhere.
data = pd.read_csv('/content/drive/MyDrive/rating.csv')
# Show the first rows as this cell's output.
data.head()

## data preprocessing

In [None]:
# Drop the 'timestamp' column; no later cell shown in this notebook reads it.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
data = data.drop(columns=['timestamp'], errors='ignore')
data.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [None]:
# Count how many ratings each userId contributed (value_counts orders the
# result from most to least frequent) and display the result.
user_id_freq = data['userId'].value_counts()
user_id_freq

118205    9254
8405      7515
82418     5646
121535    5520
125794    5491
          ... 
89305       20
110463      20
96990       20
134747      20
6526        20
Name: userId, Length: 138493, dtype: int64

In [None]:
# Shift user ids so the smallest one becomes 0. Subtracting the current minimum
# (rather than a hard-coded 1) makes the cell idempotent: re-running it leaves
# already zero-based ids untouched instead of shifting them a second time.
data['userId'] = data['userId'] - data['userId'].min()

In [None]:
# Preview the frame again to inspect userId after the shift above.
data.head()

Unnamed: 0,userId,movieId,rating
0,0,2,3.5
1,0,29,3.5
2,0,32,3.5
3,0,47,3.5
4,0,50,3.5


In [None]:
# create mapping for movieIds: every distinct raw movieId receives a
# consecutive integer index, handed out in the iteration order of the set
unique_movie_ids = set(data.movieId.values)
movie2idx = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}
# same final value the hand-maintained counter would have reached
count = len(unique_movie_ids)

In [None]:
# add new movie ids to the dataframe
# Series.map performs the dictionary lookup column-wise, avoiding a row-wise
# DataFrame.apply(axis=1) that materialises every rating row as a Series.
data['movie_idx'] = data['movieId'].map(movie2idx)
data.head()

Unnamed: 0,userId,movieId,rating,movie_idx
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50


In [None]:
# Display the smallest and largest assigned movie index as a (min, max) tuple.
data.movie_idx.min(), data.movie_idx.max()

(0, 26743)

In [None]:
# Persist the preprocessed ratings to the exact path the next section reads
# back. Writing to a bare 'preprocessed_data' in the working directory meant
# the later read_csv only worked if the file had been moved/renamed by hand.
data.to_csv('/content/drive/MyDrive/preprocessed_data.csv')

## shrinking data

In [None]:
import pickle
from collections import Counter

In [None]:
# Load the preprocessed ratings; index_col=0 uses the first CSV column as the
# row index instead of treating it as a data column.
# NOTE(review): absolute Drive path — presumably needs Google Drive mounted in
# Colab; confirm.
df = pd.read_csv('/content/drive/MyDrive/preprocessed_data.csv', index_col=0)
# Show the first rows as this cell's output.
df.head()

Unnamed: 0,userId,movieId,rating,movie_idx
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50


In [None]:
# Report the size of the frame this section works on. `df` is the frame loaded
# in this section; `data` only exists if the preprocessing section was executed
# earlier in the same kernel session.
print('Original data size:', len(df))

Original data size: 20000263


In [None]:
# Largest id plus one for users (N) and for movie indices (M).
N = df['userId'].max() + 1
M = df['movie_idx'].max() + 1

In [None]:
# Tally how many rating rows mention each user id and each movie index.
user_ids_count = Counter(df['userId'])
movie_ids_count = Counter(df['movie_idx'])

In [None]:
# Size of the reduced dataset: keep the n most frequent users and the
# m most frequent movies.
n, m = 10000, 2000

In [None]:
# most_common(k) yields (id, count) pairs ordered by descending count; keep
# only the ids. Loop variables are named so they do not shadow the constant m.
user_ids = [user for user, _n_ratings in user_ids_count.most_common(n)]
movie_ids = [movie for movie, _n_ratings in movie_ids_count.most_common(m)]

In [None]:
# Keep only ratings whose user AND movie both belong to the selected top sets;
# .copy() detaches the result from df so later column writes do not alias it.
df_small = df.loc[
    df['userId'].isin(user_ids) & df['movie_idx'].isin(movie_ids)
].copy()
len(df_small)

5392025

In [None]:
# Map each kept user id to its position in user_ids (0, 1, 2, ...).
new_user_id_map = {old: new for new, old in enumerate(user_ids)}
# same final value the hand-maintained counter would have reached
i = len(user_ids)

In [None]:
# Map each kept movie index to its position in movie_ids (0, 1, 2, ...).
new_movie_id_map = {old: new for new, old in enumerate(movie_ids)}
# same final value the hand-maintained counter would have reached
j = len(movie_ids)

In [None]:
# setting new ids
# Series.map applies each lookup table column-wise, avoiding a row-wise
# DataFrame.apply(axis=1) over every rating.
# NOTE: the id columns are overwritten, so run this cell once per freshly
# built df_small — a second run would look up already remapped ids in tables
# that are keyed by the old ids.
df_small['userId'] = df_small['userId'].map(new_user_id_map)
df_small['movie_idx'] = df_small['movie_idx'].map(new_movie_id_map)

In [None]:
# Sanity check on the remapped ids: print the largest user id and movie index.
print('Max user id:', df_small['userId'].max())
print('Max movie id:', df_small['movie_idx'].max())

Max user id: 9999
Max movie id: 1999


In [None]:
print('Small dataframe size:', len(df_small))
# Write to the exact path the next section loads. A bare 'small_df' in the
# working directory meant the later read_csv only worked if the file had been
# moved/renamed by hand.
df_small.to_csv('/content/drive/MyDrive/small_df.csv')

Small dataframe size: 5392025


## preprocessing, part 2: lookup dictionaries

In [None]:
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [None]:
# Load the reduced ratings table; index_col=0 uses the first CSV column as the
# row index instead of treating it as a data column.
# NOTE(review): absolute Drive path — presumably needs Google Drive mounted in
# Colab; confirm.
small_df = pd.read_csv('/content/drive/MyDrive/small_df.csv', index_col=0)
# Show the first rows as this cell's output.
small_df.head()

Unnamed: 0,userId,movieId,rating,movie_idx
960,7307,1,4.5,10
961,7307,10,2.5,68
962,7307,19,3.5,143
963,7307,32,5.0,19
964,7307,39,4.5,85


In [None]:
# Largest id plus one for users (N) and for movie indices (M) in the reduced
# table.
N = small_df['userId'].max() + 1
M = small_df['movie_idx'].max() + 1

In [None]:
# Shuffle with a fixed seed so the 80/20 train/test split is the same after a
# kernel restart, then cut at the 80% mark.
# NOTE: small_df is overwritten with its shuffled version, so the split is
# reproducible for a fresh Restart & Run All, not for re-running this cell alone.
small_df = shuffle(small_df, random_state=42)
cutoff = int(0.8*len(small_df))
df_train = small_df[:cutoff]
df_test = small_df[cutoff:]

In [None]:
# user id -> list of movie indices that user has rated
user2movie = dict()

# movie index -> list of user ids that have rated that movie
movie2user = dict()

# (user id, movie index) -> rating
usermovie2rating = dict()

In [None]:
count = 0

# The dictionaries are created in an earlier cell; empty them here so that
# re-running this cell rebuilds the lookups instead of appending every
# training rating a second time.
user2movie.clear()
movie2user.clear()
usermovie2rating.clear()

def update_user2movie_and_movie2user(row):
  """Record one training rating in the three lookup dictionaries.

  Args:
    row: object exposing `userId`, `movie_idx` and `rating` attributes
      (e.g. a DataFrame row or an `itertuples` named tuple).

  Side effects:
    Appends the movie index to user2movie[user], the user id to
    movie2user[movie], stores the rating under usermovie2rating[(user, movie)],
    increments the global `count` and prints the processed fraction
    (relative to `cutoff`) every 100000 rows.
  """
  global count
  count += 1
  if count % 100000 == 0:
    print('processed: %.3f' %(float(count)/cutoff))
  i = int(row.userId)
  j = int(row.movie_idx)
  # setdefault creates the list the first time a key is seen, then appends.
  user2movie.setdefault(i, []).append(j)
  movie2user.setdefault(j, []).append(i)
  usermovie2rating[(i,j)] = row.rating

# Iterate with itertuples rather than DataFrame.apply(axis=1): no per-row Series
# is built, and the cell no longer returns (and displays) a Series of None with
# one entry per training row.
for row in df_train.itertuples(index=False):
  update_user2movie_and_movie2user(row)

processed: 0.023
processed: 0.046
processed: 0.070
processed: 0.093
processed: 0.116
processed: 0.139
processed: 0.162
processed: 0.185
processed: 0.209
processed: 0.232
processed: 0.255
processed: 0.278
processed: 0.301
processed: 0.325
processed: 0.348
processed: 0.371
processed: 0.394
processed: 0.417
processed: 0.440
processed: 0.464
processed: 0.487
processed: 0.510
processed: 0.533
processed: 0.556
processed: 0.580
processed: 0.603
processed: 0.626
processed: 0.649
processed: 0.672
processed: 0.695
processed: 0.719
processed: 0.742
processed: 0.765
processed: 0.788
processed: 0.811
processed: 0.835
processed: 0.858
processed: 0.881
processed: 0.904
processed: 0.927
processed: 0.950
processed: 0.974
processed: 0.997


18908955    None
5218402     None
19375724    None
12302821    None
6649873     None
            ... 
11982537    None
13651680    None
3303963     None
11050607    None
5326134     None
Length: 4313620, dtype: object

In [None]:
# test ratings dictionary: (user id, movie index) -> rating
usermovie2rating_test = {}
count = 0
def update_usermovie2rating_test(row):
  """Record one held-out rating in usermovie2rating_test.

  Args:
    row: object exposing `userId`, `movie_idx` and `rating` attributes
      (e.g. a DataFrame row or an `itertuples` named tuple).

  Side effects:
    Stores the rating under usermovie2rating_test[(user, movie)], increments
    the global `count` and prints the processed fraction of df_test every
    100000 rows.
  """
  global count
  count += 1
  if count % 100000 == 0:
    print('processed: %.3f' %(float(count)/len(df_test)))
  i = int(row.userId)
  j = int(row.movie_idx)
  usermovie2rating_test[(i,j)] = row.rating

# Iterate with itertuples rather than DataFrame.apply(axis=1): no per-row Series
# is built, and the cell no longer returns (and displays) a Series of None with
# one entry per test row.
for row in df_test.itertuples(index=False):
  update_usermovie2rating_test(row)

processed: 0.093
processed: 0.185
processed: 0.278
processed: 0.371
processed: 0.464
processed: 0.556
processed: 0.649
processed: 0.742
processed: 0.835
processed: 0.927


9766503     None
10279648    None
11057350    None
18141062    None
5187689     None
            ... 
693587      None
18202737    None
17640052    None
1550060     None
1707027     None
Length: 1078405, dtype: object

In [None]:
# Serialise the four lookup dictionaries, one file each.
# NOTE: the files carry a '.json' extension but are written with pickle in
# binary mode, so they must be read back with pickle.load, not a JSON parser.
artifacts = [
  ('user2movie.json', user2movie),
  ('movie2user.json', movie2user),
  ('usermovie2rating.json', usermovie2rating),
  ('usermovie2rating_test.json', usermovie2rating_test),
]
for filename, payload in artifacts:
  with open(filename, 'wb') as f:
    pickle.dump(payload, f)