In [1]:
import numpy as np
import pandas as pd

In [2]:
!mkdir -p data
!kaggle datasets download -d grouplens/movielens-20m-dataset -p data/ --unzip

Downloading movielens-20m-dataset.zip to data
 99%|███████████████████████████████████████▋| 194M/195M [00:03<00:00, 71.8MB/s]
100%|████████████████████████████████████████| 195M/195M [00:03<00:00, 65.0MB/s]


In [31]:
ratings = pd.read_csv('./data/rating.csv')
ratings.shape

(20000263, 4)

In [32]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [33]:
ratings.userId.nunique()

138493

In [34]:
min(ratings.userId), max(ratings.userId)

(1, 138493)

In [35]:
ratings.userId = ratings.userId - 1

In [36]:
min(ratings.userId), max(ratings.userId), ratings.userId.nunique()

(0, 138492, 138493)

In [37]:
print(f'The unique number of movies is {ratings.movieId.nunique()}\nThe minimum of movie is is {min(ratings.movieId)} and the maximum of movie id is {max(ratings.movieId)}')

The unique number of movies is 26744
The minimum of movie is is 1 and the maximum of movie id is 131262


In [38]:
movie2Idx = {movieId:idx for idx, movieId in enumerate(ratings.movieId.unique())}
movie2Idx

{2: 0,
 29: 1,
 32: 2,
 47: 3,
 50: 4,
 112: 5,
 151: 6,
 223: 7,
 253: 8,
 260: 9,
 293: 10,
 296: 11,
 318: 12,
 337: 13,
 367: 14,
 541: 15,
 589: 16,
 593: 17,
 653: 18,
 919: 19,
 924: 20,
 1009: 21,
 1036: 22,
 1079: 23,
 1080: 24,
 1089: 25,
 1090: 26,
 1097: 27,
 1136: 28,
 1193: 29,
 1196: 30,
 1198: 31,
 1200: 32,
 1201: 33,
 1208: 34,
 1214: 35,
 1215: 36,
 1217: 37,
 1219: 38,
 1222: 39,
 1240: 40,
 1243: 41,
 1246: 42,
 1249: 43,
 1258: 44,
 1259: 45,
 1261: 46,
 1262: 47,
 1266: 48,
 1278: 49,
 1291: 50,
 1304: 51,
 1321: 52,
 1333: 53,
 1348: 54,
 1350: 55,
 1358: 56,
 1370: 57,
 1374: 58,
 1387: 59,
 1525: 60,
 1584: 61,
 1750: 62,
 1848: 63,
 1920: 64,
 1967: 65,
 1994: 66,
 1997: 67,
 2021: 68,
 2100: 69,
 2118: 70,
 2138: 71,
 2140: 72,
 2143: 73,
 2173: 74,
 2174: 75,
 2193: 76,
 2194: 77,
 2253: 78,
 2288: 79,
 2291: 80,
 2542: 81,
 2628: 82,
 2644: 83,
 2648: 84,
 2664: 85,
 2683: 86,
 2692: 87,
 2716: 88,
 2761: 89,
 2762: 90,
 2804: 91,
 2872: 92,
 2918: 93,
 29

In [39]:
ratings['movie_idx'] = ratings.apply(lambda row: movie2Idx[row.movieId], axis=1)

In [40]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,movie_idx
0,0,2,3.5,2005-04-02 23:53:47,0
1,0,29,3.5,2005-04-02 23:31:16,1
2,0,32,3.5,2005-04-02 23:33:39,2
3,0,47,3.5,2005-04-02 23:32:07,3
4,0,50,3.5,2005-04-02 23:29:40,4


In [41]:
print(f'The unique number of movies is {ratings.movie_idx.nunique()}\nThe minimum of movie is is {min(ratings.movie_idx)} and the maximum of movie id is {max(ratings.movie_idx)}')

The unique number of movies is 26744
The minimum of movie is is 0 and the maximum of movie id is 26743


In [42]:
ratings = ratings.drop(columns=['timestamp'])

In [43]:
ratings.head()

Unnamed: 0,userId,movieId,rating,movie_idx
0,0,2,3.5,0
1,0,29,3.5,1
2,0,32,3.5,2
3,0,47,3.5,3
4,0,50,3.5,4


In [44]:
ratings.to_csv('./data/edited_ratings.csv')

### Shrinking the data

* Select subset of users and movies
* Users who rated the most movies
* Movies that have been rated by the most users

In [45]:
N = ratings.userId.max() + 1 # Number of Users
M = ratings.movie_idx.max() + 1 # Number of movies

In [46]:
N, M

(138493, 26744)

In [47]:
from collections import Counter
user_ids_count = Counter(ratings.userId)
movie_ids_count = Counter(ratings.movie_idx)

In [48]:
n = 10000
m = 2000

In [56]:
user_ids = [u for u, c in user_ids_count.most_common(n)]
movie_ids = [m for m, c in movie_ids_count.most_common(m)]

In [57]:
ratings_small = ratings[ratings.userId.isin(user_ids) & ratings.movie_idx.isin(movie_ids)].copy()

In [58]:
ratings_small.shape

(5392025, 4)

In [59]:
new_user_id_map = {user_id:idx for idx, user_id in enumerate(ratings_small.userId.unique())}
new_movie_id_map = {movie_id:idx for idx, movie_id in enumerate(ratings_small.movie_idx.unique())}

In [60]:
ratings_small.loc[:, 'userId'] = ratings_small.apply(lambda row: new_user_id_map[row.userId], axis=1)
ratings_small.loc[:, 'movie_idx'] = ratings_small.apply(lambda row: new_movie_id_map[row.movie_idx], axis=1)

In [61]:
print(f'min user_id is {ratings_small.userId.min()} max user_id {ratings_small.userId.max()} unique userid are {ratings_small.userId.nunique()}')
print(f'min movie_id is {ratings_small.movie_idx.min()} max movie_id {ratings_small.movie_idx.max()} unique movie_ids are {ratings_small.movie_idx.nunique()}')

min user_id is 0 max user_id 9999 unique userid are 10000
min movie_id is 0 max movie_id 1999 unique movie_ids are 2000


In [62]:
ratings_small.shape

(5392025, 4)

### Table to Dictionary

* In code, I want to ask questions like
    * Given user i, which movies did they rate?
    * Given movie j, which users i have rated it ?
    * Given user i and movie j, what is the rating?
    * Given user i and movie j, what is the rating?
* Theoretically, pandas dataframe is like an SQL table, so we should be able to write queries to grab this info?
* I know SQL has indexes to make these lookups somewhat fast, but can pandas ?
* Python Dictionaries are already a key -> value lookup
* user2movie : user ID -> movie ID
* movie2user : movie ID -> user ID
* usermovie2rating : (user ID, movie ID) -> rating

why Dictionaries?
* Looping through the array would be $\Omicron(NM)$
* Looping through the dictionary is $\Omicron(|\Omega|)$
$\Omega$ is the length of the set of ratings

In [63]:
N = ratings_small.userId.max()
M = ratings_small.movie_idx.max()

In [64]:
N, M

(9999, 1999)

##### Train Test Split

In [66]:
from sklearn.utils import shuffle

In [67]:
ratings_small = shuffle(ratings_small)

In [68]:
ratings_small.head()

Unnamed: 0,userId,movieId,rating,movie_idx
14777855,7407,3052,4.5,559
2148134,1108,3178,3.5,956
2105276,1088,2728,5.0,912
19355755,9685,2973,4.0,1134
12838400,6422,1097,4.0,75


Train set size = 80%

Test set size = 20%

In [77]:
test_split = 0.2
test_set_size = int(test_split * len(ratings_small))
print(f'The test set size would be {test_set_size}')

The test set size would be 1078405


In [103]:
train_data = ratings_small.iloc[:-test_set_size]
train_data.shape

(4313620, 4)

In [104]:
test_data = ratings_small.iloc[-test_set_size:]
test_data.shape

(1078405, 4)

##### Dictionary Creation

In [105]:
user2movie = {}
movie2user = {}
usermovie2rating = {}

In [106]:
count = 0
def updateDictionaries(row):
    global count
    count += 1
    if count % 100000 == 0:
        print(f'Processed: {(float(count)/train_data.shape[0]):.3f}')

    i = int(row.userId)
    j = int(row.movie_idx)

    if i not in user2movie:
        user2movie[i] = [j]
    else:
        user2movie[i].append(j)

    if j not in movie2user:
        movie2user[j] = [i]
    else:
        movie2user[j].append(i)

    usermovie2rating[(i, j)] = row.rating

In [107]:
train_data.apply(updateDictionaries, axis=1)

Processed: 0.023
Processed: 0.046
Processed: 0.070
Processed: 0.093
Processed: 0.116
Processed: 0.139
Processed: 0.162
Processed: 0.185
Processed: 0.209
Processed: 0.232
Processed: 0.255
Processed: 0.278
Processed: 0.301
Processed: 0.325
Processed: 0.348
Processed: 0.371
Processed: 0.394
Processed: 0.417
Processed: 0.440
Processed: 0.464
Processed: 0.487
Processed: 0.510
Processed: 0.533
Processed: 0.556
Processed: 0.580
Processed: 0.603
Processed: 0.626
Processed: 0.649
Processed: 0.672
Processed: 0.695
Processed: 0.719
Processed: 0.742
Processed: 0.765
Processed: 0.788
Processed: 0.811
Processed: 0.835
Processed: 0.858
Processed: 0.881
Processed: 0.904
Processed: 0.927
Processed: 0.950
Processed: 0.974
Processed: 0.997


14777855    None
2148134     None
2105276     None
19355755    None
12838400    None
            ... 
7079332     None
7878054     None
18924272    None
5240381     None
8747797     None
Length: 4313620, dtype: object

In [108]:
usermovie2rating_test = {}

In [109]:
count = 0
def update_usermovie2rating_test(row):
    global count
    count += 1
    if count % 100000 == 0:
        print(f'Processed: {(float(count)/test_data.shape[0]):.3f}')

    i = int(row.userId)
    j = int(row.movie_idx)

    usermovie2rating_test[(i, j)] = row.rating

In [110]:
test_data.apply(update_usermovie2rating_test, axis=1)

Processed: 0.093
Processed: 0.185
Processed: 0.278
Processed: 0.371
Processed: 0.464
Processed: 0.556
Processed: 0.649
Processed: 0.742
Processed: 0.835
Processed: 0.927


31527       None
6823081     None
144473      None
6672992     None
16219646    None
            ... 
6150908     None
17454016    None
9147648     None
6433127     None
5127603     None
Length: 1078405, dtype: object

### Saving Dictionaries as Binary files using Pickle

In [112]:
import pickle

In [113]:
with open('user2movie_dict', 'wb') as f:
    pickle.dump(user2movie, f)

with open('movie2user_dict', 'wb') as f:
    pickle.dump(movie2user, f)

with open('usermovie2rating_dict', 'wb') as f:
    pickle.dump(usermovie2rating, f)

with open('usermovie2rating_test_dict', 'wb') as f:
    pickle.dump(usermovie2rating_test, f)