In [1]:
#import necessary libraries
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

In [2]:
#read the csv file to a pandas dataframe
df = pd.read_csv("ratings.csv")
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [3]:
# make the user ids go from 0...N-1 instead of 1...N
df.userId = df.userId - 1

In [4]:
# create a mapping for movie ids
# this is done because a lot of movie ids are missing the ratings file
unique_movie_ids = set(df.movieId.values)
movie2idx = {}
count = 0
for movie_id in unique_movie_ids:
  movie2idx[movie_id] = count
  count += 1

In [5]:
# add them to the data frame
df['movie_idx'] = df.apply(lambda row: movie2idx[row.movieId], axis=1)

#drop the timestamp column
df_edited = df.drop(columns=['timestamp'])

In [6]:
df_edited

Unnamed: 0,userId,movieId,rating,movie_idx
0,0,1,4.0,0
1,0,3,4.0,2
2,0,6,4.0,5
3,0,47,5.0,46
4,0,50,5.0,49
...,...,...,...,...
100831,609,166534,4.0,2291
100832,609,168248,5.0,3714
100833,609,168250,5.0,3716
100834,609,168252,5.0,3718


In [7]:
# a dictionary to tell us which users have rated which movies
user2movie = {}
# a dicationary to tell us which movies have been rated by which users
movie2user = {}
# a dictionary to look up ratings
usermovie2rating = {}

In [9]:
#function to create the dictionaries for dataset
def update_user2movie_and_movie2user(row):
  i = int(row.userId)
  j = int(row.movie_idx)
  if i not in user2movie:
    user2movie[i] = [j]
  else:
    user2movie[i].append(j)

  if j not in movie2user:
    movie2user[j] = [i]
  else:
    movie2user[j].append(i)

  usermovie2rating[(i,j)] = row.rating

df.apply(update_user2movie_and_movie2user, axis=1)

0         None
1         None
2         None
3         None
4         None
          ... 
100831    None
100832    None
100833    None
100834    None
100835    None
Length: 100836, dtype: object

In [10]:
#saving the all the required dictionaries as files in the directory
try: 
    dict1 = open('user2movie', 'wb') 
    pickle.dump(user2movie, dict1) 
    dict1.close()
    
    dict2 = open('movie2user', 'wb') 
    pickle.dump(movie2user, dict2) 
    dict2.close()
    
    dict3 = open('usermovie2rating', 'wb') 
    pickle.dump(usermovie2rating, dict3) 
    dict3.close()
    
except: 
    print("Something went wrong")