### A notebook used to filter the data from the original Grouplens Dataset for simplicity

In [2]:
import pandas as pd

In [3]:
min_votes = 1000

In [22]:
df_movies = pd.read_csv('ml-latest/movies.csv')

In [23]:
df_ratings = pd.read_csv('ml-latest/ratings.csv')
df_ratings.set_index('movieId', inplace=True)

In [24]:
df_ratings

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,4.0,1225734739
110,1,4.0,1225865086
158,1,4.0,1225733503
260,1,4.5,1225735204
356,1,5.0,1225735119
...,...,...,...
8340,330975,2.0,1091583256
8493,330975,2.5,1091585709
8622,330975,4.0,1091581777
8665,330975,3.0,1091581765


In [6]:
df_ratings["no_people_rated"] = [1 for _ in range(len(df_ratings))]

In [7]:
grouped_ratings = df_ratings.groupby('movieId')

grouped_ratings = grouped_ratings.sum()


In [8]:
grouped_ratings

Unnamed: 0_level_0,userId,rating,timestamp,no_people_rated
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,12696689201,299072.0,90676232180619,76813
2,4969411174,99030.5,34965684106628,30209
3,2608079724,50169.5,15487105413879,15820
4,502841790,8685.5,2874569542209,3028
5,2603834260,48619.0,15955867010862,15801
...,...,...,...,...
288967,47791,3.5,1689748357,1
288971,98408,0.5,1689798322,1
288975,154483,4.0,1689812351,1
288977,291389,3.0,1689815902,1


In [9]:
grouped_ratings["avg_rating"] = grouped_ratings["rating"]/grouped_ratings["no_people_rated"]

In [10]:
grouped_ratings.sort_values('avg_rating', ascending=False, inplace=True)
grouped_ratings = grouped_ratings[grouped_ratings["no_people_rated"]>min_votes]
grouped_ratings

Unnamed: 0_level_0,userId,rating,timestamp,no_people_rated,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
171011,343910470,9086.0,3186768141626,2041,4.451739
159817,503560410,13411.0,4681464629694,3015,4.448093
170705,470416403,12542.0,4464049315381,2835,4.423986
318,20213155335,540156.0,156345852293172,122296,4.416792
202439,2071618320,53687.0,20015714719947,12399,4.329946
...,...,...,...,...,...
2555,285063817,2896.5,1898360097339,1679,1.725134
57532,194063825,1903.0,1636766416042,1172,1.623720
43919,183721607,1828.0,1496965366895,1130,1.617699
3593,879774150,8615.0,6537777504093,5414,1.591245


In [11]:
df_movies = pd.read_csv('ml-latest/movies.csv')
pop_genres = ["Action", "Adventure", "Animation", "Comedy", "Crime", "Drama", 
              "Fantasy", "Horror", "Mystery", "Romance", "Sci-Fi", "Thriller"]
hm_suitability = {}
genre_suitable_lst = []
for i in range(len(df_movies)):
    genre_temp_storage = ""
    movie = df_movies.iloc[i]
    all_genres = movie["genres"]
    is_suitable = 0
    for genre in pop_genres: 
        if genre in all_genres:
            is_suitable += 1
            genre_temp_storage = genre
    if is_suitable == 1:
        hm_suitability[movie["movieId"]] = genre_temp_storage
    else:
        hm_suitability[movie["movieId"]] = "Not suitable"
df_suitability = pd.DataFrame()
df_suitability["movieId"] = hm_suitability.keys()
df_suitability["suitable_genre"] = hm_suitability.values()
df_suitability.set_index("movieId", inplace=True)
df_suitability = df_suitability[df_suitability["suitable_genre"] != "Not suitable"]
df_suitability

Unnamed: 0_level_0,suitable_genre
movieId,Unnamed: 1_level_1
5,Comedy
8,Adventure
9,Action
14,Drama
18,Comedy
...,...
288953,Comedy
288955,Drama
288957,Horror
288959,Drama


join using the movie id

In [12]:
grouped_ratings = grouped_ratings.join(df_suitability, how="inner")
grouped_ratings.to_csv("suitable_movies.csv")

How many movies in each category?

In [13]:
import numpy as np
for genre in pop_genres:
    print(genre, "occurences: ", len(grouped_ratings[grouped_ratings["suitable_genre"]==genre]))
# 1078

Action occurences:  22
Adventure occurences:  16
Animation occurences:  16
Comedy occurences:  418
Crime occurences:  4
Drama occurences:  474
Fantasy occurences:  5
Horror occurences:  67
Mystery occurences:  2
Romance occurences:  6
Sci-Fi occurences:  8
Thriller occurences:  40


In [14]:
df_movies.set_index("movieId", inplace=True)
grouped_ratings = grouped_ratings.join(df_movies, how="inner")
grouped_ratings

Unnamed: 0_level_0,userId,rating,timestamp,no_people_rated,avg_rating,suitable_genre,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1203,3738563090,96992.5,29894679700250,22730,4.267158,Drama,12 Angry Men (1957),Drama
527,13929219732,357340.5,103671567916204,84232,4.242337,Drama,Schindler's List (1993),Drama|War
1193,8118516317,207758.5,61568690477830,49316,4.212801,Drama,One Flew Over the Cuckoo's Nest (1975),Drama
750,5650978895,144160.5,42473159316615,34324,4.199991,Comedy,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War
26082,213092748,5374.0,1897779020524,1282,4.191888,Drama,Harakiri (Seppuku) (1962),Drama
...,...,...,...,...,...,...,...,...
1981,243854131,2772.0,1714465547406,1495,1.854181,Horror,Friday the 13th Part VIII: Jason Takes Manhatt...,Horror
1760,552474506,6083.5,3830780656494,3317,1.834037,Comedy,Spice World (1997),Comedy
6482,381293121,4065.5,2992591178695,2292,1.773778,Comedy,Dumb and Dumberer: When Harry Met Lloyd (2003),Comedy
2555,285063817,2896.5,1898360097339,1679,1.725134,Comedy,Baby Geniuses (1999),Comedy


Now we also see the names of the movies - this will be our new base of movies we will do our CF on - sort of our sampling window

For CF though all we really care about are the movies that we use - not the rest of the data - we need the data to be represented as individual people rating a movie i.e. a sparse matrix - so we have to do the whole thing again and this will for now only serve us as a filter...

Load the ratings again

In [15]:
df_sparse_ratings_data = pd.read_csv("ml-latest/ratings.csv")

Drop the ratings that do not correspond to the movies we selected as fit for our purpose

In [16]:
df_sparse_ratings_data = df_sparse_ratings_data[df_sparse_ratings_data["movieId"].isin(grouped_ratings.index.values)]

Initialize a dataframe

In [17]:
movies_sparse_ratings = grouped_ratings.index.tolist()
users_sparse_ratings = df_sparse_ratings_data["userId"].unique().tolist()
#print(movies_sparse_ratings)
tmp = [0 for _ in range(len(users_sparse_ratings))]

[1203, 527, 1193, 750, 26082, 1178, 6818, 926, 160718, 112552, 109487, 1217, 1207, 1172, 913, 6669, 3134, 1280, 3089, 1949, 2351, 96829, 7327, 326, 668, 5147, 2357, 1237, 2360, 142488, 1939, 1276, 3196, 3022, 148626, 3819, 1225, 6643, 3468, 1132, 1256, 5995, 3469, 3090, 306, 164179, 167832, 26131, 1299, 208703, 1222, 3224, 1041, 3415, 1228, 3091, 1258, 26150, 56782, 4427, 188773, 214, 2937, 2132, 1304, 7256, 219994, 4432, 6985, 1283, 213, 8228, 2726, 3811, 1288, 168250, 104944, 31410, 66371, 166643, 1272, 8014, 7234, 3949, 3634, 2925, 1266, 205156, 1080, 3083, 2788, 954, 457, 89759, 3152, 3095, 1719, 1358, 6777, 30749, 4422, 1263, 307, 8128, 1293, 2804, 86345, 176371, 1952, 8542, 86347, 250010, 26326, 140174, 1238, 27815, 4334, 95654, 3467, 2932, 1104, 3814, 7091, 599, 2951, 1246, 4967, 86377, 6791, 8154, 2575, 158966, 6235, 1242, 165551, 927, 971, 3365, 6104, 156387, 7706, 89774, 1231, 2612, 1449, 127098, 27803, 1961, 2918, 106100, 81845, 89492, 1090, 475, 170697, 4327, 163645, 2313, 

In [18]:
hm_movie_idx = {movies_sparse_ratings[i]:i for i in range(len(movies_sparse_ratings))}
hm_user_idx = {users_sparse_ratings[i]:i for i in range(len(users_sparse_ratings))}
#print(hm_movie_idx)

{1203: 0, 527: 1, 1193: 2, 750: 3, 26082: 4, 1178: 5, 6818: 6, 926: 7, 160718: 8, 112552: 9, 109487: 10, 1217: 11, 1207: 12, 1172: 13, 913: 14, 6669: 15, 3134: 16, 1280: 17, 3089: 18, 1949: 19, 2351: 20, 96829: 21, 7327: 22, 326: 23, 668: 24, 5147: 25, 2357: 26, 1237: 27, 2360: 28, 142488: 29, 1939: 30, 1276: 31, 3196: 32, 3022: 33, 148626: 34, 3819: 35, 1225: 36, 6643: 37, 3468: 38, 1132: 39, 1256: 40, 5995: 41, 3469: 42, 3090: 43, 306: 44, 164179: 45, 167832: 46, 26131: 47, 1299: 48, 208703: 49, 1222: 50, 3224: 51, 1041: 52, 3415: 53, 1228: 54, 3091: 55, 1258: 56, 26150: 57, 56782: 58, 4427: 59, 188773: 60, 214: 61, 2937: 62, 2132: 63, 1304: 64, 7256: 65, 219994: 66, 4432: 67, 6985: 68, 1283: 69, 213: 70, 8228: 71, 2726: 72, 3811: 73, 1288: 74, 168250: 75, 104944: 76, 31410: 77, 66371: 78, 166643: 79, 1272: 80, 8014: 81, 7234: 82, 3949: 83, 3634: 84, 2925: 85, 1266: 86, 205156: 87, 1080: 88, 3083: 89, 2788: 90, 954: 91, 457: 92, 89759: 93, 3152: 94, 3095: 95, 1719: 96, 1358: 97, 6777

We will create a list of arrays, one for each movie. Then, we will fill in ratings for only those users who have rated each movie.

In [20]:
all_movie_ratings_lst = []
ij = 0
for movie in movies_sparse_ratings:
    all_ratings_for_a_movie = np.zeros([len(users_sparse_ratings)])
    users_who_rated_movie = df_sparse_ratings_data[df_sparse_ratings_data["movieId"] == movie]

    for i in range(len(users_who_rated_movie)):
        user = users_who_rated_movie.iloc[i]
        all_ratings_for_a_movie[hm_user_idx[user["userId"]]] = user["rating"]
    
    all_movie_ratings_lst.append(all_ratings_for_a_movie)
    ij += 1
    # for tracking purposes - better uncomment it - it takes a long time
    #print(ij/len(movies_sparse_ratings), " percent done")
print(all_movie_ratings_lst)

0.0009276437847866419  percent done
0.0018552875695732839  percent done
0.0027829313543599257  percent done
0.0037105751391465678  percent done
0.00463821892393321  percent done
0.0055658627087198514  percent done
0.006493506493506494  percent done
0.0074211502782931356  percent done
0.008348794063079777  percent done
0.00927643784786642  percent done
0.01020408163265306  percent done
0.011131725417439703  percent done
0.012059369202226345  percent done
0.012987012987012988  percent done
0.013914656771799629  percent done
0.014842300556586271  percent done
0.015769944341372914  percent done
0.016697588126159554  percent done
0.017625231910946195  percent done
0.01855287569573284  percent done
0.01948051948051948  percent done
0.02040816326530612  percent done
0.021335807050092765  percent done
0.022263450834879406  percent done
0.023191094619666047  percent done
0.02411873840445269  percent done
0.02504638218923933  percent done
0.025974025974025976  percent done
0.026901669758812616  

KeyboardInterrupt: 

In [None]:
df_sparse_ratings_from_lst = pd.DataFrame({movies_sparse_ratings[i]: all_movie_ratings_lst[i] for i in range(len(all_movie_ratings_lst))})
df_sparse_ratings_from_lst.index = users_sparse_ratings
df_sparse_ratings_from_lst.index.name = "userId"
df_sparse_ratings_from_lst = df_sparse_ratings_from_lst.T  # Transpose to get users as columns and movies as rows
df_sparse_ratings_from_lst.index.name = "movieId"

So our convention will be - rows correspond with movies, and cols correspond with users. We assume that users will have rated only a few movies relative to the size of the dataset so it makes sense to use a sparse column scipy matrix - we will get to that later

In [None]:
df_sparse_ratings_from_lst

In [None]:
df_sparse_ratings_from_lst.to_pickle("sparse_ratings.pkl")