In [1]:
from __future__ import division
import os 
import numpy as np 
import pandas as pd
from collections import defaultdict
import itertools
import ast

In [2]:
datadir = '../../data/the-movies-dataset/'

In [3]:
small_ratings_file = datadir + 'ratings_small.csv'
all_ratings_file = datadir + 'ratings.csv'
# df = pd.read_csv(all_ratings_file, delimiter=',')
df = pd.read_csv(small_ratings_file, delimiter=',')

In [4]:
df.dataframeName = 'ratings_small.csv'
print(df.columns)

Index([u'userId', u'movieId', u'rating', u'timestamp'], dtype='object')


In [5]:
df.drop(columns=['timestamp'])
print(df.head(5))

   userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182
3       1     1129     2.0  1260759185
4       1     1172     4.0  1260759205


In [6]:
print(max(df['movieId']))
print(min(df['movieId']))
print(type(df['movieId'][0]))

163949
1
<type 'numpy.int64'>


In [7]:
print (max(df['userId']))
print (min(df['userId']))
print(type(df['userId'][0]))

671
1
<type 'numpy.int64'>


In [8]:
userid_set = set()
movieid_set = set()

for i in df['userId']:
    if i not in userid_set:
        userid_set.add(i)

for i in df['movieId']:
    if i not in movieid_set:
        movieid_set.add(i)

In [9]:
print(len(userid_set))

671


In [10]:
print(len(movieid_set))

9066


In [11]:
# TODO
# Cast all floats to int since comparison and access
# can be problematic later on in dict keys

In [12]:
# creating a hash map of  movieId -> list of userIds 
# First create a threshold for the ratings to consider: rating_threshold
# Then prune the hashmap to only include the movies which have been rated 
# (clearing the ratings_threshold) by at least say 50 number of users (usernum_threshold)

# Could probably use faster pandas dataframe lambda methods to chunk out
# lots of rows which have a rating value less than the threshold


In [13]:
rating_threshold = 3.5
movie_user_dict = defaultdict(list)

# This step can be done efficiently
# with lambda apply on pandas df
for index, row in df.iterrows():    
#     movId = row['movieId']
    movId = int(row['movieId'])
#     usrId = row['userId']
    usrId = int(row['userId'])
    rating = row['rating']       
    
    if rating >= rating_threshold:
        movie_user_dict[movId].append(usrId)


In [14]:
usernum_threshold = 50
final_dict = {}

for key,val in movie_user_dict.items():
    if len(val) >= usernum_threshold:
        final_dict[key] = val


In [15]:
len(final_dict)

258

In [18]:
x = set([2,7])
y = set([1,2,4,4,3])

In [19]:
x = x.union(y)
print(x)

set([1, 2, 3, 4, 7])


In [20]:
# Calculate the total number of users. 
# Only consider the movies in final_dict
# i.e after all the final thresholding

total_users = set()
for mov in final_dict.keys():
    row = set(movie_user_dict[mov])
    total_users = total_users.union(row)

num_users = len(total_users)
print(num_users)
    

661


In [24]:
num_users = float(num_users)
print(num_users)

661.0


In [25]:
def findnum_common_elements(list1, list2):
    return len(set(list1).intersection(list2))

In [26]:
movie_idlist = list(final_dict.keys())
print(movie_idlist)

[1, 2, 2987, 1028, 6, 10, 11, 1036, 33794, 54286, 527, 16, 17, 32587, 21, 1625, 25, 539, 541, 32, 2081, 34, 36, 1573, 39, 3114, 555, 1580, 47, 1584, 1073, 50, 1079, 1080, 68157, 62, 1089, 1090, 1035, 2628, 1097, 586, 587, 588, 589, 590, 592, 593, 595, 597, 953, 1380, 5989, 608, 104, 56367, 7438, 110, 111, 1136, 1653, 49272, 2683, 1148, 2174, 2916, 4226, 5995, 2692, 648, 3948, 2700, 141, 3949, 2599, 1682, 150, 1732, 40815, 2716, 44191, 161, 165, 3751, 1704, 1193, 3671, 1196, 1197, 1198, 1199, 1200, 1203, 1206, 1207, 1208, 1721, 1210, 8360, 6333, 1213, 1214, 58559, 7361, 1219, 1220, 1221, 1222, 1225, 2762, 1230, 3793, 1234, 1240, 6874, 2268, 733, 1246, 223, 736, 1250, 1252, 5349, 231, 745, 1258, 551, 2797, 750, 1265, 2291, 2804, 1270, 1784, 68954, 253, 1278, 8961, 260, 8874, 1288, 778, 1291, 780, 4878, 1923, 2324, 4886, 1304, 2329, 1307, 79132, 4896, 292, 293, 296, 2012, 2858, 300, 2791, 2355, 3471, 3897, 1247, 316, 318, 5952, 4011, 5445, 6711, 5418, 1356, 1358, 3408, 337, 339, 344, 858,

In [27]:
count_matrix = {}

for pair in itertools.combinations(movie_idlist, r=2):
    mov1, mov2 = pair
    count_matrix[pair] = findnum_common_elements(final_dict[mov1], final_dict[mov2])

In [28]:
print(len(count_matrix))

33153


In [31]:
# Now just add the marginal counts to the count_matrix dict

for k in final_dict.keys():
    pair = (k,k)
    count_matrix[pair] = len(final_dict[k])

In [32]:
count_matrix[(10,11)]

12

In [33]:
count_matrix[(10.0,10.0)]

65

In [34]:
maxval = max([v for k,v in count_matrix.items()])
print(maxval)    

286


In [42]:
def marginal_prob(movie_id, count_matrix, num_users):
    '''function to get the marginal prob:
        P(movie_id1)       
    '''
    margn_count = count_matrix[(movie_id, movie_id)]
    return margn_count * 1.0/num_users

def joint_prob(movie_id1, movie_id2, count_matrix, num_users):
    '''function to get the joint prob:
        P(movie_id1, movie_id2)       
    '''
    if (movie_id1, movie_id2) not in count_matrix.keys():
        key = (movie_id2, movie_id1)
    else:
        key = (movie_id1, movie_id2)
    
    joint_count = count_matrix[key]    
    return joint_count * 1.0/num_users


def conditional_prob(movie_id1, movie_id2, count_matrix):
    '''function to get the conditional prob:
        P(movie_id1 | movie_id2)       
    '''
    if (movie_id1, movie_id2) not in count_matrix.keys():
        key = (movie_id2, movie_id1)
    else:
        key = (movie_id1, movie_id2)
    
    joint_count = count_matrix[key]
    margn_count = count_matrix[movie_id2, movie_id2]
    
    return joint_count * 1.0/margn_count

In [43]:
# count_matrix.keys()
len(count_matrix)

33411

In [44]:
N = int(num_users)  # sanity check, to see whether we get valid results with int valued N (instead of a float)
marginal_prob(10, count_matrix, N)

0.09833585476550681

In [38]:
joint_prob(10, 11, count_matrix, N)

0.018154311649016642

In [39]:
conditional_prob(10, 8961, count_matrix)

0.1782178217821782

In [40]:
# Sample example
# Conditioning CAN be less than the marginal! 
# ref: 2018-box-paper table-1

for k in final_dict.keys():
    p1 = conditional_prob(10, k, count_matrix)
    p2 = marginal_prob(10, count_matrix, N)
    if p1 <= p2:
        print(k, p1)

(25, 0.08695652173913043)
(36, 0.08860759493670886)
(608, 0.08205128205128205)
(56367, 0.0847457627118644)
(111, 0.07692307692307693)
(5995, 0.09259259259259259)
(2599, 0.07936507936507936)
(1193, 0.08064516129032258)
(1203, 0.046875)
(58559, 0.0673076923076923)
(1225, 0.06976744186046512)
(1230, 0.047619047619047616)
(1252, 0.04411764705882353)
(745, 0.07547169811320754)
(750, 0.05555555555555555)
(68954, 0.07547169811320754)
(1304, 0.09230769230769231)
(79132, 0.08602150537634409)
(1247, 0.04)
(318, 0.0944055944055944)
(1358, 0.058823529411764705)
(2395, 0.05555555555555555)
(1394, 0.07407407407407407)
(899, 0.057692307692307696)
(903, 0.06451612903225806)
(904, 0.09411764705882353)
(908, 0.06493506493506493)
(912, 0.09278350515463918)
(913, 0.03571428571428571)
(920, 0.08928571428571429)
(923, 0.08333333333333333)
(924, 0.08695652173913043)
(46578, 0.09375)
(59315, 0.09230769230769231)
(50872, 0.058823529411764705)
(60069, 0.08928571428571429)


In [47]:
## Sample code to write a list to a text file
# tst_list = [1,2,3,4,5,6]
# with open('test.txt', 'w') as f:
#     for item in tst_list:
#         f.write("%s\n" % item)

In [49]:
# Create the marginals file and a separate vocab file (for all the possible movieIds)
# Consider only the diagonal entries in count_matrix
# Also, the 2 lists work as follows: 
# marginals[i] == marginal_prob(movieid_vocab[i], count_matrix, num_users)
# i.e the indices of both lists match on movieid and its marginal probability

marginals = []
movieid_vocab = []
for key_pair in count_matrix.keys():
    k1, k2 = key_pair
    if k1 == k2:        
        movieid_vocab.append(k1)
        marginals.append(marginal_prob(k1, count_matrix, N))


In [51]:
print(movieid_vocab)

[5989, 5349, 5418, 60069, 1617, 7153, 474, 778, 1358, 919, 16, 920, 316, 590, 50, 1704, 2959, 47, 1090, 1580, 2028, 2599, 923, 1252, 593, 1258, 1234, 912, 4027, 318, 3996, 4886, 1136, 2012, 367, 2762, 1356, 2683, 2396, 39, 2692, 551, 750, 440, 1732, 3751, 1250, 231, 17, 253, 6539, 357, 1374, 1220, 4878, 49272, 1682, 2324, 541, 3471, 587, 745, 5995, 6016, 1288, 527, 5618, 339, 2329, 454, 2987, 2797, 1517, 6874, 3793, 1500, 2174, 260, 349, 8360, 4995, 1304, 2, 150, 1221, 36, 5952, 539, 1200, 858, 68157, 292, 904, 1270, 2804, 1206, 1198, 2791, 11, 588, 293, 1240, 1721, 1230, 3949, 1265, 44191, 509, 1035, 1222, 4226, 1968, 344, 555, 104, 161, 736, 8368, 592, 1148, 1, 3897, 25, 56367, 2395, 910, 4896, 457, 1073, 2542, 2194, 4306, 59315, 4011, 223, 597, 8665, 33794, 58559, 2000, 79132, 3114, 1394, 1291, 2716, 1028, 7438, 1393, 8636, 1080, 1203, 68954, 2858, 235, 377, 1089, 924, 2011, 6711, 5445, 2115, 40815, 2081, 6333, 5816, 10, 2918, 4034, 34, 4963, 1097, 3147, 1213, 1210, 2355, 356, 1199,

In [52]:
# Write out the lists to text files
fname_marginals = "marginals.txt"
with open(fname_marginals, "w") as f:
    for prob in marginals:
        f.write("%s\n" % prob)        

In [53]:
fname_vocab = "movie_vocab.txt"
with open(fname_vocab, "w") as f:
    for movid in movieid_vocab:
        f.write("%s\n" % movid)        

In [56]:
marginal_prob(1617, count_matrix, N) # just a sanity check for the values written in text files

0.1573373676248109

In [57]:
# Now write out the pairwise conditional probs.

In [None]:
# master text file for cond_probs. To be split later into trn, tst, and dev files.
# format for each line is: 
# (rel, mov1, mov2, prob(mov2 | mov1))

In [66]:
REL = "IsWith" # Instead of 'IsA' relation, we use 'IsWith'. Nothing substantially different.
tup_list = []
for key_pair in count_matrix.keys():
    k1, k2 = key_pair
    if k1 != k2:
        prob_k2k1 = conditional_prob(k2, k1, count_matrix)
        tmp_tup1 = (REL, k1, k2, prob_k2k1)        
        tup_list.append(tmp_tup1)
        
        prob_k1k2 = conditional_prob(k1, k2, count_matrix)
        tmp_tup2 = (REL, k2, k1, prob_k1k2)
        tup_list.append(tmp_tup2)
        

In [72]:
len(tup_list)

66306

In [76]:
tup_list[:6]

[('IsWith', 1225, 474, 0.16279069767441862),
 ('IsWith', 474, 1225, 0.2545454545454545),
 ('IsWith', 79132, 1393, 0.17204301075268819),
 ('IsWith', 1393, 79132, 0.21621621621621623),
 ('IsWith', 6711, 924, 0.36363636363636365),
 ('IsWith', 924, 6711, 0.2608695652173913)]

In [70]:
len(count_matrix)

33411

In [61]:
len(marginals)

258

In [62]:
# sanity checks!
(33411-258) * 2

66306

In [71]:
# this master file is essentially a csv file with delimiter as a blank space
# using python's inbuilt csv library for that
import csv
fname_master = "master_data.txt"
with open(fname_master, "w") as f:
    writer = csv.writer(f, delimiter=' ', lineterminator='\n')
    writer.writerows(tup_list)        

In [78]:
total = len(tup_list)
splits = (0.8, 0.1, 0.1)
trn_num = int(total * splits[0])
dev_num = int(total * splits[1])
tst_num = int(total * splits[2])

if trn_num % 2 != 0:
    trn_num += 1
trn_split = trn_num

if dev_num % 2 != 0:
    dev_num += 1
dev_split = trn_split + dev_num

# REDUNDANT
if tst_num % 2 != 0:
    tst_num += 1
tst_split = dev_split + tst_num

trn_data = tup_list[:trn_split]
dev_data = tup_list[trn_split : dev_split]
tst_data = tup_list[dev_split:]



In [79]:
len(trn_data)

53044

In [80]:
len(dev_data)

6630

In [81]:
len(tst_data)

6632

In [82]:
# write out the training file
# using python's inbuilt csv library for that
import csv
fname = "movie_train.txt"
with open(fname, "w") as f:
    writer = csv.writer(f, delimiter=' ', lineterminator='\n')
    writer.writerows(trn_data)        

In [83]:
# write out the training file
# using python's inbuilt csv library for that
import csv
fname = "movie_test.txt"
with open(fname, "w") as f:
    writer = csv.writer(f, delimiter=' ', lineterminator='\n')
    writer.writerows(tst_data)        

In [84]:
# write out the training file
# using python's inbuilt csv library for that
import csv
fname = "movie_dev.txt"
with open(fname, "w") as f:
    writer = csv.writer(f, delimiter=' ', lineterminator='\n')
    writer.writerows(dev_data)        

In [None]:
# METADATA ACCESS -- for genres.

metadata_file = datadir + 'movies_metadata.csv'
df_metadata = pd.read_csv(metadata_file, delimiter=',')


In [None]:
df_metadata.columns

In [None]:
df_metadata.head(5)

In [None]:
df_meta = df_meta[['genres', 'id']]

In [None]:
df_meta.head(5)

In [None]:
df_meta.loc[0].id

In [None]:
print(df_meta.loc[0,'genres'])

In [None]:
x = df_meta.loc[0,'genres']
print(type(x))
print(x)

In [None]:
s = x.split('},')
print(s)

In [None]:
tmp = ast.literal_eval(x)
print(type(tmp))
print(tmp)

In [None]:
ids = []
gen = []

for d in tmp:
    ids.append(d['id'])
    gen.append(d['name'])

print(ids, gen)

In [None]:
s0=s[0]
print(s0)

In [None]:
i = s0.find('id')
n = s0.find('name')
print(s0[i+5:n-3]) # the genre id
print(s0[n+8:-1])

In [None]:
s1 = s[1]
i = s1.find('id')
n = s1.find('name')
print(s1[i+5:n-3]) # the genre id
print(s1[n+8:-1])

In [None]:

# def get_genId_genres(genres_json, id_to_genre):
#     '''
#     id_to_genre: its like a global dict to store the mappings 
#                 from the genre-id to genre-name
#     '''
#     if not genres_json:
#         return None
    
#     splits = genres_json.split('},')
    
#     ids_list = []
#     gen_list = []           
    
#     for s in splits:
#         i = s.find('id')
#         n = s.find('name')
#         # print(s[i+5:n-3]) # genre id
#         # print(s[n+8:-1])  # genre name
#         id_num = int(s[i+5:n-3])
#         genre_name = s[n+8:-1]
#         ids_list.append(id_num)
#         gen_list.append(genre_name)
        
#         if id_num not in id_to_genre:
#             id_to_genre[id_num] = genre_name
    
#     return ids_list, gen_list, id_to_genre



def get_genres(genres_json, id_to_genre):
    '''
    id_to_genre: its like a global dict to store the mappings 
                from the genre-id to genre-name
    '''
    if not genres_json:
        return None
    
    tmp = ast.literal_eval(genres_json)
    ids_list = []
    gen_list = []           
    
    for d in tmp:
        id_num = d['id']
        genre_name = d['name']
        ids_list.append(id_num)
        gen_list.append(genre_name)
        
        if id_num not in id_to_genre:
            id_to_genre[id_num] = genre_name
    
    return ids_list, gen_list, id_to_genre


In [None]:
idd = {}
get_genres(x, idd)

In [None]:
genid_dict = {}
genname_dict = {}
gen_counts = defaultdict(int) # counting the genre-genre pairs
id_to_genre = {}

for index, row in df_meta.iterrows():    
    
    try:
        movId = int(row['id'])
        gen_json = row['genres']
    
        ids, genres, id_to_genre = get_genres(gen_json, id_to_genre)    
        genid_dict[movId] = ids
        genname_dict[movId] = genres

        for i in ids:
            singleton_pair = (i, i)
            gen_counts[singleton_pair] += 1

        for pair in itertools.combinations(ids, r=2):
            # id1, id2 = pair
            gen_counts[pair] += 1
            
    except Exception as e:
        print(index, row['id'])
        repr(e)


In [None]:
print(id_to_genre)
print(len(id_to_genre))

In [None]:
genid_dict

In [None]:
genname_dict

In [None]:
gen_counts

In [None]:
import operator
for w in sorted(gen_counts, key=gen_counts.get, reverse=True):
    val = gen_counts[w]
    g1 = id_to_genre[w[0]]
    g2 = id_to_genre[w[1]]
    print(g1, g2, val)

In [None]:
diffdict = {}
for k,v in gen_counts.items():
    if k[0] != k[1]:
        p1 = gen_counts[k]/gen_counts[(k[0],k[0])]
        p2 = gen_counts[k]/gen_counts[(k[1],k[1])]
        diff = p1 - p2
        diffdict[k] = diff

        

In [None]:
# getting a crude map

for w in sorted(diffdict, key=diffdict.get, reverse=True):
    val = diffdict[w]
    g1 = id_to_genre[w[0]]
    g2 = id_to_genre[w[1]]
    print(g1, g2, val)