In [1]:
import pandas as pd

ratings = pd.read_csv("data/ratings.csv", sep = ',')
movies = pd.read_csv("data/movies.csv", sep = ',')
tags = pd.read_csv("data/tags.csv", sep = ',')

In [2]:
import numpy as np
import math

# build a TF dataframe
tf = tags.groupby(['movieId','tag'], as_index=False, sort=False)\
         .count()\
         .rename(columns = {'userId':'tag_count_tf'})[['movieId','tag','tag_count_tf']]
tag_distinct = tags[['tag','movieId']].drop_duplicates()

# build a DF dataframe
df = tag_distinct.groupby(['tag'], as_index=False, sort=False)\
                 .count()\
                 .rename(columns = {'movieId':'tag_count_df'})[['tag','tag_count_df']]

# compute TF-IDF values
idf = math.log10(len(np.unique(tags['movieId'])))
df['idf'] = idf-np.log10(df['tag_count_df'])

tf = pd.merge(tf, df, on='tag', how='left', sort=False)
tf['tf-idf'] = tf['tag_count_tf']*tf['idf']

# show TF-IDF values for each movie
#tf[['movieId','tag','tf-idf']].head()

In [3]:
# get the vector length with rows of movieId and columns of TF-IDF
vect_length = tf.loc[:,('movieId','tf-idf')]

# normalize the vector by unit length
vect_length['tf-idf-sq'] = vect_length['tf-idf']**2
vect_length = vect_length.groupby(['movieId'], as_index=False, sort=False)\
                   .sum()\
                   .rename(columns = {'tf-idf-sq':'tf-idf-sq-total'})[['movieId','tf-idf-sq-total']]
vect_length['vect_length'] = np.sqrt(vect_length[['tf-idf-sq-total']].sum(axis=1))

tf = pd.merge(tf, vect_length, on='movieId', how='left', sort=False)
tf['tag_vec'] = tf['tf-idf']/tf['vect_length']

# display the feature unit length vector of each movie: 'tag_vec'
#tf[tf['movieId'] == 60756][['movieId','tag','tf-idf','vect_length','tag_vec']].head()

Compute user profile vector
---------------------------------

# Step 3-1. Calculate user profile: sum of the item-tag vectors of all items with positive ratings (>=3)

In [4]:
import pandas as pd

ratings_filter = ratings[ratings['rating']>=3]
user_distinct = np.unique(ratings['userId'])
user_tag_pref = pd.DataFrame()
i = 1

# enter userId for analysis
userId = 65

# compute the profile vector for the selected user
user_index = user_distinct.tolist().index(userId)

for user in user_distinct[user_index:user_index+1]:
            
    user_data= ratings_filter[ratings_filter['userId']==user]
    user_data = pd.merge(tf,user_data, on = 'movieId', how = 'inner', sort = False)
    user_data_itr = user_data.groupby(['tag'], as_index = False, sort = False)\
                             .sum()\
                             .rename(columns = {'tag_vec': 'tag_pref'})[['tag','tag_pref']]

    user_tag_pref = user_tag_pref.append(user_data_itr, ignore_index=True)

# display the user profile vector: 'tag_pref'
user_tag_pref['userId'] = userId

In [11]:
user_tag_pref_all = user_tag_pref
movie_distinct = np.unique(tf['movieId'])
tag_merge_all = pd.DataFrame()

for movie in movie_distinct:
        
    tf_movie = tf[tf['movieId']==movie]
    tag_merge = pd.merge(tf_movie, user_tag_pref_all, on = 'tag', how = 'left', sort = False)
    tag_merge['tag_pref'] = tag_merge['tag_pref'].fillna(0)
    tag_merge['tag_value'] = tag_merge['tag_vec']*tag_merge['tag_pref']
        
    tag_vec_val = np.sqrt(np.sum(np.square(tag_merge['tag_vec']), axis=0))
    tag_pref_val = np.sqrt(np.sum(np.square(user_tag_pref_all['tag_pref']), axis=0))
    tag_merge_final = tag_merge.groupby(['userId','movieId'])[['tag_value']]\
                                   .sum()\
                                   .rename(columns = {'tag_value': 'rating'})\
                                   .reset_index()
        
    tag_merge_final['rating']=tag_merge_final['rating']/(tag_vec_val*tag_pref_val)
        
    tag_merge_all = tag_merge_all.append(tag_merge_final, ignore_index=True)
    
# remove movies already rated by user
movies_rated = ratings[ratings['userId'] == userId]['movieId']
tag_merge_all = tag_merge_all[~tag_merge_all['movieId'].isin(movies_rated)]
tag_merge_all['userId'] = tag_merge_all['userId'].apply(np.int64)

print(tag_merge_all.shape)

# display the top 10 movies by rating
tag_merge_all = tag_merge_all.head(15)
tag_merge_all = pd.merge(tag_merge_all, movies, on = 'movieId', how = 'left')
tag_merge_all.head(20)

(309, 3)


Unnamed: 0,userId,movieId,rating,title,genres
0,65,2.0,0.045759,Jumanji (1995),Adventure|Children|Fantasy
1,65,11.0,0.043311,"American President, The (1995)",Comedy|Drama|Romance
2,65,14.0,0.043311,Nixon (1995),Drama
3,65,32.0,0.049624,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
4,65,47.0,0.129279,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
5,65,50.0,0.08652,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
6,65,101.0,0.017243,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance
7,65,110.0,0.035917,Braveheart (1995),Action|Drama|War
8,65,116.0,0.059272,Anne Frank Remembered (1995),Documentary
9,65,147.0,0.035217,"Basketball Diaries, The (1995)",Drama
