In [1]:
import pandas as pd
import pickle 
import nltk
from nltk.corpus import wordnet as wn
import numpy as np
from scipy.spatial.distance import cosine
from scipy.spatial import distance

In [2]:
df_book = pd.read_csv('integrated_data.csv')
book_genre_df = df_book[['title', 'genre']]
book_genre_df = book_genre_df.drop_duplicates() # getting the list of unique books (no repetitions)
book_genre_df.shape

(341200, 2)

In [3]:
book_genre_df[:2]

Unnamed: 0,title,genre
0,Captain America: Winter Soldier (The Ultimate ...,comics
3,Bounty Hunter 4/3: My Life in Combat from Mari...,comics


In [4]:
movie_df = pd.read_csv('movie_map.csv') # creating a dataframe having unique movies and its respective genre

In [5]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


**Unpickling the result of ALS on book data**

Unpickling the dictionary of recommendations, generated by running the ALS algorithm on good_reads book dataset.

In [6]:
def unpickling_data():
    file = open('book_rec.pickle','rb')
    pickled_rec = pickle.load(file)
    file.close()
    return pickled_rec

rec = unpickling_data()

In [7]:
'''Book recommendation for the test users  
with id_
'''

id_ = '7375c7dcaa4b586d4c6854f3c9c11903'
val = rec.get(id_)

**Result obtained from ALS**

Converting the recommendations dictionary to form a dataframe, showing the predicted rating from ALS and the book's respective genre.

In [8]:
res = list(zip(*val)) # to convert a list of tuples to dataframe
book_list = res[0]
pred_rating = res[1]
#print(book_list)
pred_rating
recommended_books = pd.DataFrame(val, columns =['Title', 'Predicted Rating'])
book_genre_list = [book_genre_df.loc[book_genre_df['title'] == each]['genre'].values[0] for each in recommended_books['Title']]
#type(result)
recommended_books['genre'] = book_genre_list

print('Book recommendations for the book user:', id_)

recommended_books.head() # Top book recommendations for the given user id 

Book recommendations for the book user: 7375c7dcaa4b586d4c6854f3c9c11903


Unnamed: 0,Title,Predicted Rating,genre
0,"Betrayers (The Devil's Roses, #9)",4.953307,Horror
1,"The Siren (The Soul Summoner, #2)",4.953307,Horror
2,"Alpha (Jad Bell, #1)",3.493938,Thriller
3,Earthly Powers,3.399424,Thriller
4,"Savage (The Kingwood Duet, #1)",3.293544,Romance


In [9]:
# A list of genre 
book_genre = ['Comics', 'Horror', 'Mystery', 'Graphics', 'Adventure', 'Fantasy', 'Romance', 'Crime', 'Thriller']

In [10]:
movie_genre = movie_df['genres']
movie_genre = movie_genre.drop_duplicates()

In [11]:

movie_genre.tolist()
movie_genre = [i.split('|', 1)[0] for i in movie_genre]
movie_genre
unique_movie_genre = [] 
for i in movie_genre: 
    if i not in unique_movie_genre: 
        unique_movie_genre.append(i) 
unique_movie_genre.remove('(no genres listed)')
print(unique_movie_genre)

['Adventure', 'Comedy', 'Action', 'Drama', 'Crime', 'Children', 'Mystery', 'Documentary', 'Animation', 'Thriller', 'Horror', 'Fantasy', 'Western', 'Film-Noir', 'Romance', 'War', 'Sci-Fi', 'Musical', 'IMAX']


In [12]:
similar_genre = pd.DataFrame(unique_movie_genre, columns = ['movie_genre'])

#similar_genre = [ token for idx, token in enumerate(set(itertools.chain.from_iterable(unique_movie_genre)))]
similar_genre = pd.DataFrame(similar_genre)

#similar_genre

Along with the dataframe to show the mapping of genres, we are also creating a dictionary with keys of book genre and values as the list of synonyms from the movie genre. This dictionary created, helps in faster transformation of the book user profile.

In [13]:
book_movie_genre_map = {} # Creating a dictionary of te mapping
def find_relation(str):
    value = str
    str = str.replace('-', ' ')
    if(len(str.split())==1):
        a = wn.synsets(str) # synsets is available in Natural Language ToolKit used to find synonimous words
        if len(a) == 0:
            return 'Null' # If null we are not adding it to dictionary
        else:
            x = []
            for genre in book_genre:
                m = wn.synsets(genre)
                x.append(a[0].path_similarity(m[0]))
            x = np.array(x, dtype = float)
            max_idx = x.argmax()
            key = book_genre[max_idx] # Dictionary key is the book genre name
            val = book_movie_genre_map.get(key, []) # Value for the key is the list synonyms for that book genre
            val.append(value)
            book_movie_genre_map[key] = val
            return book_genre[max_idx]
    else:
        a = wn.synsets(str.split()[0])
        b = wn.synsets(str.split()[1])
        if not a and not b:
            return 'Null'
        elif (a and not b) or (not a and b):    
            if a and not b:
                a = wn.synsets(str.split()[0])
            else:
                a = wn.synsets(str.split()[1])
            if not a:
                return 'Null'
            else:
                x = []
                for genre in book_genre:
                    m = wn.synsets(genre)
                    x.append(a[0].path_similarity(m[0]))
                x = np.array(x, dtype = float)
                max_idx = x.argmax()
                key = book_genre[max_idx]
                val = book_movie_genre_map.get(key, [])
                val.append(value)
                book_movie_genre_map[key] = val
                return book_genre[max_idx]
        else:
            x = []
            y = []
            for genre in book_genre:
                m = wn.synsets(genre)
                x.append(a[0].path_similarity(m[0]))
                y.append(b[0].path_similarity(m[0]))
            x = np.array(x, dtype = float)
            y = np.array(y, dtype = float)
            if (x.max() > y.max()):
                max_idx = x.argmax()
            else:
                max_idx = y.argmax()
            key = book_genre[max_idx]
            val = book_movie_genre_map.get(key, [])
            val.append(value)
            book_movie_genre_map[key] = val
            return book_genre[max_idx]
    

similar_genre['relation_with_book_genre'] = similar_genre['movie_genre'].apply(find_relation)

similar_genre
book_movie_genre_map['comics'] = ['Children'] # adding this mapping as it was not obtained from NLP
book_movie_genre_map

{'Adventure': ['Adventure', 'Action', 'War'],
 'Graphics': ['Comedy',
  'Drama',
  'Children',
  'Documentary',
  'Western',
  'Film-Noir',
  'Musical'],
 'Crime': ['Crime'],
 'Mystery': ['Mystery'],
 'Romance': ['Animation', 'Romance'],
 'Thriller': ['Thriller'],
 'Horror': ['Horror'],
 'Fantasy': ['Fantasy'],
 'comics': ['Children']}

In [14]:
# Removing the meaningless and null mappings
print('Mapping of the book genre to movie genre using NLP:')
similar_genre

Mapping of the book genre to movie genre using NLP:


Unnamed: 0,movie_genre,relation_with_book_genre
0,Adventure,Adventure
1,Comedy,Graphics
2,Action,Adventure
3,Drama,Graphics
4,Crime,Crime
5,Children,Graphics
6,Mystery,Mystery
7,Documentary,Graphics
8,Animation,Romance
9,Thriller,Thriller


In [15]:
# reading book user profiles to the dataframe
book_user = pd.read_csv('book_userprofile.csv')
movie_profile = pd.read_csv('movie_profile.csv')

In [16]:
book_user.head()

Unnamed: 0.1,Unnamed: 0,user_id,Horror,Romance,Thriller,comics,Graphics,Adventure,Fantasy,Mystery,Crime
0,0,00009e46d18f223a82b22da38586b605,0.180952,0.175238,0.118095,0.03619,0.03619,0.03619,0.180952,0.118095,0.118095
1,1,0000c3d51aa099745e93a4e99c4856c8,0.219628,0.216293,0.094807,0.02001,0.02001,0.02001,0.219628,0.094807,0.094807
2,2,0001085188e302fc6b2568de45a5f56b,0.166667,0.166667,0.166667,0.0,0.0,0.0,0.166667,0.166667,0.166667
3,3,00013f3df6711fc887013c9692de4416,0.166667,0.166667,0.166667,0.0,0.0,0.0,0.166667,0.166667,0.166667
4,4,000157a6f8331e9c9a21252e1fee91d1,0.166667,0.166667,0.166667,0.0,0.0,0.0,0.166667,0.166667,0.166667


In [17]:
# Dropping the no genre listed, IMAX and Sci-Fi as they did not map to any genre from the user profile

movie_profile = movie_profile.drop(['IMAX', '(no genres listed)', 'Sci-Fi'], axis = 1)

In [18]:
print('Shape of the dataframe', movie_profile.shape) # gives the number of movies present in the dataset

print(movie_profile.columns)
movie_profile.head()

Shape of the dataframe (27278, 19)
Index(['movieId', 'title', 'Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Thriller', 'War',
       'Western'],
      dtype='object')


Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Thriller,War,Western
0,1,Toy Story (1995),0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Transforming user profiles Using NLP

In order to calculate the similarity between any two feature vectors, their dimensions have to be same.
To have the same dimension and genre order as the movie profile, transformation of the book user profile is required.
Then the similarity between the book user profile and every movie profile is computed. More similar profiles have lesser cosine distance between them.

In [19]:
# Movie recommendations for the test user 0001085188e302fc6b2568de45a5f56b from book dataset

book_user.loc[book_user['user_id'] == id_]

Unnamed: 0.1,Unnamed: 0,user_id,Horror,Romance,Thriller,comics,Graphics,Adventure,Fantasy,Mystery,Crime
147988,147988,7375c7dcaa4b586d4c6854f3c9c11903,0.187023,0.187023,0.101781,0.044529,0.044529,0.044529,0.187023,0.101781,0.101781


In [20]:
#Creating a dataframe for book user to have the genre list as the movie profiles

trans_user = pd.DataFrame(columns = ['Action', 'Adventure', 'Animation', 'Children',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
       'Horror', 'Musical', 'Mystery', 'Romance', 'Thriller', 'War',
       'Western'])


In [21]:
test_user = book_user.loc[book_user['user_id'] == id_]
test_user.columns[2:]
for col in test_user.columns[2:]:
    result_columns = book_movie_genre_map[col]
    for result_column in result_columns:
        #print(col, result_column)
        trans_user[result_column] = test_user[col]
#trans_user

In [22]:
print('Movie profile:', movie_profile.loc[0][2:].to_numpy()) # movie profile example

print('User profile after transformation:', trans_user.to_numpy()) # user profile 

Movie profile: [0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0]
User profile after transformation: [[0.04452926 0.04452926 0.1870229  0.04452926 0.04452926 0.10178117
  0.04452926 0.04452926 0.1870229  0.04452926 0.1870229  0.04452926
  0.10178117 0.1870229  0.10178117 0.04452926 0.04452926]]


In [23]:
df_cosine = pd.DataFrame(movie_profile['movieId'], columns =  ['movieId'])
title = movie_profile['title']
df_cosine = df_cosine.join(title)

In [24]:
cos_distance = []

for i,each in enumerate(movie_profile['movieId']):
    cos_value = distance.cosine(movie_profile.loc[i][2:].to_numpy(), trans_user.to_numpy())
    cos_distance.append(cos_value)
    
df_cosine['cosine distance'] = cos_distance

  dist = 1.0 - uv / np.sqrt(uu * vv)


Sorting (in ascending order) the dataframe based on the cosine distance arranges the movies which are most similar to the target book user.

In [25]:
#print('Movie Recommendations for the test user:', id_)
df_cosine = df_cosine.sort_values(by = 'cosine distance') # Lower the cosine distance more similar are the profiles
df_cosine = df_cosine[:5] # Creating a dataframe with the movies which are most similar to the user profile
#df_cosine.head()

In [26]:
# Getting the genre of the top movies to compare with the user profile for inference
movie_genre_list = [movie_df.loc[movie_df['movieId'] == each]['genres'].values for each in df_cosine['movieId']]

In [27]:
print('Movie Recommendations for the test user:', id_)
df_cosine['genre'] = movie_genre_list
df_cosine.head()

Movie Recommendations for the test user: 7375c7dcaa4b586d4c6854f3c9c11903


Unnamed: 0,movieId,title,cosine distance,genre
2994,3081,Sleepy Hollow (1999),0.241289,[Fantasy|Horror|Mystery|Romance]
8595,26080,Eegah (1962),0.241289,[Fantasy|Horror|Romance|Thriller]
1309,1339,Dracula (Bram Stoker's Dracula) (1992),0.241289,[Fantasy|Horror|Romance|Thriller]
10084,33294,Vampire Hunter D (1985),0.258439,[Animation|Fantasy|Horror]
14800,74095,Wicked City (Yôjû toshi) (1987),0.258439,[Animation|Fantasy|Horror|Sci-Fi]


In [28]:
print('book recommendations for the user:', id_)
recommended_books.head()

book recommendations for the user: 7375c7dcaa4b586d4c6854f3c9c11903


Unnamed: 0,Title,Predicted Rating,genre
0,"Betrayers (The Devil's Roses, #9)",4.953307,Horror
1,"The Siren (The Soul Summoner, #2)",4.953307,Horror
2,"Alpha (Jad Bell, #1)",3.493938,Thriller
3,Earthly Powers,3.399424,Thriller
4,"Savage (The Kingwood Duet, #1)",3.293544,Romance


**Inference**

With this we can infer that the genre most liked by the users based on his book ratings is almost same as the genre of the movies recommended to the user with a few similar genres.

There are few more genres of movies recommended which are not present in the book genre. These are obtained by NLP mapping. The new genres are closely similar but not exactly the same as the genres liked by the user. This might introduce certain level of novelty and serendipity into the recommender system.

**References:**

https://github.com/vjvishaljha/Recommender_Sys/blob/master/Recommender%20System.ipynb <br>
https://recsys.acm.org/wp-content/uploads/2014/10/recsys2014-tutorial-cross_domain.pdf <br>
https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea <br>
https://github.com/asciel/Cross-domain-Recommendation <br>
https://www.programcreek.com/python/example/91609/nltk.corpus.wordnet.path_similarity <br>
https://www.researchgate.net/profile/Richa_Sharma122/publication/307532631_MOVBOK_A_Personalized_Social_Network_Based_Cross_Domain_Recommender_System/links/5aed6f920f7e9b01d3e1769b/MOVBOK-A-Personalized-Social-Network-Based-Cross-Domain-Recommender-System.pdf <br>

