In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.utils.extmath import randomized_svd

In [2]:
data = pd.io.parsers.read_csv('rating_data.csv', engine='python', delimiter=',', encoding='latin1')
book_data = pd.io.parsers.read_csv('book_data.csv', engine='python', delimiter=',')
book_data = book_data.sort_values(by='book_id')
book_data = book_data.drop_duplicates()
book_data.reset_index(drop=True, inplace=True)
print(book_data)

          book_id                                              title  \
0               1  Harry Potter and the Half-Blood Prince (Harry ...   
1               2  Harry Potter and the Order of the Phoenix (Har...   
2               3  Harry Potter and the Sorcerer's Stone (Harry P...   
3               5  Harry Potter and the Prisoner of Azkaban (Harr...   
4               6  Harry Potter and the Goblet of Fire (Harry Pot...   
...           ...                                                ...   
1439787  36525824             ফুলগুলি যেন কথা (Phoolguli jeno katha)   
1439788  36526448                            João y el oso Antártica   
1439789  36529314                         The Uprising (Erafeen, #3)   
1439790  36529772                                             Brevet   
1439791  36530431                                 Planning Christmas   

                                                    genres  
0        ['fantasy' 'paranormal' 'young-adult' 'fiction...  
1        ['fa

In [3]:
book_ids_df1 = set(book_data['book_id'])
book_ids_df2 = set(data['book_id'])

# Calculate the set difference to find the IDs that are in df1 but not in df2
ids_only_in_df1 = book_ids_df1 - book_ids_df2
print(ids_only_in_df1)

{22839296, 34275329, 6619138, 29425670, 8650758, 18776070, 14614536, 28016649, 28016650, 15728650, 35913741, 4751374, 27426831, 29523984, 35880977, 262162, 11632653, 29098004, 16187414, 32702487, 23167000, 20381718, 7602203, 28016668, 28016669, 28016670, 21725217, 26542113, 31916067, 3276836, 7110695, 22577192, 4915241, 13697073, 25657398, 15826998, 589880, 27131964, 17924158, 6455358, 7569470, 1278017, 8388675, 35225673, 12681290, 32079945, 7110731, 13828173, 3080266, 28934222, 35717201, 13369426, 27394133, 25198678, 30834779, 11436123, 34078815, 29098081, 2162787, 32309348, 655461, 14581862, 27426918, 26148968, 1474664, 2588774, 8880235, 24510568, 27230317, 27787373, 32866413, 32309366, 21168247, 1638519, 19038329, 36241527, 23298174, 12845184, 4489344, 34308226, 27918464, 32735361, 18350214, 4620425, 28573834, 2031755, 35782796, 1081485, 23298187, 20381833, 6652056, 27885721, 34144410, 18219163, 5079196, 10191002, 2490526, 23003296, 7045281, 7602336, 31522979, 360619, 12091564, 2686

In [4]:
user_ids = data['user_id'].unique()
book_ids = data['book_id'].unique()
book_id_to_index = {book_id: index for index, book_id in enumerate(book_ids)}
user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}

row = data['user_id'].apply(lambda x: user_id_to_index[x]).values
col = data['book_id'].apply(lambda x: book_id_to_index[x]).values

print(user_ids)
print(book_ids)

[     0      1      2 ... 171833 171834 171835]
[      21       30       45 ... 36110997 36217419 36217772]


In [5]:
"""data_values = data['rating'].astype(float).values
matrix = csr_matrix((data_values, (col, row)))
print(matrix.shape)"""
matrix = csr_matrix((data.rating.values, (data.user_id.values, data.book_id.values)))
print(matrix.shape)

(171836, 36530432)


In [6]:
U, S, V = randomized_svd(matrix, 
                              n_components=15,
                              n_iter=5,
                              random_state=42)


In [12]:
def top_cosine_similarity(data, book_id, top_n=10):
    index = book_id - 1
    book_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(book_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

def print_similar_books(book_data, book_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
        book_data[book_data.book_id == book_id].title.values[0]))
    for index in top_indexes + 1:
        if index in book_data.index:
            print(book_data.iloc[index].title)
            print("\n")
        else:
            print(book_data[book_data["book_id"] == index - 1].title.values[0])
            print("\n")


k = 50
book_id = 1
top_n = 10
sliced = V.T[:, :k]
indexes = top_cosine_similarity(sliced, book_id, top_n)
print_similar_books(book_data, book_id, indexes)

  similarity = np.dot(book_row, data.T) / (magnitude[index] * magnitude)


Recommendations for Harry Potter and the Half-Blood Prince (Harry Potter, #6): 

Harry Potter and the Order of the Phoenix (Harry Potter, #5)


Harry Potter Boxed Set, Books 1-5 (Harry Potter, #1-5)


The Power Of Meditation: Zen Guide To Relieving Stress And Living A Peaceful Life


Miguel's Secret Journal


The Drawings of the Electric Pencil


Five Big Questions in Life: And How to Answer Them


The Strange Life of Walenty Karnowski: The Rabbi's Illegitimate Grandson


The Rift


Death at willow creek mine


Beautiful Twisted Night




In [4]:
pd.read_feather("./data/english-books.feather").to_csv("X:/english-books.csv", index=False)

In [3]:
import pandas as pd

def get_interactions_data():
    orig_interactions = pd.read_feather("./data/english-interactions.feather")
    adjusted_ratings_poetry = pd.read_csv("./data/adjusted_rating_poetry.csv", engine='python', delimiter=',', encoding='latin1')
    adjusted_ratings_thriller = pd.read_csv("./data/adjusted_rating_thrillers.csv", engine='python', delimiter=',', encoding='latin1')
    book_data = pd.read_feather("./data/english-books.feather")
    book_data['book_id'] = book_data['book_id'].astype(int)
    
    

    merged_df = merge_df(orig_interactions, adjusted_ratings_poetry, book_data)
    merged_df = merge_df(merged_df, adjusted_ratings_thriller, book_data)

    merged_df.to_csv("X:/interactions_with_adjusted_ratings.csv", index=False)

def merge_df(interactions_df, adjusted_rating_df, book_df):
    book_df['book_id'] = book_df['book_id'].astype(int)
    
    adjusted_rating_df = adjusted_rating_df.drop(columns=["review_id", "rating", "sentiment_score", "text_length", "review_text"])

    interactions_df['book_id'] = interactions_df['book_id'].astype(int)
    interactions_df['user_id'] = interactions_df['user_id'].astype(str)
    adjusted_rating_df['book_id'] = adjusted_rating_df['book_id'].astype(int)
    adjusted_rating_df['user_id'] = adjusted_rating_df['user_id'].astype(str)
    merged_data = pd.merge(interactions_df, adjusted_rating_df, on=['book_id', 'user_id'], how='left')
    merged_data['rating'].fillna(merged_data['adjusted_rating'], inplace=True)
    
    merged_data.drop(columns=['adjusted_rating'], inplace=True)
    return merged_data[merged_data['book_id'].isin(book_df['book_id'])]

get_interactions_data()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_data['rating'].fillna(merged_data['adjusted_rating'], inplace=True)


Unnamed: 0,rating,book_id,user_id
0,5,21,8842281e1d1347389f2ab93d60773d4d
1,5,21,ab2923b738ea3082f5f3efcbbfacb218
2,3,21,93c5e16254e7838b69178338bb20459e
3,5,21,3465f056f8146c0aa9af1fd764d7d634
4,4,21,18759758c7cf75975cff3fbb1363a73f
...,...,...,...
84551722,5,36110583,e223be160b89f218dbee70b5fbdccf76
84551723,5,36110839,e223be160b89f218dbee70b5fbdccf76
84551724,5,36110997,e223be160b89f218dbee70b5fbdccf76
84551725,5,36217419,e223be160b89f218dbee70b5fbdccf76
