In [1]:
from IPython.display import display
import pandas as pd
import numpy as np

# Import seaborn for statistic evaluation
import seaborn as sns
# Apply the default theme
sns.set_theme()
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors
import sklearn
import re
import ipywidgets as widgets
from IPython.display import display, clear_output
from contextlib import contextmanager
import os, sys

from sklearn.model_selection import train_test_split

In [2]:
book_path = r'C:\Users\Legion\Downloads\b\1INTRO2AI\BX-Books.csv'
user_path = r'C:\Users\Legion\Downloads\b\1INTRO2AI\BX-Users.csv'
rating_path = r'C:\Users\Legion\Downloads\b\1INTRO2AI\BX-Book-Ratings.csv'
books = pd.read_csv(book_path, encoding='latin-1' , on_bad_lines='skip' , sep=';' , low_memory=False, escapechar='\\')
users = pd.read_csv(user_path, encoding='latin-1' , on_bad_lines='skip' , sep=';' , low_memory=False, escapechar='\\')
ratings = pd.read_csv(rating_path, encoding='latin-1' , on_bad_lines='skip' , sep=';' , low_memory=False, escapechar='\\')

In [3]:
books.dtypes

ISBN                   object
Book-Title             object
Book-Author            object
Year-Of-Publication     int64
Publisher              object
Image-URL-S            object
Image-URL-M            object
Image-URL-L            object
dtype: object

In [4]:
pd.set_option('display.max_colwidth', None)

In [5]:
users.dtypes

User-ID       int64
Location     object
Age         float64
dtype: object

In [6]:
ratings.dtypes

User-ID         int64
ISBN           object
Book-Rating     int64
dtype: object

In [7]:
# Data Preprocessing

books.loc[books['Book-Author'].isnull(), 'Book-Author'] = "No author"
books.loc[books['Publisher'].isnull(), 'Book-Author'] = "Other"

In [8]:
print(users.shape)
users.head()
print(sorted(users['Age'].unique()))

(278858, 3)
[nan, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 118.0, 119.0, 123.0, 124.0, 127.0, 128.0, 132.0, 133.0, 136.0, 137.0, 138.0, 140.0, 141.0, 143.0, 146.0, 147.0, 148.0, 151.0, 152.0, 156.0, 157.0, 159.0, 162.0, 168.0, 172.0, 175.0, 183.0, 186.0, 189.0, 199.0, 200.0, 201.0, 204.0, 207.0, 208.0, 209.0, 210.0, 212.0, 219.0, 220.0, 

In [9]:
users.loc[(users['Age'] < 5) | (users['Age'] > 100), 'Age'] = pd.NA
users['Age'] = users['Age'].fillna(users['Age'].mean())
users['Age'] = users['Age'].astype(np.int64)

In [10]:
ratings['Book-Rating'].unique()

array([ 0,  5,  3,  6,  8,  7, 10,  9,  4,  1,  2], dtype=int64)

In [11]:
# print(users.shape[0] * books.shape[0])
# Check if people rated is in database
# r_existpeople = ratings[ratings['User-ID'].isin(users['User-ID'])]
# print(r_existpeople.shape)

# Check if books rated is in database
r_existbook = ratings[ratings['ISBN'].isin(books['ISBN'])]
r_existbook

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149774,276704,0876044011,0
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10


In [12]:
ratings_nonzero = r_existbook[r_existbook['Book-Rating'] != 0]
# ratings_explicit
ratings_allzero = r_existbook[r_existbook['Book-Rating'] == 0]
# ratings_implicit
ratings_nonzero

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
8,276744,038550120X,7
16,276747,0060517794,9
...,...,...,...
1149771,276704,0743211383,7
1149773,276704,0806917695,5
1149775,276704,1563526298,9
1149777,276709,0515107662,10


In [13]:
# reducing to user rating 100 books and books have 100 ratings
used_rating = r_existbook
rating_filter = r_existbook

counts1 = rating_filter['User-ID'].value_counts()
counts = rating_filter['ISBN'].value_counts()
print(counts1)
print(counts)

used_rating = used_rating[used_rating['User-ID'].isin(counts1[counts1 >= 100].index)]
used_rating = used_rating[used_rating['ISBN'].isin(counts[counts >= 50].index)]

used_rating = used_rating[used_rating['Book-Rating'] != 0]

print(used_rating)

11676     11144
198711     6456
153662     5814
98391      5779
35859      5646
          ...  
116792        1
116798        1
116801        1
116810        1
276721        1
Name: User-ID, Length: 92107, dtype: int64
0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
              ... 
0739431536       1
072324801X       1
1401300006       1
1568659830       1
0806917695       1
Name: ISBN, Length: 270170, dtype: int64
         User-ID        ISBN  Book-Rating
1456      277427  002542730X           10
1474      277427  0061009059            9
1522      277427  0316776963            8
1543      277427  0345413903           10
1564      277427  0380702843            8
...          ...         ...          ...
1149564   276680  0452283205            7
1149592   276680  0743203631            7
1149604   276680  0743486226            6
1149629   276680  1573229083            7
1149637   276680  1931561648            9

[31418 rows x 3 columns]


In [14]:
used_rating = used_rating.sample(frac = 1)

ratings_matrix = used_rating.pivot(index = 'User-ID', columns = 'ISBN', values = 'Book-Rating')
userID = ratings_matrix.index
ISBN = ratings_matrix.columns
print(ratings_matrix.shape)
ratings_matrix = ratings_matrix.fillna(0)

print(used_rating)
ratings_matrix

(1560, 2160)
        User-ID        ISBN  Book-Rating
173336    37644  0553274295           10
649724   157247  0451188489            9
462165   110934  0440211727            8
440917   105517  1558743316            7
899974   217740  0312979479            9
...         ...         ...          ...
337420    80538  0345417623            8
341301    81492  0345342968            9
84366     17859  0684195976            9
259513    60244  0064405176            9
223999    52350  0156007754            9

[31418 rows x 3 columns]


ISBN,000649840X,0007110928,002026478X,0020442203,002542730X,0028604199,006000438X,0060008032,0060008776,006001203X,...,1860492592,1878424319,1885171080,1931561648,3257228007,3257229534,3404148665,3423202327,3442541751,3492045170
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# KNN Function
#setting global variables
global metric,k
k=10
global_metric='cosine'
global_algorithm = 'brute'

In [16]:
def findksimilarusers(user_id, ratings, metric=global_metric, algo=global_algorithm,k=k):
    similarities=[]
    indices=[]
    model_knn = NearestNeighbors(metric = metric, algorithm = algo)
    model_knn.fit(ratings.values)
    loc = ratings.index.get_loc(user_id)
    distances, indices = model_knn.kneighbors(ratings.iloc[loc, :].values.reshape(1, -1), n_neighbors = k+1)
    similarities = 1 - distances.flatten()
    
    return similarities, indices

In [17]:
#This function predicts rating for specified user-item combination based on user-based approach
def predict_userbased(user_id, item_id, ratings, metric = global_metric, algorithm = global_algorithm, k=k):
    prediction=0
    user_loc = ratings.index.get_loc(user_id)
    item_loc = ratings.columns.get_loc(item_id)
    similarities, indices = findksimilarusers(user_id, ratings, metric, algorithm, k) #similar users based on cosine similarity
    mean_rating = ratings.iloc[user_loc, :].mean() #to adjust for zero based indexing
    sum_wt = np.sum(similarities)-1
    product=1
    wtd_sum = 0 
    
    for i in range(0, len(indices.flatten())):
        if indices.flatten()[i] == user_loc:
            continue;
        else: 
            ratings_diff = ratings.iloc[indices.flatten()[i],item_loc]-np.mean(ratings.iloc[indices.flatten()[i],:])
            product = ratings_diff * (similarities[i])
            wtd_sum = wtd_sum + product
    
    #in case of very sparse datasets, using correlation metric for collaborative based approach may give negative ratings
    #which are handled here as below
    if prediction <= 0:
        prediction = 1   
    elif prediction >10:
        prediction = 10
    
    prediction = int(round(mean_rating + (wtd_sum/sum_wt)))
    # print ('\nPredicted rating for user {0} -> item {1}: {2}'.format(user_id,item_id,prediction))

    return prediction

In [18]:
# Skip this line because 0001056107 may or may not exist in the pivot table depending on how books & users are filtered
# predict_userbased(11676,'0001056107',ratings_matrix);

In [19]:
def recommendedItem(user_id, ratings, metric=global_metric):
    if (user_id not in ratings.index.values) or type(user_id) is not int:
        print("User id should be a valid integer from this list : \n\n {}".format(re.sub('[\[\]]', '', np.array_str(ratings_matrix.index.values))))
    else:
        prediction = []
        
#         ids = ['Item-based (correlation)', 'Item-based (cosine)', 'User-based (correlation)', 'User-based (cosine)']
#         select = widgets.Dropdown(options = ids, value = ids[0], description = ' Select approach', width = '1000px')
#         def on_change(change):
#             clear_output(wait = True)
#             prediction = []
#             if change['type'] == 'change' and change['name'] == 'value':
#                 if (select.value == 'Item-based (correlation)') | (select.value == 'User-based (correlation)'):
#                     global_metric = 'correlation'
#                 else:
#                     global_metric = 'cosine'
                    
#                 with suppress_stdout():
#         if (select.value == 'User-based (correlation)') | (select.value == 'User-based (cosine)'):

        total = ratings.shape[1]
        for i in range(ratings.shape[1]):
            if (ratings[str(ratings.columns[i])][user_id] != 0): #not rated already
                print("\r{0:<0.2f}% Finished".format(i*100/total), end='')
                prediction.append(predict_userbased(user_id, str(ratings.columns[i]) ,ratings, metric))
            else:                    
                prediction.append(-1) #for books that user already rated
#         else:
#             print("Item-based not implemented yet")
        print("\r100.00% Finished")
        
        prediction = pd.Series(prediction)
        prediction = prediction.sort_values(ascending = False)
        print("...")
        print(prediction)
        
        recommended = prediction[:10]
        print("\nFor the User-based ({0} - {1}) approach, the following books are recommended: \n".format(global_algorithm, global_metric))
        for i in range(len(recommended)):
            print("{0}. Rated {1}: {2}".format(i+1, prediction.values[i], books['Book-Title'][recommended.index[i]]))
                

In [20]:
print(len(ratings_matrix))
global_algorithm = 'brute'
global_metric = 'cosine'
recommendedItem(254, ratings_matrix)

1560
100.00% Finished
...
880     10
876      9
1656     7
878      7
1665     3
        ..
716     -1
715     -1
714     -1
712     -1
2159    -1
Length: 2160, dtype: int64

For the User-based (brute - cosine) approach, the following books are recommended: 

1. Rated 10: The Thorn Birds (Modern Classics)
2. Rated 9: Ufos, JFK and Elvis: Conspiracies You Don't Have to Be Crazy to Believe
3. Rated 7: Witching Hour (Lives of the Mayfair Witches)
4. Rated 7: About a Boy (Movie Tie-In)
5. Rated 3: Me and My Little Brain
6. Rated 1: Mostly Harmless
7. Rated 1: Hannibal
8. Rated 1: The True and Outstanding Adventures of the Hunt Sisters: A Novel
9. Rated 1: The Swan: A Novel
10. Rated 1: The Museum Guard


In [21]:
global_algorithm = 'kd_tree'
global_metric = 'euclidean'
recommendedItem(254, ratings_matrix)

100.00% Finished
...
880     10
876      9
1656     7
878      7
1665     3
        ..
716     -1
715     -1
714     -1
712     -1
2159    -1
Length: 2160, dtype: int64

For the User-based (kd_tree - euclidean) approach, the following books are recommended: 

1. Rated 10: The Thorn Birds (Modern Classics)
2. Rated 9: Ufos, JFK and Elvis: Conspiracies You Don't Have to Be Crazy to Believe
3. Rated 7: Witching Hour (Lives of the Mayfair Witches)
4. Rated 7: About a Boy (Movie Tie-In)
5. Rated 3: Me and My Little Brain
6. Rated 1: Mostly Harmless
7. Rated 1: Hannibal
8. Rated 1: The True and Outstanding Adventures of the Hunt Sisters: A Novel
9. Rated 1: The Swan: A Novel
10. Rated 1: The Museum Guard
