<h1>Books assignment</h1>

In [100]:
import numpy as np 
import pandas as pd 
import os

<h3>Reading data</h3>

In [101]:
path_ratings="~/ukol/BX-Book-Ratings-nonzero.csv"
df_ratings=pd.read_csv(path_ratings, on_bad_lines="warn", delimiter=";", encoding = "ISO-8859-1")
df_ratings.dropna()
df_ratings['ISBN'] = df_ratings['ISBN'].str.replace('[^\d]', '', regex=True)  # ISBN mess

df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276726,155061224,5
1,276729,52165615,3
2,276729,521795028,6
3,276736,3257224281,8
4,276737,600570967,6


In [102]:
# removing users rating only one book
# we may or may not use this (depends on expected changes in the database)

df_ratings = df_ratings.loc[df_ratings.duplicated(subset='User-ID', keep=False), :]
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276729,52165615,3
2,276729,521795028,6
7,276747,60517794,9
8,276747,671537458,9
9,276747,679776818,8


In [103]:
path_interact="~/ukol/BX-Book-Ratings-zero.csv"
df_interact=pd.read_csv(path_interact, on_bad_lines="warn", delimiter=";", encoding = "ISO-8859-1")
df_interact.dropna()
#df_interact['ISBN'] = df_interact['ISBN'].str.replace('[^\d]', '', regex=True)  # ISBN mess
df_interact.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276727,0446520802,0
2,276733,2080674722,0
3,276746,0425115801,0
4,276746,0449006522,0


In [105]:
# removing users interacting with only one book
# we may or may not use this (depends on expected changes in the database)

df_interact = df_interact.loc[df_interact.duplicated(subset='User-ID', keep=False), :] # remove unique users
df_interact.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
3,276746,0425115801,0
4,276746,0449006522,0
5,276746,0553561618,0
6,276746,055356451X,0
7,276746,0786013990,0


In [106]:
# auxiliary

#df_interact.describe()
#df_interact.shape
#df_interact.info()
#df_interact.dtypes

#df_ratings.describe()
#df_ratings.shape
#df_ratings.info()
#df_ratings.dtypes

In [107]:
path_books="~/ukol/BX-Books.csv"
df_books=pd.read_csv(path_books, on_bad_lines="skip", delimiter=";", encoding = "ISO-8859-1", dtype={"ISBN": "string", "Year-Of-Publication": "string"})
df_books.dropna()
df_books['ISBN'] = df_books['ISBN'].str.replace('[^\d]', '', regex=True)
df_books = df_books.drop_duplicates(subset='ISBN')
df_books_shorter = df_books.drop(['Publisher','Image-URL-S','Image-URL-M','Image-URL-L','Year-Of-Publication'],axis=1)
df_books_shorter.head(5)

Unnamed: 0,ISBN,Book-Title,Book-Author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


<h3>Functions definitions<h3>

In [108]:
# looking up the book's full book
def full_name(isbn):
    book_name = df_books_shorter[df_books_shorter['ISBN']==isbn]['Book-Title']
    book_author = df_books_shorter[df_books_shorter['ISBN']==isbn]['Book-Author']
    if ((book_name.empty) or (book_author.empty)):
        return "", ""
    else:
        return book_name.iloc[0], book_author.iloc[0]

In [109]:
# improvised test for full_name
bn, ba = full_name("0002005018")
# should return Clara Callan, Richard Bruce Wright
#bn, ba = full_name("bleble")
# should return empty strings
if (bn and ba):
    print("You should read {} written by {}.".format(bn, ba))



You should read Clara Callan written by Richard Bruce Wright.


<h4>Building dictionaries</h4>

In [111]:
# building dictionaries of users' interactions with books (rating 0)
def build_dictionary_seen(df_interact):
    users = {}
    books = {}
    for index, row in df_interact.iterrows():
        user = row['User-ID']
        isbn = row['ISBN']
        if (user in users.keys()):
            users[user].append(isbn)
        else:
            users[user] = [isbn]
        if (isbn in books.keys()):
            books[isbn].append(user)
        else:
            books[isbn] = [user]
    return books, users
        

In [112]:
# building dictionaries of high book ratings (not preserving the acual ratings)
# currently used with min_rating = 8
def build_dictionary_ratings_amount(min_rating, df_ratings):
    users = {}
    books = {}
    for index, row in df_ratings.iterrows():
        user = row['User-ID']
        isbn = row['ISBN']
        rating = row['Book-Rating']
        if rating >= min_rating:
            if (user in users.keys()):
                users[user].append(isbn)
            else:
                users[user] = [isbn]
            if (isbn in books.keys()):
                books[isbn].append(user)
            else:
                books[isbn] = [user]
    return books, users
        

In [113]:
# building dictionaries containing ratings with values
# currently not used
def build_dictionary_ratings_values(df_ratings):
    users = {}
    books = {}
    for index, row in df_ratings.iterrows():
        user = row['User-ID']
        isbn = row['ISBN']
        rating = row['Book-Rating']
        
        if (user in users.keys()):
            users[user].append((isbn, rating))
        else:
            users[user] = [(isbn, rating)]
            if (isbn in books.keys()):
                books[isbn].append((user, rating))
            else:
                books[isbn] = [(user, rating)]
    return books, users
            

<h4>Recommendation fuctions</h4>

In [114]:
# used both for interaction and high ratings
def recomm_dict_without_ratings(books, users, isbn):
    videli_taky = {}
    if isbn in books.keys():
        videli_to = books[isbn]
        for user in videli_to:
            if user in users.keys():     # check cause he might have been removed for evaluating
                for book in users[user]:
                    if book != isbn:         # not contain book from the initial question
                        if (book in videli_taky.keys()):
                            videli_taky[book] += 1
                        else: 
                            videli_taky[book] = 1
    else:    
        print("Insufficient data for a reliable recommendation. Your taste is unique.")
                    
    return videli_taky

In [115]:
# used, but not used, for actual ratings
def recomm_with_ratings(books, users, isbn, treshold):
    videli_taky = {}
    if isbn in books.keys():
        videli_to = books[isbn]
        for (user, rating) in videli_to:
            if ((rating >= treshold) and (user in users.keys())):    
                                             # check cause he might have been removed for evaluating
                for (book, rating2) in users[user]:
                    if book != isbn:         # not contain book from the initial question
                        if (book in videli_taky.keys()):
                            videli_taky[book].append(rating2)
                        else: 
                            videli_taky[book] = [rating2]
    else:    
        print("Insufficient data for a reliable recommendation. Your taste is unique.")
                    
    return videli_taky

<h4>Formatting output</h4>

In [116]:
# used both for interaction and high ratings
def written_output(sorted_recom, what_did, max_recommendations, reasonable_amount):
    i = 0
    for item in sorted_recom:
        isbn = item[0]
        pocet_lidi = item[1]
        if ((int(pocet_lidi) >= reasonable_amount) and (i < max_recommendations)):
            #print ("jsem uvnitr")
            bname, bauthor = full_name(isbn)
            if (bname and bauthor):            
                print("You should read {} written by {} ({} {}).".format(bname, bauthor, pocet_lidi, what_did))
                i+=1

<h3>Runtime</h3>

<h4>Building dictionaries</h4>

In [117]:
seen_books, seen_users = build_dictionary_seen(df_interact)

In [118]:
min_rating = 8       # minimum rating taken into account
rated_amount_books, rated_amount_users = build_dictionary_ratings_amount(min_rating, df_ratings)

In [86]:
# currently not used
rated_values_books, rated_values_users = build_dictionary_ratings_values(df_ratings)

<h4>Calling various recommendation functions</h4>

In [120]:
# recommendation based on users who clicked on other books

favourite_isbn = "0345339703"   #LOTR FOTR
#favourite_isbn = "0425115801"   #random
#isbn = "bleble"       # not throwing an error!
recommend = recomm_dict_without_ratings(seen_books, seen_users, favourite_isbn)
sorted_recom = sorted(recommend.items(), key=lambda x:x[1], reverse=True)

max_recom = 10    # or 20, or...
reasonable_amount = 5 # don't accept less people having seen this

written_output(sorted_recom, "people have seen", max_recom, reasonable_amount)

You should read The Two Towers (The Lord of the Rings, Part 2) written by J.R.R. TOLKIEN (40 people have seen).
You should read The Return of the King (The Lord of the Rings, Part 3) written by J.R.R. TOLKIEN (39 people have seen).
You should read Wild Animus written by Rich Shapero (23 people have seen).
You should read A Time to Kill written by JOHN GRISHAM (21 people have seen).
You should read The Pelican Brief written by John Grisham (21 people have seen).
You should read The Hobbit : The Enchanting Prelude to The Lord of the Rings written by J.R.R. TOLKIEN (20 people have seen).
You should read The Chamber written by John Grisham (14 people have seen).
You should read The Joy Luck Club written by Amy Tan (14 people have seen).
You should read Jurassic Park written by Michael Crichton (14 people have seen).
You should read The Lovely Bones: A Novel written by Alice Sebold (14 people have seen).


In [122]:
# recommendation based on amount of users who rated highly

isbn = "0345339703"   #LOTR FOTR
#favourite_isbn = "0425115801"   #random
#isbn = "bleble"       # not throwing an error!
recommend = recomm_dict_without_ratings(rated_amount_books, rated_amount_users, favourite_isbn)
sorted_recom = sorted(recommend.items(), key=lambda x:x[1], reverse=True)

max_recom = 10   # or 20, or...
reasonable_amount = 3 # don't accept less people having rated this

written_output(sorted_recom, "people have highly rated", max_recom, reasonable_amount)

You should read The Return of the King (The Lord of the Rings, Part 3) written by J.R.R. TOLKIEN (34 people have highly rated).
You should read The Two Towers (The Lord of the Rings, Part 2) written by J.R.R. TOLKIEN (33 people have highly rated).
You should read The Hobbit : The Enchanting Prelude to The Lord of the Rings written by J.R.R. TOLKIEN (25 people have highly rated).
You should read Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) written by J. K. Rowling (10 people have highly rated).
You should read The Da Vinci Code written by Dan Brown (7 people have highly rated).
You should read Harry Potter and the Goblet of Fire (Book 4) written by J. K. Rowling (7 people have highly rated).
You should read Harry Potter and the Chamber of Secrets (Book 2) written by J. K. Rowling (7 people have highly rated).
You should read Harry Potter and the Order of the Phoenix (Book 5) written by J. K. Rowling (7 people have highly rated).
You should read Harry Potter and the P

In [90]:
# work in progress on recommendation with ratings
# not straightforward how to sort

#favourite_isbn = "0345339703"   #LOTR FOTR
#favourite_isbn = "0425115801"   #random
#isbn = "bleble"       # not throwing an error!

#treshold = 7            # minimal rating for fellow readers
#recommend = recomm_with_ratings(rated_values_books, rated_values_users, favourite_isbn, treshold)
#print (recommend)
