In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.sparse.linalg as spla
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
# Junction tables must be filled when new user signed up!

book_df = pd.read_csv('./clubs_book.csv', encoding = "Latin1", sep = ",")
user_df = pd.read_csv('./clubs_user.csv', encoding = "Latin1", sep = ",")
club_df = pd.read_csv('./clubs_club.csv', encoding = "Latin1", sep = ",")
club_book_df = pd.read_csv('./clubs_club_books.csv', encoding = "Latin1", sep = ",")
club_user_df = pd.read_csv('./clubs_club_users.csv', encoding = "Latin1", sep = ",")
user_book_df = pd.read_csv('./clubs_user_books.csv', encoding = 'Latin1', sep = ',')

In [3]:
# Merge club_user junction table with user table to get ages of all users

club_user_age_df = club_user_df.merge(user_df, left_on = 'user_id', right_on = 'id')
club_user_age_df = club_user_age_df[['id_x', 'club_id', 'user_id', 'age']]
club_user_age_df = club_user_age_df.rename(columns={'id_x':'club_user_id'}).sort_values('club_user_id', ascending=True)

club_user_age_df

Unnamed: 0,club_user_id,club_id,user_id,age
0,1,2,207351,115
1,2,2,241265,136
3,3,1,11909,41
6,4,5,31325,143
7,5,4,249795,114
...,...,...,...,...
12,496,10,72722,57
455,497,8,216249,103
294,498,2,144369,75
187,499,2,185348,83


In [4]:
# Merge club_user junction table with club table to get locations of all clubs

club_user_location_df = club_user_df.merge(club_df, left_on = 'club_id', right_on = 'id')
club_user_location_df = club_user_location_df[['id_x', 'club_id', 'user_id', 'location']]
club_user_location_df = club_user_location_df.rename(columns={'id_x':'club_user_id'}).sort_values('club_user_id', ascending=True)

club_user_location_df

Unnamed: 0,club_user_id,club_id,user_id,location
0,1,2,207351,"Tokyo, Japan"
1,2,2,241265,"Tokyo, Japan"
63,3,1,11909,"Adelaide, Australia"
108,4,5,31325,"Sharm El Sheikh, Egypt"
156,5,4,249795,"Luxor, Egypt"
...,...,...,...,...
455,496,10,72722,"London, UK"
360,497,8,216249,"Delhi, India"
60,498,2,144369,"Tokyo, Japan"
61,499,2,185348,"Tokyo, Japan"


In [5]:
# Merge the club_user_age and club_user_location tables to have all in one table

club_user_age_location_df = club_user_age_df.merge(club_user_location_df, left_on = 'club_user_id', right_on = 'club_user_id')
club_user_age_location_df = club_user_age_location_df[['club_user_id', 'club_id_x', 'user_id_x', 'age', 'location']]
club_user_age_location_df = club_user_age_location_df.rename(columns={'user_id_x':'user_id', 'club_id_x':'club_id'})

club_user_age_location_df

Unnamed: 0,club_user_id,club_id,user_id,age,location
0,1,2,207351,115,"Tokyo, Japan"
1,2,2,241265,136,"Tokyo, Japan"
2,3,1,11909,41,"Adelaide, Australia"
3,4,5,31325,143,"Sharm El Sheikh, Egypt"
4,5,4,249795,114,"Luxor, Egypt"
...,...,...,...,...,...
495,496,10,72722,57,"London, UK"
496,497,8,216249,103,"Delhi, India"
497,498,2,144369,75,"Tokyo, Japan"
498,499,2,185348,83,"Tokyo, Japan"


In [6]:
# Get average age of each club

average_club_age_df = pd.merge(club_user_age_df, club_df, left_on='club_id', right_on='id') \
    .groupby(['club_id', 'name'])['age'].mean().reset_index(name = 'age')

average_club_age_df = average_club_age_df.rename(columns={'age':'average_age'})

average_club_age_df

Unnamed: 0,club_id,name,average_age
0,1,SarahMatthews's Club,62.4
1,2,HelenGray's Club,69.857143
2,3,RhysNicholson's Club,66.660377
3,4,KarenQuinn's Club,61.72
4,5,CameronHarvey's Club,80.979167
5,6,ArthurShepherd's Club,65.490909
6,7,FrancesPhillips's Club,77.886364
7,8,SylviaFranklin's Club,87.106383
8,9,GeorgeBryan's Club,75.925
9,10,CarolKing's Club,68.672727


In [7]:
# Add column for age difference and return clubs in ascending order of difference from my age

my_age = 65
average_club_age_df['age_difference'] = pd.DataFrame(abs(average_club_age_df['average_age'] - my_age))
average_club_age_df = average_club_age_df.sort_values('age_difference', ascending=True)
average_club_age_df

Unnamed: 0,club_id,name,average_age,age_difference
5,6,ArthurShepherd's Club,65.490909,0.490909
2,3,RhysNicholson's Club,66.660377,1.660377
0,1,SarahMatthews's Club,62.4,2.6
3,4,KarenQuinn's Club,61.72,3.28
9,10,CarolKing's Club,68.672727,3.672727
1,2,HelenGray's Club,69.857143,4.857143
8,9,GeorgeBryan's Club,75.925,10.925
6,7,FrancesPhillips's Club,77.886364,12.886364
4,5,CameronHarvey's Club,80.979167,15.979167
7,8,SylviaFranklin's Club,87.106383,22.106383


In [8]:
# Return top 10 closest aged club IDs

closest_age_clubs_df = average_club_age_df['club_id'].iloc[0:5]
closest_age_clubs_df

5     6
2     3
0     1
3     4
9    10
Name: club_id, dtype: int64

In [9]:
# Merge closest aged club IDs with clubs CSV to get all details
closest_age_clubs_df = closest_age_clubs_df.reset_index().rename(columns={'club_id':'id'})
closest_age_clubs_df = pd.merge(closest_age_clubs_df, club_df, on = 'id')

closest_age_clubs_df

Unnamed: 0,index,id,name,location,description,avg_reading_speed,owner_id
0,5,6,ArthurShepherd's Club,"Ottawa, Canada",Eius labore dicta quaerat.\r\nAssumenda necess...,316,78
1,2,3,RhysNicholson's Club,"Sao Paulo, Brazil",Commodi a debitis beatae repellat sunt nesciun...,79,68227
2,0,1,SarahMatthews's Club,"Adelaide, Australia",Iusto repellat inventore nihil reiciendis. Dis...,364,271772
3,3,4,KarenQuinn's Club,"Luxor, Egypt",Sed dolorum in totam ad dolore. Magnam cum ea ...,384,209308
4,9,10,CarolKing's Club,"London, UK",Repellat iure sed reprehenderit amet similique...,158,10319


In [10]:
# Return clubs with matching location (exact)

my_location = 'Adelaide, Australia'

location_match = club_df['location'] == my_location
closest_location_clubs_df = club_df[location_match]

closest_location_clubs_df

Unnamed: 0,id,name,location,description,avg_reading_speed,owner_id
0,1,SarahMatthews's Club,"Adelaide, Australia",Iusto repellat inventore nihil reiciendis. Dis...,364,271772


In [11]:
# Return clubs with matching location (using fuzzy search)

my_location = 'AdelAustralia'

def get_ratio(row):
    club_location = row['location']
    return fuzz.token_sort_ratio(club_location, my_location)

closest_club_location_fuzzy_df = club_df[club_df.apply(get_ratio, axis=1) > 35]
closest_club_location_fuzzy_df['location_fuzzy_score'] = club_df.apply(get_ratio, axis=1)
closest_club_location_fuzzy_df = closest_club_location_fuzzy_df.sort_values('location_fuzzy_score', ascending=False)

closest_club_location_fuzzy_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  closest_club_location_fuzzy_df['location_fuzzy_score'] = club_df.apply(get_ratio, axis=1)


Unnamed: 0,id,name,location,description,avg_reading_speed,owner_id,location_fuzzy_score
0,1,SarahMatthews's Club,"Adelaide, Australia",Iusto repellat inventore nihil reiciendis. Dis...,364,271772,84
5,6,ArthurShepherd's Club,"Ottawa, Canada",Eius labore dicta quaerat.\r\nAssumenda necess...,316,78,46
8,9,GeorgeBryan's Club,"Ibadan, Nigeria",Voluptatem tempora saepe itaque nostrum recusa...,343,47409,44
7,8,SylviaFranklin's Club,"Delhi, India",Asperiores debitis tenetur natus asperiores. A...,130,128231,42
2,3,RhysNicholson's Club,"Sao Paulo, Brazil",Commodi a debitis beatae repellat sunt nesciun...,79,68227,41


In [12]:
# Perform a many-to-many merge to get the favourite books of each club

club_favourite_books_df = pd.merge(pd.merge(club_df, club_book_df, left_on='id', right_on='club_id'), 
                    pd.merge(book_df, club_book_df, left_on='id', right_on='book_id'), on='book_id', how = 'inner') \
                        .groupby(['club_id_x', 'name', 'ISBN'])['title', 'author'].agg(list).reset_index()

club_favourite_books_df = club_favourite_books_df.rename(columns={'club_id_x':'club_id'})

club_favourite_books_df

  club_favourite_books_df = pd.merge(pd.merge(club_df, club_book_df, left_on='id', right_on='club_id'),


Unnamed: 0,club_id,name,ISBN,title,author
0,1,SarahMatthews's Club,0140016929,[From London Far],[Michael Innes]
1,1,SarahMatthews's Club,0671685244,[DEAD ON TARGET (HB #1) (Hardy Boys Casefiles ...,[Franklin W. Dixon]
2,1,SarahMatthews's Club,0688026826,[Shadow Magic],[Seymour Simon]
3,1,SarahMatthews's Club,0785809880,[The Pre-Raphaelites (Centuries of Style)],[Inc. Book Sales]
4,1,SarahMatthews's Club,0812565665,[A Cure for Gravity],[Arthur Rosenfeld]
5,2,HelenGray's Club,0023376627,[The Conscious Reader],[Caroline Schrodes]
6,2,HelenGray's Club,0192816640,[The Expedition of Humphry Clinker (The World'...,[Tobias Smollett]
7,2,HelenGray's Club,0198319746,[Twelfth Night (Oxford School Shakespeare Seri...,[Roma Gill]
8,2,HelenGray's Club,0764507508,"[Digital Photography for Dummies, Quick Refere...",[David D. Busch]
9,2,HelenGray's Club,0965064573,[Ferocious Romance: What My Encounters With T...,[Donna Minkowitz]


In [13]:
# Perform a many-to-many merge to get the favourite books of each user

user_favourite_books_df = pd.merge(pd.merge(user_df, user_book_df, left_on='id', right_on='user_id'), 
                    pd.merge(book_df, user_book_df, left_on='id', right_on='book_id'), on='book_id', how = 'inner') \
                       .groupby(['user_id_x', 'first_name','last_name', 'ISBN', 'title', 'author'])['title', 'author'].agg(list).reset_index()

user_favourite_books_df = user_favourite_books_df.rename(columns={'user_id_x':'user_id'})
user_favourite_books_df = user_favourite_books_df.drop(0, 1)
user_favourite_books_df

  user_favourite_books_df = pd.merge(pd.merge(user_df, user_book_df, left_on='id', right_on='user_id'),
  user_favourite_books_df = user_favourite_books_df.drop(0, 1)


Unnamed: 0,user_id,first_name,last_name,ISBN,title,author
0,78,Arthur,Shepherd,0373122675,To Marry McCloud (Bachelor Cousins) (Harlequi...,Carole Mortimer
1,78,Arthur,Shepherd,042512214X,Sisters in Crime 3 (Sisters in Crime),Marilyn Wallace
2,78,Arthur,Shepherd,0451522184,The Pilgrims' Progress,John Bunuan
3,78,Arthur,Shepherd,0732250927,Surfside High: Sandy (Surfside High),Virginia Baxter
4,78,Arthur,Shepherd,3499231093,Traum im Herbst. Und andere StÃ?ÃÂ¼cke.,Jon Fosse
...,...,...,...,...,...,...
2476,276432,Ruth,Sharp,0688026826,Shadow Magic,Seymour Simon
2477,276432,Ruth,Sharp,0821773801,Midnight Sun,Kat Martin
2478,276432,Ruth,Sharp,0865650675,A Redoute Treasury: 468 Watercolours from Les ...,Peter Mallary
2479,276432,Ruth,Sharp,0872261670,Solved!: Famous Mystery Writers on Classic Tru...,Richard Glyn Jones


In [14]:
# Get the favourite books and authors of one user (me)

my_id = 78
my_favourite_books_df = user_favourite_books_df.loc[user_favourite_books_df['user_id'] == my_id]
my_favourite_books_df

Unnamed: 0,user_id,first_name,last_name,ISBN,title,author
0,78,Arthur,Shepherd,0373122675,To Marry McCloud (Bachelor Cousins) (Harlequi...,Carole Mortimer
1,78,Arthur,Shepherd,042512214X,Sisters in Crime 3 (Sisters in Crime),Marilyn Wallace
2,78,Arthur,Shepherd,0451522184,The Pilgrims' Progress,John Bunuan
3,78,Arthur,Shepherd,0732250927,Surfside High: Sandy (Surfside High),Virginia Baxter
4,78,Arthur,Shepherd,3499231093,Traum im Herbst. Und andere StÃ?ÃÂ¼cke.,Jon Fosse


In [15]:
# Get the favourite books and authors of one club

club_id = 8
single_club_favourite_books_df = club_favourite_books_df.loc[club_favourite_books_df['club_id'] == club_id]
single_club_favourite_books_df

Unnamed: 0,club_id,name,ISBN,title,author
35,8,SylviaFranklin's Club,6928145,[The Strange Message in the Parchment (The Nan...,[Carolyn Keene]
36,8,SylviaFranklin's Club,373261977,[Love Bytes],[Sally Chapman]
37,8,SylviaFranklin's Club,373286856,"[Stark Lightning (Harlequin Historical, No. 85)]",[Elaine Rome]
38,8,SylviaFranklin's Club,553289322,[Scandal],[Amanda Quick]
39,8,SylviaFranklin's Club,865650675,[A Redoute Treasury: 468 Watercolours from Les...,"[Peter Mallary, Peter Mallary]"


In [16]:
# TO DO: Currently only checks against one favourite book, make sure it checks against all favourite books

# Return clubs with matching favourite books (using fuzzy search)

my_favourite_book = 'The Strange Message in the Parchment'

def get_ratio(row):
    book_title = row['title']
    return fuzz.token_sort_ratio(book_title, my_favourite_book)

closest_club_books_fuzzy_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 60]
closest_club_books_fuzzy_df['favourite_books_fuzzy_score'] = club_favourite_books_df.apply(get_ratio, axis=1)
closest_club_books_fuzzy_df = closest_club_books_fuzzy_df.sort_values('favourite_books_fuzzy_score', ascending=False)

closest_club_books_fuzzy_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  closest_club_books_fuzzy_df['favourite_books_fuzzy_score'] = club_favourite_books_df.apply(get_ratio, axis=1)


Unnamed: 0,club_id,name,ISBN,title,author,favourite_books_fuzzy_score
35,8,SylviaFranklin's Club,6928145,[The Strange Message in the Parchment (The Nan...,[Carolyn Keene],74


In [17]:
# TO DO: Currently only checks against one favourite author, make sure it checks against all favourite authors

# Return clubs with matching favourite authors (using fuzzy search)

my_favourite_author = 'jeff ryman'

def get_ratio(row):
    book_author = row['author']
    return fuzz.token_sort_ratio(book_author, my_favourite_author)

closest_club_authors_fuzzy_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
closest_club_authors_fuzzy_df['favourite_authors_fuzzy_score'] = club_favourite_books_df.apply(get_ratio, axis=1)
closest_club_authors_fuzzy_df = closest_club_authors_fuzzy_df.sort_values('favourite_authors_fuzzy_score', ascending=False).dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

closest_club_authors_fuzzy_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  closest_club_authors_fuzzy_df['favourite_authors_fuzzy_score'] = club_favourite_books_df.apply(get_ratio, axis=1)


Unnamed: 0,club_id,name,ISBN,title,author,favourite_authors_fuzzy_score
27,6,ArthurShepherd's Club,553263447,[The Warrior Who Carried Life],[Geoff Ryman],86


In [18]:
# TO DO: Implement algorithm which uses age, location, favourite books and favourite authors



<h1> Our work ends here </h1>

In [19]:
club_user_age_location_pivot_matrix_df = club_user_age_location_df.pivot(index='user_id', 
                                                          columns='club_id').fillna(0)

club_user_age_location_pivot_matrix_df.head()

ValueError: Index contains duplicate entries, cannot reshape

In [None]:
club_user_age_location_pivot_matrix_df = club_user_age_location_pivot_matrix_df.values
club_user_age_location_pivot_matrix_df

NameError: name 'club_user_age_location_pivot_matrix_df' is not defined

In [None]:
from scipy.sparse.linalg import svds

NUMBER_OF_FACTORS_MF = 15

#Performs matrix factorization of the original user item matrix
U, sigma, Vt = svds(club_user_age_location_pivot_matrix_df, k = NUMBER_OF_FACTORS_MF)

In [None]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [None]:
all_clubs_books = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_books

array([[ 0.00000000e+00,  3.01145261e-34, -1.44341941e-34, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  3.58780903e-38, -3.19805495e-38, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  5.38921851e-35, -1.80280872e-34, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  1.53681908e-35, -3.57548715e-36, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  3.91903239e-34, -5.66829937e-36, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00, -1.00020459e-35, -1.05741351e-34, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [None]:
def top_cosine_similarity(data, club_id, top_n=10):
    index = club_id 
    club_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(club_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

def similar_clubs(club_user_book, club_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    club_user_book[club_user_book.unique_id_club == club_id]['Book-Title'].values[0]))
    for id in top_indexes + 1:
        print(club_user_book[club_user_book.unique_id_club == id]['Book-Title'].values[0])

In [None]:
k = 50
movie_id =25954  
top_n = 3
sliced = Vt.T[:, :k] # representative data

similar_clubs(club_user_book, 25954, top_cosine_similarity(sliced, movie_id, top_n))

Recommendations for Pulse Points: 

The Witchfinder (Amos Walker Mystery Series)
Alone in a Crowd (Harper Monogram)
Jackie Oh


  """


# Example link:

https://book-recommendation-system-svd.herokuapp.com/

