In [1445]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.sparse.linalg as spla
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
# from clubs.models import User, Club, Club_Users, Club_Books, Book, User_Books

pd.options.mode.chained_assignment = None  # default='warn'

In [1446]:
# def __init__(self, user_id):
#     self.book_df = pd.DataFrame(list(Book.objects.all().values()))
#     self.user_df = pd.DataFrame(list(User.objects.all().values()))
#     self.club_df = pd.DataFrame(list(Club.objects.all().values()))
#     self.club_book_df = pd.DataFrame(list(Club_Books.objects.all().values()))
#     self.club_user_df = pd.DataFrame(list(Club_Users.objects.all().values()))
#     self.user_book_df = pd.DataFrame(list(User_Books.objects.all().values()))
#     self.user_id = user_id

In [1447]:
# Junction tables must be filled when new user signed up!

book_df = pd.read_csv('./clubs_book.csv', encoding = "Latin1", sep = ",")
user_df = pd.read_csv('./clubs_user.csv', encoding = "Latin1", sep = ",")
club_df = pd.read_csv('./clubs_club.csv', encoding = "Latin1", sep = ",")
club_book_df = pd.read_csv('./clubs_club_books.csv', encoding = "Latin1", sep = ",")
club_user_df = pd.read_csv('./clubs_club_users.csv', encoding = "Latin1", sep = ",")
user_book_df = pd.read_csv('./clubs_user_books.csv', encoding = 'Latin1', sep = ',')

In [1448]:
# Merge club_user junction table with user table to get ages of all users

club_user_age_df = club_user_df.merge(user_df, left_on = 'user_id', right_on = 'id')
club_user_age_df = club_user_age_df[['id_x', 'club_id', 'user_id', 'age']]
club_user_age_df = club_user_age_df.rename(columns={'id_x':'club_user_id'}).sort_values('club_user_id', ascending=True)

club_user_age_df

Unnamed: 0,club_user_id,club_id,user_id,age
0,1,2,207351,115
1,2,2,241265,136
3,3,1,11909,41
6,4,5,31325,143
7,5,4,249795,114
...,...,...,...,...
12,496,10,72722,57
455,497,8,216249,103
294,498,2,144369,75
187,499,2,185348,83


In [1449]:
# Merge club_user junction table with club table to get locations of all clubs

club_user_location_df = club_user_df.merge(club_df, left_on = 'club_id', right_on = 'id')
club_user_location_df = club_user_location_df[['id_x', 'club_id', 'user_id', 'location']]
club_user_location_df = club_user_location_df.rename(columns={'id_x':'club_user_id'}).sort_values('club_user_id', ascending=True)

club_user_location_df

Unnamed: 0,club_user_id,club_id,user_id,location
0,1,2,207351,"Tokyo, Japan"
1,2,2,241265,"Tokyo, Japan"
63,3,1,11909,"Adelaide, Australia"
108,4,5,31325,"Sharm El Sheikh, Egypt"
156,5,4,249795,"Luxor, Egypt"
...,...,...,...,...
455,496,10,72722,"London, UK"
360,497,8,216249,"Delhi, India"
60,498,2,144369,"Tokyo, Japan"
61,499,2,185348,"Tokyo, Japan"


In [1450]:
# Merge the club_user_age and club_user_location tables to have all in one table

club_user_age_location_df = club_user_age_df.merge(club_user_location_df, left_on = 'club_user_id', right_on = 'club_user_id')
club_user_age_location_df = club_user_age_location_df[['club_user_id', 'club_id_x', 'user_id_x', 'age', 'location']]

club_user_age_location_df = club_user_age_location_df.rename(columns={'user_id_x':'user_id', 'club_id_x':'club_id'})

club_user_age_location_df

Unnamed: 0,club_user_id,club_id,user_id,age,location
0,1,2,207351,115,"Tokyo, Japan"
1,2,2,241265,136,"Tokyo, Japan"
2,3,1,11909,41,"Adelaide, Australia"
3,4,5,31325,143,"Sharm El Sheikh, Egypt"
4,5,4,249795,114,"Luxor, Egypt"
...,...,...,...,...,...
495,496,10,72722,57,"London, UK"
496,497,8,216249,103,"Delhi, India"
497,498,2,144369,75,"Tokyo, Japan"
498,499,2,185348,83,"Tokyo, Japan"


In [1451]:
# Get location and average age of each club

average_club_age_df = pd.merge(club_user_age_df, club_df, left_on='club_id', right_on='id') \
    .groupby(['club_id', 'name', 'location'])['age'].mean().reset_index(name = 'average_age')

average_club_age_df

Unnamed: 0,club_id,name,location,average_age
0,1,SarahMatthews's Club,"Adelaide, Australia",62.4
1,2,HelenGray's Club,"Tokyo, Japan",69.857143
2,3,RhysNicholson's Club,"Sao Paulo, Brazil",66.660377
3,4,KarenQuinn's Club,"Luxor, Egypt",61.72
4,5,CameronHarvey's Club,"Sharm El Sheikh, Egypt",80.979167
5,6,ArthurShepherd's Club,"Ottawa, Canada",65.490909
6,7,FrancesPhillips's Club,"Belo Horizonte, Brazil",77.886364
7,8,SylviaFranklin's Club,"Delhi, India",87.106383
8,9,GeorgeBryan's Club,"Ibadan, Nigeria",75.925
9,10,CarolKing's Club,"London, UK",68.672727


In [1452]:
# Add column for age difference and return clubs in ascending order of difference from my age

my_age = 65
average_club_age_df['age_difference'] = pd.DataFrame(abs(average_club_age_df['average_age'] - my_age).apply(np.floor))
average_club_age_difference_df = average_club_age_df.sort_values('age_difference', ascending=True)
average_club_age_difference_df

Unnamed: 0,club_id,name,location,average_age,age_difference
5,6,ArthurShepherd's Club,"Ottawa, Canada",65.490909,0.0
2,3,RhysNicholson's Club,"Sao Paulo, Brazil",66.660377,1.0
0,1,SarahMatthews's Club,"Adelaide, Australia",62.4,2.0
3,4,KarenQuinn's Club,"Luxor, Egypt",61.72,3.0
9,10,CarolKing's Club,"London, UK",68.672727,3.0
1,2,HelenGray's Club,"Tokyo, Japan",69.857143,4.0
8,9,GeorgeBryan's Club,"Ibadan, Nigeria",75.925,10.0
6,7,FrancesPhillips's Club,"Belo Horizonte, Brazil",77.886364,12.0
4,5,CameronHarvey's Club,"Sharm El Sheikh, Egypt",80.979167,15.0
7,8,SylviaFranklin's Club,"Delhi, India",87.106383,22.0


In [1453]:
# Return top 10 closest aged club IDs

closest_age_clubs_df = average_club_age_difference_df['club_id'].iloc[0:5]
closest_age_clubs_df

5     6
2     3
0     1
3     4
9    10
Name: club_id, dtype: int64

In [1454]:
# Merge closest aged club IDs with clubs CSV to get all details
closest_age_clubs_df = closest_age_clubs_df.reset_index().rename(columns={'club_id':'id'})
closest_age_clubs_df = pd.merge(closest_age_clubs_df, club_df, on = 'id')

closest_age_clubs_df

Unnamed: 0,index,id,name,location,description,avg_reading_speed,owner_id
0,5,6,ArthurShepherd's Club,"Ottawa, Canada",Eius labore dicta quaerat.\r\nAssumenda necess...,316,78
1,2,3,RhysNicholson's Club,"Sao Paulo, Brazil",Commodi a debitis beatae repellat sunt nesciun...,79,68227
2,0,1,SarahMatthews's Club,"Adelaide, Australia",Iusto repellat inventore nihil reiciendis. Dis...,364,271772
3,3,4,KarenQuinn's Club,"Luxor, Egypt",Sed dolorum in totam ad dolore. Magnam cum ea ...,384,209308
4,9,10,CarolKing's Club,"London, UK",Repellat iure sed reprehenderit amet similique...,158,10319


In [1455]:
# Get user count of each club

club_user_count_df = pd.merge(club_user_age_df, club_df, left_on='club_id', right_on='id') \
    .groupby(['club_id', 'name'])['user_id'].count().reset_index(name = 'user_count')

club_user_count_df

Unnamed: 0,club_id,name,user_count
0,1,SarahMatthews's Club,45
1,2,HelenGray's Club,63
2,3,RhysNicholson's Club,53
3,4,KarenQuinn's Club,50
4,5,CameronHarvey's Club,48
5,6,ArthurShepherd's Club,55
6,7,FrancesPhillips's Club,44
7,8,SylviaFranklin's Club,47
8,9,GeorgeBryan's Club,40
9,10,CarolKing's Club,55


In [1456]:
# Return clubs with matching location (exact)

my_location = 'Adelaide, Australia'

location_match = club_df['location'] == my_location
closest_location_clubs_df = club_df[location_match]

closest_location_clubs_df

Unnamed: 0,id,name,location,description,avg_reading_speed,owner_id
0,1,SarahMatthews's Club,"Adelaide, Australia",Iusto repellat inventore nihil reiciendis. Dis...,364,271772


In [1457]:
# Return all clubs with boolean value for matching location (using fuzzy search)

user_location = 'Adelaide, Australia'

# def get_ratio(row):
#     club_location = row['location']
#     return fuzz.token_sort_ratio(club_location, my_location)

club_locations_df = club_user_location_df
closest_club_location_fuzzy_df = pd.DataFrame()
location_matches_df = pd.DataFrame()
location_matches_df['matching_location'] = club_locations_df['location']
match_values = []


for club_location in club_locations_df['location']:
            match_value = int(fuzz.token_sort_ratio(club_location, user_location))
            if match_value > 90:
                match_values.append(match_value)

matching_locations = pd.DataFrame()
matching_locations['match_score'] = match_values

club_recs = pd.concat([location_matches_df, club_locations_df], axis=1).drop('location', axis=1)

closest_club_location_fuzzy_df = pd.concat([club_recs, matching_locations], axis=1)
closest_club_location_fuzzy_df = closest_club_location_fuzzy_df.sort_values('match_score', ascending=False).dropna(how='any',axis=0)




# closest_club_location_fuzzy_df = club_df[club_df.apply(get_ratio, axis=1) > 80]
# closest_club_location_fuzzy_df['location_fuzzy_score'] = club_df.apply(get_ratio, axis=1)
# closest_club_location_fuzzy_df = closest_club_location_fuzzy_df.sort_values('location_fuzzy_score', ascending=False)
# closest_club_location_fuzzy_df = closest_club_location_fuzzy_df.rename(columns={'id':'club_id'})

closest_club_location_fuzzy_df

   club_id             location  location_match_score  \
0        1  Adelaide, Australia                  88.0   

                                             user_id  
0  [120642, 143396, 246569, 79766, 106813, 25628,...  


In [1458]:
# Perform a many-to-many merge to get the favourite books of each club

club_favourite_books_df = pd.merge(pd.merge(club_df, club_book_df, left_on='id', right_on='club_id'), 
                    pd.merge(book_df, club_book_df, left_on='id', right_on='book_id'), on='book_id', how = 'inner') \
                        .groupby(['club_id_x', 'name', 'ISBN'])['title', 'author'].agg(list).reset_index()

club_favourite_books_df = club_favourite_books_df.rename(columns={'club_id_x':'club_id'})

club_favourite_books_df['title'] = club_favourite_books_df['title'].str[0]
club_favourite_books_df['author'] = club_favourite_books_df['author'].str[0]

club_favourite_books_df

  club_favourite_books_df = pd.merge(pd.merge(club_df, club_book_df, left_on='id', right_on='club_id'),


Unnamed: 0,club_id,name,ISBN,title,author
0,1,SarahMatthews's Club,0140016929,From London Far,Michael Innes
1,1,SarahMatthews's Club,0671685244,DEAD ON TARGET (HB #1) (Hardy Boys Casefiles (...,Franklin W. Dixon
2,1,SarahMatthews's Club,0688026826,Shadow Magic,Seymour Simon
3,1,SarahMatthews's Club,0785809880,The Pre-Raphaelites (Centuries of Style),Inc. Book Sales
4,1,SarahMatthews's Club,0812565665,A Cure for Gravity,Arthur Rosenfeld
5,2,HelenGray's Club,0023376627,The Conscious Reader,Caroline Schrodes
6,2,HelenGray's Club,0192816640,The Expedition of Humphry Clinker (The World's...,Tobias Smollett
7,2,HelenGray's Club,0198319746,Twelfth Night (Oxford School Shakespeare Series),Roma Gill
8,2,HelenGray's Club,0764507508,"Digital Photography for Dummies, Quick Reference",David D. Busch
9,2,HelenGray's Club,0965064573,Ferocious Romance: What My Encounters With Th...,Donna Minkowitz


In [1459]:
# Perform a many-to-many merge to get the favourite books of each user

user_favourite_books_df = pd.merge(pd.merge(user_df, user_book_df, left_on='id', right_on='user_id'), 
                    pd.merge(book_df, user_book_df, left_on='id', right_on='book_id'), on='book_id', how = 'inner') \
                       .groupby(['user_id_x', 'first_name','last_name', 'ISBN', 'title', 'author'])['title', 'author'].agg(list).reset_index()

user_favourite_books_df = user_favourite_books_df.rename(columns={'user_id_x':'user_id'}).drop(0, 1)
user_favourite_books_df

  user_favourite_books_df = pd.merge(pd.merge(user_df, user_book_df, left_on='id', right_on='user_id'),
  user_favourite_books_df = user_favourite_books_df.rename(columns={'user_id_x':'user_id'}).drop(0, 1)


Unnamed: 0,user_id,first_name,last_name,ISBN,title,author
0,78,Arthur,Shepherd,0373122675,To Marry McCloud (Bachelor Cousins) (Harlequi...,Carole Mortimer
1,78,Arthur,Shepherd,042512214X,Sisters in Crime 3 (Sisters in Crime),Marilyn Wallace
2,78,Arthur,Shepherd,0451522184,The Pilgrims' Progress,John Bunuan
3,78,Arthur,Shepherd,0732250927,Surfside High: Sandy (Surfside High),Virginia Baxter
4,78,Arthur,Shepherd,3499231093,Traum im Herbst. Und andere StÃ?ÃÂ¼cke.,Jon Fosse
...,...,...,...,...,...,...
2476,276432,Ruth,Sharp,0688026826,Shadow Magic,Seymour Simon
2477,276432,Ruth,Sharp,0821773801,Midnight Sun,Kat Martin
2478,276432,Ruth,Sharp,0865650675,A Redoute Treasury: 468 Watercolours from Les ...,Peter Mallary
2479,276432,Ruth,Sharp,0872261670,Solved!: Famous Mystery Writers on Classic Tru...,Richard Glyn Jones


In [1460]:
# Get the favourite books and authors of one user (me)

my_id = 142448
my_favourite_books_df = user_favourite_books_df.loc[user_favourite_books_df['user_id'] == my_id]
my_favourite_books_df

Unnamed: 0,user_id,first_name,last_name,ISBN,title,author
1324,142448,Lisa,Anderson,0374205094,The Meaning of Consuelo : A Novel,Judith Ortiz Cofer
1325,142448,Lisa,Anderson,0785809880,The Pre-Raphaelites (Centuries of Style),Inc. Book Sales
1326,142448,Lisa,Anderson,0843944722,Ungrateful Dead,Gary L. Holleman
1327,142448,Lisa,Anderson,087788739X,Running on Empty: Refilling Your Spirit at the...,Jill Briscoe
1328,142448,Lisa,Anderson,3788602724,"Was ist was?, Bd.32, Meereskunde",Rainer Crummenerl


In [1461]:
# Get the favourite books and authors of one club

club_id = 1
single_club_favourite_books_df = club_favourite_books_df.loc[club_favourite_books_df['club_id'] == club_id]
single_club_favourite_books_df

Unnamed: 0,club_id,name,ISBN,title,author
0,1,SarahMatthews's Club,140016929,From London Far,Michael Innes
1,1,SarahMatthews's Club,671685244,DEAD ON TARGET (HB #1) (Hardy Boys Casefiles (...,Franklin W. Dixon
2,1,SarahMatthews's Club,688026826,Shadow Magic,Seymour Simon
3,1,SarahMatthews's Club,785809880,The Pre-Raphaelites (Centuries of Style),Inc. Book Sales
4,1,SarahMatthews's Club,812565665,A Cure for Gravity,Arthur Rosenfeld


In [1462]:
# Return all matching favourite books between a single user and multiple clubs (using fuzzy search)

# def get_ratio(row):
#     book_title = row['title']
#     return fuzz.token_sort_ratio(book_title, my_favourite_book)

# my_favourite_book = my_favourite_books_df.iloc[0]['title']
# book_0_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# book_0_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# my_favourite_book = my_favourite_books_df.iloc[1]['title']
# book_1_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# book_1_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# my_favourite_book = my_favourite_books_df.iloc[2]['title']
# book_2_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# book_2_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# my_favourite_book = my_favourite_books_df.iloc[3]['title']
# book_3_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# book_3_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# my_favourite_book = my_favourite_books_df.iloc[4]['title']
# book_4_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# book_4_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# all_book_matches_df = pd.concat([book_0_matches_df, book_1_matches_df, book_2_matches_df, book_3_matches_df, book_4_matches_df])
# all_book_matches_df = all_book_matches_df.sort_values('book_fuzzy_match_score', ascending=False).dropna(how='any',axis=0)

# all_book_matches_df

In [1463]:
# Return all matching favourite books between a single user and multiple clubs (using fuzzy search)

match_values = []
book_matches_df = pd.DataFrame()
book_matches_df['book_title'] = club_favourite_books_df['title']

for i in range(5):
    my_favourite_book = str(my_favourite_books_df.iloc[i]['title'])
    # print(my_favourite_book)

    for title in club_favourite_books_df['title']:
        match_value = int(fuzz.token_sort_ratio(my_favourite_book, title))
        if match_value > 50:
            match_values.append(match_value)

book_match_values = pd.DataFrame()
book_match_values['match_score'] = match_values 

club_recs = pd.concat([book_matches_df, club_favourite_books_df], axis=1).drop('title', axis=1)


all_book_matches_df = pd.concat([club_recs, book_match_values], axis=1)        
all_book_matches_df = all_book_matches_df.sort_values('match_score', ascending=False).dropna(how='any',axis=0)
all_book_matches_df

KeyError: 0

In [None]:
# Return all matching favourite books between a single user and multiple clubs (using fuzzy search)

# for i in range(5):
#     my_favourite_book = my_favourite_books_df.iloc[i]['title']
#     print(my_favourite_book)

#     for index, row in club_favourite_books_df.iterrows():
#         book_title = row['title']
#         ratio = int(fuzz.token_sort_ratio(book_title, my_favourite_book))
#         ratio_high_enough = ratio > 30

#         if i == 0:
#             filtered_rows = club_favourite_books_df.apply(ratio_high_enough, axis = 1)
#             book_0_matches_df = club_favourite_books_df
#             book_0_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(ratio, axis=1)
#         elif i == 1:
#             filtered_rows = club_favourite_books_df.apply(ratio_high_enough, axis = 1)
#             book_1_matches_df = club_favourite_books_df[filtered_rows]
#             book_1_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(ratio, axis=1)
#         elif i == 2:
#             filtered_rows = club_favourite_books_df.apply(ratio_high_enough, axis = 1)
#             book_2_matches_df = club_favourite_books_df[filtered_rows]
#             book_2_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(ratio, axis=1)
#         elif i == 3:
#             filtered_rows = club_favourite_books_df.apply(ratio_high_enough, axis = 1)
#             book_3_matches_df = club_favourite_books_df[filtered_rows]
#             book_3_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(ratio, axis=1)
#         else:
#             filtered_rows = club_favourite_books_df.apply(ratio_high_enough, axis = 1)
#             book_4_matches_df = club_favourite_books_df[filtered_rows]
#             book_4_matches_df['book_fuzzy_match_score'] = club_favourite_books_df.apply(ratio, axis=1)

# all_book_matches_df = pd.concat([book_0_matches_df, book_1_matches_df, book_2_matches_df, book_3_matches_df, book_4_matches_df])
# all_book_matches_df = all_book_matches_df.sort_values('book_fuzzy_match_score', ascending=False).dropna(how='any',axis=0)

# all_book_matches_df

In [None]:
# Return a list of clubs in order of which have the most matching favourite books with the user

club_average_book_match_df = all_book_matches_df.groupby(['club_id', 'name'])['match_score'] \
    .count().reset_index(name = 'book_match_count') \
    .sort_values('book_match_count', ascending=False) \
    .rename(columns={'name':'club_book_name'})

club_average_book_match_df

Unnamed: 0,club_id,club_book_name,book_match_count
0,1,SarahMatthews's Club,1


In [None]:
# Return all matching favourite authors between a single user and multiple clubs (using fuzzy search)
# This works by checking the authors of all of a club's favourite books agains the authors of all of a user's favourite books

# def get_ratio(row):
#     book_author = row['author']
#     return fuzz.token_sort_ratio(book_author, my_favourite_author)

# my_favourite_author = my_favourite_books_df.iloc[0]['author']
# author_0_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# author_0_matches_df['author_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# my_favourite_author = my_favourite_books_df.iloc[1]['author']
# author_1_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# author_1_matches_df['author_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# my_favourite_author = my_favourite_books_df.iloc[2]['author']
# author_2_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# author_2_matches_df['author_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# my_favourite_author = my_favourite_books_df.iloc[3]['author']
# author_3_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# author_3_matches_df['author_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# my_favourite_author = my_favourite_books_df.iloc[4]['author']
# author_4_matches_df = club_favourite_books_df[club_favourite_books_df.apply(get_ratio, axis=1) > 80]
# author_4_matches_df['author_fuzzy_match_score'] = club_favourite_books_df.apply(get_ratio, axis=1)

# all_author_matches_df = pd.concat([author_0_matches_df, author_1_matches_df, author_2_matches_df, author_3_matches_df, author_4_matches_df])
# all_author_matches_df = all_author_matches_df.sort_values('author_fuzzy_match_score', ascending=False).dropna(how='any',axis=0)



club_favourite_authors_df = club_favourite_books_df
# user_favourite_books = self.get_fav_books_and_authors_per_user()
author_match_df = pd.DataFrame()
author_match_df['book_author'] = club_favourite_authors_df['author']
all_author_matches_df = pd.DataFrame()
match_values = []

for i in range(5):
    my_favourite_author = user_favourite_books_df.iloc[i]['author']

    for author in club_favourite_authors_df['author']:
        match_value = int(fuzz.token_sort_ratio(my_favourite_author, author))
        if match_value > 50:
            match_values.append(match_value)
    
matching_authors = pd.DataFrame()
matching_authors['match_score'] = match_values

club_recs = pd.concat([author_match_df, club_favourite_books_df], axis=1).drop('author', axis=1)

all_author_matches_df = pd.concat([club_recs, matching_authors], axis=1) 

all_author_matches_df = all_author_matches_df.sort_values('match_score', ascending=False).dropna(how='any',axis=0)
all_author_matches_df


Unnamed: 0,book_author,club_id,name,ISBN,title,match_score
3,Inc. Book Sales,1,SarahMatthews's Club,785809880,The Pre-Raphaelites (Centuries of Style),77.0
0,Michael Innes,1,SarahMatthews's Club,140016929,From London Far,56.0
2,Seymour Simon,1,SarahMatthews's Club,688026826,Shadow Magic,56.0
1,Franklin W. Dixon,1,SarahMatthews's Club,671685244,DEAD ON TARGET (HB #1) (Hardy Boys Casefiles (...,54.0


In [None]:
# Return a list of clubs in order of which have the most matching favourite authors with the user

club_average_author_match_df = all_author_matches_df.groupby(['club_id', 'name'])['match_score'] \
    .count().unstack(fill_value=0).stack() \
    .reset_index(name = 'author_match_count') \
    .sort_values('author_match_count', ascending=False) \
    .rename(columns={'name':'club_author_name'}) \
    
club_average_author_match_df

Unnamed: 0,club_id,club_author_name,author_match_count
0,1,SarahMatthews's Club,4


In [None]:
# Get all columns into one dataframe

# Location: average_club_age_difference_df
# Age difference: average_club_age_difference_df
# User count: club_user_count_df
# Favourite books: club_average_book_match_df
# Favourite authors: club_average_author_match_df

best_clubs_df = club_user_count_df.merge(average_club_age_difference_df, how = 'left', left_on = 'club_id', right_on = 'club_id')
best_clubs_df = best_clubs_df.merge(closest_club_location_fuzzy_df, how = 'left', left_on = 'club_id', right_on = 'club_id')
best_clubs_df = best_clubs_df.merge(club_average_book_match_df, how = 'left', left_on = 'club_id', right_on = 'club_id')
best_clubs_df = best_clubs_df.merge(club_average_author_match_df, how = 'left', left_on = 'club_id', right_on = 'club_id')

best_clubs_df = best_clubs_df[['club_id', 'club_author_name', 'match_score', 'age_difference', 'user_count', 'book_match_count', 'author_match_count']]
best_clubs_df = best_clubs_df.rename(columns={'club_author_name':'club_name'})

best_clubs_df['book_match_count'] = best_clubs_df['book_match_count'].fillna(0)
best_clubs_df['author_match_count'] = best_clubs_df['author_match_count'].fillna(0)
best_clubs_df['match_score'] = best_clubs_df['match_score'].fillna(0)

# Dirty way of getting name back...
best_clubs_df = best_clubs_df.merge(average_club_age_difference_df, on = 'club_id')
best_clubs_df = best_clubs_df[['club_id', 'name', 'match_score', 'age_difference_y', 'user_count', 'book_match_count', 'author_match_count']]
best_clubs_df = best_clubs_df.rename(columns={'age_difference_y':'age_difference'}, )

best_clubs_df

Unnamed: 0,club_id,name,match_score,age_difference,user_count,book_match_count,author_match_count
0,1,SarahMatthews's Club,0.0,2.0,45,1.0,4.0
1,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
2,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
3,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
4,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
5,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
6,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
7,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
8,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
9,2,HelenGray's Club,100.0,4.0,63,0.0,0.0


<h3><b>To do</b></h3>
<p>Implement algorithm which recommends 10 clubs based on user count, average age, location,
favourite books and favourite authors</p>
<br>
<b>Weighting if online only checkbox checked:</b>
<ul>
<li>Favourite books 0.3</li>
<li>Favourite authors 0.3</li>
<li>User count: 0.2</li>
<li>Age difference: 0.1</li>
<li>Location 0.1</li>
</ul>
<br>
<b>Weighting if online only checkbox <i>un</i>checked:</b>
<ul>
<li>Location 0.4</li>
<li>Favourite books 0.2</li>
<li>Favourite authors 0.2</li>
<li>User count: 0.1</li>
<li>Age difference: 0.1</li>
</ul>
<br>
<b>Order:</b>
<ul>
<li>Location - Matching(?), alphabetised/ascending</li>
<li>Favourite books - Matching(?), alphabetised/ascending</li>
<li>Favourite authors - Matching(?), alphabetised/ascending</li>
<li>User count - descending</li>
<li>Age difference - ascending</li>
</ul>

In [None]:
# Order if location matters

# TO DO: Find a way to convert location to distance

best_clubs_df = best_clubs_df.sort_values(["match_score", "book_match_count", "author_match_count", "age_difference", "user_count"], \
    ascending = [False, False, False, True, False])
best_clubs_df

Unnamed: 0,club_id,name,match_score,age_difference,user_count,book_match_count,author_match_count
1,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
2,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
3,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
4,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
5,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
6,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
7,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
8,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
9,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
10,2,HelenGray's Club,100.0,4.0,63,0.0,0.0


In [None]:
# Order if online only

best_clubs_df = best_clubs_df.sort_values(["book_match_count", "author_match_count", "age_difference", "user_count"], \
    ascending = [False, False, True, False])
best_clubs_df

Unnamed: 0,club_id,name,match_score,age_difference,user_count,book_match_count,author_match_count
0,1,SarahMatthews's Club,0.0,2.0,45,1.0,4.0
49,6,ArthurShepherd's Club,0.0,0.0,55,0.0,0.0
46,3,RhysNicholson's Club,0.0,1.0,53,0.0,0.0
53,10,CarolKing's Club,0.0,3.0,55,0.0,0.0
47,4,KarenQuinn's Club,0.0,3.0,50,0.0,0.0
1,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
2,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
3,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
4,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
5,2,HelenGray's Club,100.0,4.0,63,0.0,0.0
