### Baseline model for predicting reccomended book

In [None]:
import pandas as pd
import numpy as np

In [2]:
import sys
sys.path.append('../')

from functions.fun_model import recommend_books, not_recommended_books

### Import dataset, split into train/val/test

In [3]:
df = pd.read_csv('model_data/model_data_wo_eXtern.csv')

df.head(5)

Unnamed: 0,User-ID,Age,City,Region/State,Country,Age_Missing,ISBN,Book-Rating,Rating_Categ,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,2,18.0,stockton,california,usa,0,195153448,0,Not rated,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press
1,7,-1.0,washington,dc,usa,1,34542252,0,Not rated,,,,
2,8,-1.0,timmins,ontario,canada,1,2005018,5,Avarege (4-5),Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
3,8,-1.0,timmins,ontario,canada,1,60973129,0,Not rated,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial
4,8,-1.0,timmins,ontario,canada,1,374157065,0,Not rated,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux


### Remove not rated

In [4]:
### Unique values and counts from rating_categ
df['Rating_Categ'].value_counts()

Rating_Categ
Not rated           716109
Excellent (8-10)    249887
Good (6-7)          113381
Avarege (4-5)        59878
Bad (1-3)            10525
Name: count, dtype: int64

In [None]:
### Create a new binary category for rating (everything above 8 is 1, otherwise 0)
df['Recommend'] = np.where(df['Book-Rating'] >= 7, 1, 0)

df['Not_Recommend'] = np.where((df['Book-Rating'] <= 6) & (df['Book-Rating'] > 0), 1, 0)
df['Recommend'].value_counts()

Recommend
0    823436
1    326344
Name: count, dtype: int64

In [6]:
df['Not_Recommend'].value_counts()

Not_Recommend
0    1042453
1     107327
Name: count, dtype: int64

### Association rules

In [23]:
print(df.shape)
df.columns

(1149780, 15)


Index(['User-ID', 'Age', 'City', 'Region/State', 'Country', 'Age_Missing',
       'ISBN', 'Book-Rating', 'Rating_Categ', 'Book-Title', 'Book-Author',
       'Year-Of-Publication', 'Publisher', 'Recommend', 'Not_Recommend'],
      dtype='object')

In [24]:
### Unique User-IDs
user_ids = len(df['User-ID'].unique())
print(f"Unique User-IDs: {user_ids}")

### Unique ISBNs
ISBNs = len(df['ISBN'].unique())
print(f"Unique ISBNs: {ISBNs}")

Unique User-IDs: 105283
Unique ISBNs: 340556


### Filter boooks

In [26]:
# Get the top N most rated books
n_books = 1000

top_n_books = df.groupby('ISBN').size().sort_values(ascending=False).head(1000).index

# Filter the DataFrame to include only the top N books
subset_df_books = df[df['ISBN'].isin(top_n_books)].copy()


print(f"Unique ISBNs: {len(subset_df_books['ISBN'].unique())}")


Unique ISBNs: 1000


### Filter users

In [27]:
# Step 1: Count the number of ratings per user
user_counts = df.groupby('User-ID').size()

n = 25
# Step 2: Filter users who have at least 50 ratings (or any threshold)
users_with_at_least_n_ratings = user_counts[user_counts >= 25].index

subset_df_book_users = subset_df_books[subset_df_books['User-ID'].isin(users_with_at_least_n_ratings)]

print(f"Unique User-IDs: {len(subset_df_book_users['User-ID'].unique())}")
print(f"Unique ISBNs: {len(subset_df_book_users['ISBN'].unique())}")


Unique User-IDs: 5498
Unique ISBNs: 1000


In [28]:
user_book_matrix = subset_df_book_users.pivot(index='User-ID', columns='ISBN', values='Recommend').fillna(0)

In [29]:
user_book_matrix.iloc[0:10,0:10]

ISBN,000649840X,002542730X,0060008032,0060096195,006016848X,0060173289,0060175400,0060188731,006019491X,0060199652
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
user_book_matrix.sum().sort_values(ascending=False).head(10)

ISBN
0316666343    281.0
0385504209    228.0
059035342X    165.0
0312195516    159.0
043935806X    139.0
0679781587    138.0
0142001740    136.0
0446672211    131.0
0060928336    126.0
0446310786    122.0
dtype: float64

### Simple reccomnendation system

In [None]:
# Example: User selects ISBN "12345"
#selected_book = "059035342X" # Harry Potter and the Sorcerer's Stone

selected_book = '0385504209' # The Da Vinci Code

### Look up book name for selected book
selected_book_name = df[df['ISBN'] == selected_book]['Book-Title'].values[0]
print(f"Selected Book: {selected_book_name}")

recommendations = recommend_books(selected_book, subset_df_book_users)

print(recommendations.head(100))

Selected Book: The Da Vinci Code
                                            Book-Title  Count_recommend
53                                 Angels &amp; Demons               34
648                          The Lovely Bones: A Novel               32
691                            The Secret Life of Bees               25
681                The Red Tent (Bestselling Backlist)               19
328                                         Life of Pi               17
..                                                 ...              ...
592  The Fellowship of the Ring (The Lord of the Ri...                7
719                                  The Tommyknockers                7
233                           From a Buick 8 : A Novel                7
702                                  The Street Lawyer                7
280                           I Know This Much Is True                7

[100 rows x 2 columns]


In [38]:
not_reccomendations = not_recommended_books(selected_book, subset_df_book_users)
print(not_reccomendations.head(10))

                                           Book-Title  Count_not_recommend
28                                   Lucky : A Memoir                    2
0                                           4 Blondes                    1
2                                      ANGELA'S ASHES                    1
3             Along Came a Spider (Alex Cross Novels)                    1
4   Angus, Thongs and Full-Frontal Snogging: Confe...                    1
1   A Is for Alibi (Kinsey Millhone Mysteries (Pap...                    1
5                                      Ashes to Ashes                    1
6              At Home in Mitford (The Mitford Years)                    1
8                                   Breathing Lessons                    1
7                                           Bleachers                    1


In [39]:
### Join the two dataframes by the ISBN
merge_reccomendations = recommendations.merge(not_reccomendations, how='left', left_on='Book-Title', right_on='Book-Title').fillna(0)

## Calculate reccomneded %%!
merge_reccomendations['Recommendation_Percentage'] =100* merge_reccomendations['Count_recommend'] / (merge_reccomendations['Count_recommend'] + merge_reccomendations['Count_not_recommend'])


In [40]:
merge_reccomendations



Unnamed: 0,Book-Title,Count_recommend,Count_not_recommend,Recommendation_Percentage
0,Angels &amp; Demons,34,0.0,100.000000
1,The Lovely Bones: A Novel,32,0.0,100.000000
2,The Secret Life of Bees,25,1.0,96.153846
3,The Red Tent (Bestselling Backlist),19,1.0,95.000000
4,Life of Pi,17,0.0,100.000000
...,...,...,...,...
803,Valhalla Rising (Dirk Pitt Adventures (Paperba...,1,0.0,100.000000
804,Vector,1,0.0,100.000000
805,Visions of Sugar Plums: A Stephanie Plum Holid...,1,0.0,100.000000
806,Waiting to Exhale,1,0.0,100.000000


## Objective function

In [41]:
def rank_books(df, count_col="Count_Recommend", percent_col="Percent_Recommend", top_n=10, w1=0.5, w2=0.5):
    """Ranks books based on Count_Recommend and % Recommendation."""
    
    # Normalize values between 0 and 1
    df[count_col + "_norm"] = (df[count_col] - df[count_col].min()) / (df[count_col].max() - df[count_col].min())
    df[percent_col + "_norm"] = (df[percent_col] - df[percent_col].min()) / (df[percent_col].max() - df[percent_col].min())

    # Compute score
    df["Score"] = w1 * df[count_col + "_norm"] + w2 * df[percent_col + "_norm"]

    # Select top books
    top_books = df.sort_values(by="Score", ascending=False).head(top_n)

    return top_books[["Book-Title", "Score", count_col, percent_col]]

In [44]:
rank_books(merge_reccomendations, count_col="Count_recommend", percent_col="Recommendation_Percentage", top_n=100, w1=0.8, w2=0.2)

Unnamed: 0,Book-Title,Score,Count_recommend,Recommendation_Percentage
0,Angels &amp; Demons,1.000000,34,100.000000
1,The Lovely Bones: A Novel,0.951515,32,100.000000
2,The Secret Life of Bees,0.766434,25,96.153846
3,The Red Tent (Bestselling Backlist),0.616364,19,95.000000
4,Life of Pi,0.587879,17,100.000000
...,...,...,...,...
86,Pay It Forward,0.345455,7,100.000000
72,The Queen of the Damned (Vampire Chronicles (P...,0.345455,7,100.000000
87,Confessions of a Shopaholic (Summer Display Op...,0.345455,7,100.000000
93,The Last Precinct,0.345455,7,100.000000
