### Baseline model for predicting recommended books

In [110]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


### Import dataset, split into train/val/test

In [111]:
# Load the prepared ratings table (one row per user/book pair) from the project-relative CSV.
df = pd.read_csv('model_data/model_data_wo_eXtern.csv')

# Preview the first five rows to check the columns loaded as expected.
df.head(5)

Unnamed: 0,User-ID,Age,City,Region/State,Country,Age_Missing,ISBN,Book-Rating,Rating_Categ,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,2,18.0,stockton,california,usa,0,195153448,0,Not rated,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press
1,7,-1.0,washington,dc,usa,1,34542252,0,Not rated,,,,
2,8,-1.0,timmins,ontario,canada,1,2005018,5,Avarege (4-5),Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
3,8,-1.0,timmins,ontario,canada,1,60973129,0,Not rated,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial
4,8,-1.0,timmins,ontario,canada,1,374157065,0,Not rated,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux


### Remove not rated

In [112]:
### Number of rows per value of the categorical rating label (Rating_Categ)
df['Rating_Categ'].value_counts()

Rating_Categ
Not rated           716109
Excellent (8-10)    249887
Good (6-7)          113381
Avarege (4-5)        59878
Bad (1-3)            10525
Name: count, dtype: int64

In [113]:
### Create two binary flags from the numeric rating:
###   Recommend     = 1 when Book-Rating is 7 or higher, otherwise 0
###   Not_Recommend = 1 when Book-Rating is between 1 and 6, otherwise 0
### Rows with rating 0 ("Not rated") therefore get 0 in both flags.

df['Recommend'] = df['Book-Rating'].ge(7).astype(int)

df['Not_Recommend'] = (df['Book-Rating'].gt(0) & df['Book-Rating'].le(6)).astype(int)
df['Recommend'].value_counts()

Recommend
0    823436
1    326344
Name: count, dtype: int64

In [114]:
# Class balance of the Not_Recommend flag created above.
df['Not_Recommend'].value_counts()

Not_Recommend
0    1042453
1     107327
Name: count, dtype: int64

Split into train/test/val

Select features and targets

In [115]:
# Candidate feature set including the location columns (kept for reference, currently disabled):
#features = ['Age', 'City', 'Region/State', 'Country', 'Age_Missing','Year-Of-Publication']
# Active feature columns.
features = ['Age', 'Age_Missing','Year-Of-Publication']
# Target column names: the binary flag and the raw numeric rating.
# NOTE(review): none of these three names is referenced by the later cells visible here.
target_class = 'Recommend'
target_reg = 'Book-Rating'

### Association rules

In [116]:
# Sanity check: row/column count and column names after adding the two flag columns.
print(df.shape)
df.columns

(1149780, 15)


Index(['User-ID', 'Age', 'City', 'Region/State', 'Country', 'Age_Missing',
       'ISBN', 'Book-Rating', 'Rating_Categ', 'Book-Title', 'Book-Author',
       'Year-Of-Publication', 'Publisher', 'Recommend', 'Not_Recommend'],
      dtype='object')

In [117]:
### Number of distinct users (a missing ID, if any, counts as one value)
user_ids = df['User-ID'].nunique(dropna=False)
print(f"Unique User-IDs: {user_ids}")

### Number of distinct books (a missing ISBN, if any, counts as one value)
ISBNs = df['ISBN'].nunique(dropna=False)
print(f"Unique ISBNs: {ISBNs}")

Unique User-IDs: 105283
Unique ISBNs: 340556


In [118]:
### Filter books

In [119]:
# Get the top N books by number of rows in df. Every row counts,
# including rows whose Book-Rating is 0 ("Not rated").
n_books = 1000

# Use n_books here so that changing the constant above actually changes the cut-off.
top_n_books = df.groupby('ISBN').size().sort_values(ascending=False).head(n_books).index

# Filter the DataFrame to include only the top N books (copy so the subset is independent of df)
subset_df_books = df[df['ISBN'].isin(top_n_books)].copy()


print(f"Unique ISBNs: {len(subset_df_books['ISBN'].unique())}")


Unique ISBNs: 1000


Filter users

In [120]:
# Step 1: Count the number of rows per user. The count is taken over the full
# dataset (df), not only over the top-N book subset.
user_counts = df.groupby('User-ID').size()

# Minimum number of rows a user must have to be kept.
n = 25
# Step 2: Keep users who have at least n rows (use n so the threshold is set in one place)
users_with_at_least_n_ratings = user_counts[user_counts >= n].index

# Step 3: Restrict the top-N book subset to those users
subset_df_book_users = subset_df_books[subset_df_books['User-ID'].isin(users_with_at_least_n_ratings)]

print(f"Unique User-IDs: {len(subset_df_book_users['User-ID'].unique())}")
print(f"Unique ISBNs: {len(subset_df_book_users['ISBN'].unique())}")


Unique User-IDs: 5498
Unique ISBNs: 1000


In [121]:
# Wide user x book matrix of the Recommend flag: one row per User-ID, one column per ISBN.
# User/book pairs that have no row in the subset become 0 after fillna.
# Note: DataFrame.pivot raises ValueError if a (User-ID, ISBN) pair occurs more than once.
user_book_matrix = (
    subset_df_book_users
    .pivot(index='User-ID', columns='ISBN', values='Recommend')
    .fillna(0)
)

In [122]:
# Peek at the top-left 10 x 10 corner of the matrix.
user_book_matrix.iloc[:10, :10]

ISBN,000649840X,002542730X,0060008032,0060096195,006016848X,0060173289,0060175400,0060188731,006019491X,0060199652
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
# Ten ISBNs with the highest column sums, i.e. the most Recommend == 1 flags among the retained users.
user_book_matrix.sum().sort_values(ascending=False).head(10)

ISBN
0316666343    281.0
0385504209    228.0
059035342X    165.0
0312195516    159.0
043935806X    139.0
0679781587    138.0
0142001740    136.0
0446672211    131.0
0060928336    126.0
0446310786    122.0
dtype: float64

### Simple recommendation system

In [None]:
def recommend_books(selected_ISBN, df):
    """Find books liked by the users who also liked the selected ISBN.

    Parameters
    ----------
    selected_ISBN
        ISBN of the book the recommendations are based on.
    df : pandas.DataFrame
        Ratings with the columns 'User-ID', 'ISBN', 'Book-Title' and
        'Recommend' (1 / True marks a liked book).

    Returns
    -------
    pandas.DataFrame
        Columns 'Book-Title' and 'Count_recommend', sorted by count in
        descending order. Counts of different ISBNs that share a title are
        summed. Books without a title are left out, because groupby drops
        missing keys.
    """
    # Compare explicitly instead of using the 0/1 column itself as a mask, so the
    # filter does not depend on how pandas coerces a non-boolean column under `&`.
    liked = df['Recommend'] == 1

    # Step 1: Find users who liked the selected book
    users_list = df.loc[liked & (df['ISBN'] == selected_ISBN), 'User-ID'].unique()

    # Step 2: Find other books these users also liked
    other_books = df[liked & df['User-ID'].isin(users_list) & (df['ISBN'] != selected_ISBN)]

    # Step 3: Count occurrences of each recommended book (by ISBN)
    book_counts = (
        other_books['ISBN']
        .value_counts()
        .rename_axis('ISBN')
        .reset_index(name='Count_recommend')
    )

    # Step 4: Look up book names. Keep exactly one title per ISBN: if an ISBN
    # appears with several title spellings (or with a missing title on some rows),
    # a plain drop_duplicates() on both columns would yield several lookup rows and
    # the merge would repeat that ISBN's count once per spelling.
    titles = (
        df[['ISBN', 'Book-Title']]
        .dropna(subset=['Book-Title'])
        .drop_duplicates(subset='ISBN')
    )
    book_counts = book_counts.merge(titles, how='left', on='ISBN')

    # Step 5: Aggregate by book title (summing counts for books with different ISBNs but the same title)
    book_counts = book_counts.groupby('Book-Title', as_index=False).agg({'Count_recommend': 'sum'})

    # Step 6: Sort by recommendation count
    return book_counts.sort_values(by='Count_recommend', ascending=False)



In [152]:
def not_recommended_books(selected_ISBN, df):
    """Find books disliked by the users who also disliked the selected ISBN.

    Parameters
    ----------
    selected_ISBN
        ISBN of the book the lookup is based on.
    df : pandas.DataFrame
        Ratings with the columns 'User-ID', 'ISBN', 'Book-Title' and
        'Not_Recommend' (1 / True marks a disliked book).

    Returns
    -------
    pandas.DataFrame
        Columns 'Book-Title' and 'Count_not_recommend', sorted by count in
        descending order. Counts of different ISBNs that share a title are
        summed. Books without a title are left out, because groupby drops
        missing keys.
    """
    # Compare explicitly instead of using the 0/1 column itself as a mask, so the
    # filter does not depend on how pandas coerces a non-boolean column under `&`.
    disliked = df['Not_Recommend'] == 1

    # Step 1: Find users who did not recommend the selected book
    users_list = df.loc[disliked & (df['ISBN'] == selected_ISBN), 'User-ID'].unique()

    # Step 2: Find other books these users also did not like
    other_books = df[disliked & df['User-ID'].isin(users_list) & (df['ISBN'] != selected_ISBN)]

    # Step 3: Count occurrences of each not recommended book (by ISBN)
    book_counts = (
        other_books['ISBN']
        .value_counts()
        .rename_axis('ISBN')
        .reset_index(name='Count_not_recommend')
    )

    # Step 4: Merge to get book titles. Keep exactly one title per ISBN: if an ISBN
    # appears with several title spellings (or with a missing title on some rows),
    # a plain drop_duplicates() on both columns would yield several lookup rows and
    # the merge would repeat that ISBN's count once per spelling.
    titles = (
        df[['ISBN', 'Book-Title']]
        .dropna(subset=['Book-Title'])
        .drop_duplicates(subset='ISBN')
    )
    book_counts = book_counts.merge(titles, how='left', on='ISBN')

    # Step 5: Aggregate by book title (summing counts for books with different ISBNs but the same title)
    book_counts = book_counts.groupby('Book-Title', as_index=False).agg({'Count_not_recommend': 'sum'})

    # Step 6: Sort by not recommendation count
    return book_counts.sort_values(by='Count_not_recommend', ascending=False)



In [153]:
# Example: the user picks a book by its ISBN
selected_book = "059035342X"

### Look up the title of the selected book in the full dataset (first matching row)
selected_book_name = df.loc[df['ISBN'] == selected_book, 'Book-Title'].iloc[0]
print(f"Selected Book: {selected_book_name}")

# Recommendations are computed on the filtered subset (top books, active users)
recommendations = recommend_books(selected_book, subset_df_book_users)

print(recommendations.head(100))

Selected Book: Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
                                            Book-Title  Count_recommend
248   Harry Potter and the Chamber of Secrets (Book 2)               69
251  Harry Potter and the Prisoner of Azkaban (Book 3)               55
249       Harry Potter and the Goblet of Fire (Book 4)               51
250  Harry Potter and the Order of the Phoenix (Boo...               32
100                              Bridget Jones's Diary               14
..                                                 ...              ...
53                                         Animal Farm                5
600                            The God of Small Things                5
203  Fast Food Nation: The Dark Side of the All-Ame...                5
390         One for the Money (A Stephanie Plum Novel)                5
113                   Charlotte's Web (Trophy Newbery)                5

[100 rows x 2 columns]


In [154]:
# Same lookup for the dislike side: books disliked by users who also disliked the selected book.
not_reccomendations = not_recommended_books(selected_book, subset_df_book_users)
print(not_reccomendations.head(10))

                                           Book-Title  Count_not_recommend
14   Harry Potter and the Chamber of Secrets (Book 2)                    2
42                               The Cradle Will Fall                    2
0   A Beautiful Mind: The Life of Mathematical Gen...                    1
3                                         Animal Farm                    1
1                                      A Time to Kill                    1
5                                             Beloved                    1
6                                          Boy's Life                    1
7                                    Call of the Wild                    1
2                                  A Walk to Remember                    1
8                                          Disclosure                    1


In [157]:
### Join the two dataframes by book title; titles with no "not recommend" row get a count of 0
merge_reccomendations = (
    recommendations
    .merge(not_reccomendations, how='left', on='Book-Title')
    .fillna(0)
)


## Share of "recommend" counts among all counts for the title, in percent
merge_reccomendations['Recommendation_Percentage'] = (
    merge_reccomendations['Count_recommend']
    .mul(100)
    .div(merge_reccomendations['Count_recommend'] + merge_reccomendations['Count_not_recommend'])
)


In [158]:
# Show the merged table: per title, both counts and the resulting recommendation percentage.
merge_reccomendations



Unnamed: 0,Book-Title,Count_recommend,Count_not_recommend,Recommendation_Percentage
0,Harry Potter and the Chamber of Secrets (Book 2),69,2.0,97.183099
1,Harry Potter and the Prisoner of Azkaban (Book 3),55,0.0,100.000000
2,Harry Potter and the Goblet of Fire (Book 4),51,0.0,100.000000
3,Harry Potter and the Order of the Phoenix (Boo...,32,0.0,100.000000
4,Bridget Jones's Diary,14,0.0,100.000000
...,...,...,...,...
795,Vinegar Hill (Oprah's Book Club (Paperback)),1,0.0,100.000000
796,Violets Are Blue,1,0.0,100.000000
797,Visions of Sugar Plums,1,0.0,100.000000
798,Waiting (Vintage International),1,0.0,100.000000


In [None]:
### Recco

## Objective function

In [160]:
def rank_books(df, count_col="Count_Recommend", percent_col="Percent_Recommend", top_n=10, w1=0.5, w2=0.5):
    """Rank books by a weighted mix of recommendation count and recommendation percentage.

    Both columns are min-max scaled to the range 0..1 and combined as
    ``Score = w1 * count_norm + w2 * percent_norm``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Book-Title', ``count_col`` and ``percent_col``.
        The frame is not modified.
    count_col, percent_col : str
        Names of the count and percentage columns. The defaults are kept for
        compatibility; pass the real column names explicitly when they differ.
    top_n : int
        Number of rows to return.
    w1, w2 : float
        Weights of the scaled count and the scaled percentage.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` highest-scoring rows with the columns
        'Book-Title', 'Score', ``count_col`` and ``percent_col``.
    """
    def _min_max(values):
        # Scale to 0..1. A constant column has max == min, so the plain formula
        # would be 0/0 = NaN and turn every Score into NaN; such a column carries
        # no ranking information, so it contributes 0 instead.
        lowest = values.min()
        span = values.max() - lowest
        shifted = values - lowest
        if span == 0:
            return shifted.astype(float)
        return shifted / span

    # Work on a copy so the helper columns do not leak into the caller's frame.
    ranked = df.copy()

    # Normalize values between 0 and 1
    ranked[count_col + "_norm"] = _min_max(ranked[count_col])
    ranked[percent_col + "_norm"] = _min_max(ranked[percent_col])

    # Compute score
    ranked["Score"] = w1 * ranked[count_col + "_norm"] + w2 * ranked[percent_col + "_norm"]

    # Select top books
    top_books = ranked.sort_values(by="Score", ascending=False).head(top_n)

    return top_books[["Book-Title", "Score", count_col, percent_col]]

In [161]:
# Rank the merged table, weighting the recommendation percentage (0.8) over the raw count (0.2).
rank_books(
    merge_reccomendations,
    count_col="Count_recommend",
    percent_col="Recommendation_Percentage",
    top_n=10,
    w1=0.2,
    w2=0.8,
)

Unnamed: 0,Book-Title,Score,Count_recommend,Recommendation_Percentage
1,Harry Potter and the Prisoner of Azkaban (Book 3),0.958824,55,100.0
0,Harry Potter and the Chamber of Secrets (Book 2),0.95493,69,97.183099
2,Harry Potter and the Goblet of Fire (Book 4),0.947059,51,100.0
3,Harry Potter and the Order of the Phoenix (Boo...,0.891176,32,100.0
4,Bridget Jones's Diary,0.838235,14,100.0
6,The Da Vinci Code,0.835294,13,100.0
9,The Joy Luck Club,0.832353,12,100.0
12,The Secret Life of Bees,0.829412,11,100.0
10,The Red Tent (Bestselling Backlist),0.829412,11,100.0
14,The Nanny Diaries: A Novel,0.829412,11,100.0
