### Baseline model for predicting recommended books

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE


### Import dataset, split into train/val/test

In [4]:
# Load the prepared ratings table and preview its first rows.
df = pd.read_csv(filepath_or_buffer='model_data/model_data_wo_eXtern.csv')

df.head()

Unnamed: 0,User-ID,Age,City,Region/State,Country,Age_Missing,ISBN,Book-Rating,Rating_Categ,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,2,18.0,stockton,california,usa,0,195153448,0,Not rated,Classical Mythology,Mark P. O. Morford,2002.0,Oxford University Press
1,7,-1.0,washington,dc,usa,1,34542252,0,Not rated,,,,
2,8,-1.0,timmins,ontario,canada,1,2005018,5,Avarege (4-5),Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada
3,8,-1.0,timmins,ontario,canada,1,60973129,0,Not rated,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial
4,8,-1.0,timmins,ontario,canada,1,374157065,0,Not rated,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux


### Remove not rated

In [5]:
### How many rows fall into each rating category
rating_categ_counts = df['Rating_Categ'].value_counts()
rating_categ_counts

Rating_Categ
Not rated           716109
Excellent (8-10)    249887
Good (6-7)          113381
Avarege (4-5)        59878
Bad (1-3)            10525
Name: count, dtype: int64

In [6]:
### Create a binary target: a rating of 7 or higher becomes 1 (recommend), everything else 0.
### Unrated rows carry Book-Rating 0 and therefore also land in class 0.
RECOMMEND_MIN_RATING = 7  # lowest Book-Rating that counts as a recommendation

df['Reccomnend'] = np.where(df['Book-Rating'] >= RECOMMEND_MIN_RATING, 1, 0)
df['Reccomnend'].value_counts()

Reccomnend
0    823436
1    326344
Name: count, dtype: int64

Split into train/test/val

Select features and targets

In [7]:
# Numeric feature columns used by the baseline. The location columns
# ('City', 'Region/State', 'Country') were part of an earlier feature list
# and are currently left out.
features = [
    "Age",
    "Age_Missing",
    "Year-Of-Publication",
]

# Target columns: binary label for classification, raw rating for regression.
target_class, target_reg = "Reccomnend", "Book-Rating"

### Association rules

In [8]:
# Print (rows, columns) and then display the column labels.
print((len(df), len(df.columns)))
df.columns

(1149780, 14)


Index(['User-ID', 'Age', 'City', 'Region/State', 'Country', 'Age_Missing',
       'ISBN', 'Book-Rating', 'Rating_Categ', 'Book-Title', 'Book-Author',
       'Year-Of-Publication', 'Publisher', 'Reccomnend'],
      dtype='object')

In [9]:
### Unique User-IDs
user_ids = len(df['User-ID'].unique())
print(f"Unique User-IDs: {user_ids}")

### Unique ISBNs
ISBNs = len(df['ISBN'].unique())
print(f"Unique ISBNs: {ISBNs}")

Unique User-IDs: 105283
Unique ISBNs: 340556


In [10]:
### Filter books

In [11]:
# Number of most-rated books to keep; change this single value to resize the subset.
n_books = 1000

# Rank ISBNs by how many rating rows they have and keep the n_books largest.
top_n_books = df.groupby('ISBN').size().sort_values(ascending=False).head(n_books).index

# Filter the DataFrame to include only the top N books
# (.copy() so the subset is an independent frame, not a view of df).
subset_df_books = df[df['ISBN'].isin(top_n_books)].copy()


print(f"Unique ISBNs: {len(subset_df_books['ISBN'].unique())}")


Unique ISBNs: 1000


Filter users

In [12]:
# Step 1: Count the number of ratings per user.
# Counts are taken over the full dataset (df), not only the top-N book subset.
user_counts = df.groupby('User-ID').size()

# Minimum number of ratings a user needs to be kept.
n = 25
# Step 2: Keep users who have at least n ratings
users_with_at_least_n_ratings = user_counts[user_counts >= n].index

# Step 3: Restrict the top-N book subset to those users.
subset_df_book_users = subset_df_books[subset_df_books['User-ID'].isin(users_with_at_least_n_ratings)]

print(f"Unique User-IDs: {len(subset_df_book_users['User-ID'].unique())}")
print(f"Unique ISBNs: {len(subset_df_book_users['ISBN'].unique())}")


Unique User-IDs: 5498
Unique ISBNs: 1000


In [13]:
# Wide user x book table of the 'Reccomnend' flag; pairs with no row become 0.
# NOTE(review): pivot raises if a (User-ID, ISBN) pair occurs more than once.
user_book_matrix = (
    subset_df_book_users
    .pivot(index='User-ID', columns='ISBN', values='Reccomnend')
    .fillna(0)
)

In [14]:
# Peek at the top-left 10 x 10 corner of the matrix.
user_book_matrix.iloc[:10, :10]

ISBN,000649840X,002542730X,0060008032,0060096195,006016848X,0060173289,0060175400,0060188731,006019491X,0060199652
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
487,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Column sums = number of users flagged 'Reccomnend' per ISBN; show the 10 largest.
likes_per_book = user_book_matrix.sum()
likes_per_book.sort_values(ascending=False).head(10)

ISBN
0316666343    281.0
0385504209    228.0
059035342X    165.0
0312195516    159.0
043935806X    139.0
0679781587    138.0
0142001740    136.0
0446672211    131.0
0060928336    126.0
0446310786    122.0
dtype: float64

### Simple recommendation system

In [18]:
def recommend_books(selected_ISBN, df):
    """Finds books liked by users who also liked the selected ISBN."""
    
    # Step 1: Find users who liked the selected book
    users_who_liked = df[(df['ISBN'] == selected_ISBN) & (df['Reccomnend'])]
    users_list = users_who_liked['User-ID'].unique()

    # Step 2: Find other books these users also liked
    other_books = df[(df['User-ID'].isin(users_list)) & (df['Reccomnend']) & (df['ISBN'] != selected_ISBN)]

    # Step 3: Count occurrences of each recommended book
    book_counts = other_books['ISBN'].value_counts().reset_index()
    book_counts.columns = ['Recommended ISBN', 'Count']

    ## Look up book nammes
    book_counts = book_counts.merge(df[['ISBN', 'Book-Title']].drop_duplicates(), how='left', left_on='Recommended ISBN', right_on='ISBN')

    book_counts = book_counts.drop('ISBN', axis=1)	
    return book_counts



In [23]:
# Example: the user picks a book by its ISBN
selected_book = "059035342X"

# Resolve the selected book's title from the full dataset (first matching row)
selected_book_name = df.loc[df['ISBN'] == selected_book, 'Book-Title'].values[0]
print(f"Selected Book: {selected_book_name}")

# Recommendations are computed on the filtered users/books subset
recommendations = recommend_books(selected_book, subset_df_book_users)

print(recommendations.head(100))

Selected Book: Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
   Recommended ISBN  Count                                         Book-Title
0        0439064872     49   Harry Potter and the Chamber of Secrets (Book 2)
1        0439136369     33  Harry Potter and the Prisoner of Azkaban (Book 3)
2        043935806X     32  Harry Potter and the Order of the Phoenix (Boo...
3        0439139597     27       Harry Potter and the Goblet of Fire (Book 4)
4        0439139600     24       Harry Potter and the Goblet of Fire (Book 4)
..              ...    ...                                                ...
95       0451172817      5                                     Needful Things
96       0446525537      5                                 A Walk to Remember
97       0804114986      5                          The Bonesetter's Daughter
98       0451153553      5                                             Misery
99       044651652X      5                      The Bridges of

In [30]:
### Select 5 from the top 10 books and 5 from the other 90
### so the shortlist mixes the strongest hits with less obvious titles.

SAMPLE_SEED = 42       # fixed seed so re-running the notebook gives the same shortlist
PICKS_PER_GROUP = 5    # how many titles to draw from each group

top_10 = recommendations.head(10)
further_90 = recommendations.iloc[10:100]


### Select random books from the top 10.
### The sample size is capped at the group size because DataFrame.sample raises
### ValueError when asked for more rows than exist (without replacement).
top_5 = top_10.sample(
    min(PICKS_PER_GROUP, len(top_10)), random_state=SAMPLE_SEED
)['Book-Title'].values

### Select random books from the other 90 (same cap as above)
further_5 = further_90.sample(
    min(PICKS_PER_GROUP, len(further_90)), random_state=SAMPLE_SEED
)['Book-Title'].values

### Combine the two lists and remove duplicates
final_recommendations = top_5.tolist() + further_5.tolist()
## dict.fromkeys drops repeated titles while keeping the sampled order;
## a set would return the titles in arbitrary order.
final_recommendations = list(dict.fromkeys(final_recommendations))



print(final_recommendations)

['The Da Vinci Code', 'Harry Potter and the Order of the Phoenix (Book 5)', 'One for the Money (Stephanie Plum Novels (Paperback))', 'Harry Potter and the Prisoner of Azkaban (Book 3)', 'A Time to Kill', 'Interview with the Vampire', "Don't Sweat the Small Stuff and It's All Small Stuff : Simple Ways to Keep the Little Things from Taking Over Your Life (Don't Sweat the Small Stuff Series)", 'Outlander', 'Harry Potter and the Chamber of Secrets (Book 2)', 'The Street Lawyer']


(852, 3)