# Content-Based Filtering
with following Model:
- TF-IDF with Cosine Similarity

### Import

In [4]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from surprise import KNNWithMeans
from surprise.dataset import DatasetAutoFolds
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import accuracy
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('stopwords')
from collections import namedtuple

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leonie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Preparation and Preprocessing Text

In [5]:
path = os.path.expanduser('../data/data_kindle_preprocessed_smaller.xlsx')
data_preprocessed = pd.read_excel(path, index_col=[0], dtype={'publication_year': str, 'book_info': str})

In [6]:
data_preprocessed.head()

Unnamed: 0,rating,reviewerID,asin,title,brand,language,print_length_category,publication_year,category_string,paid_free,book_info
527748,4,A1TVJ9WDRP0UZS,B00HY2KN04,Just Jelly Beans and Jealousy (The Reed Brothe...,Visit Amazon's Tammy Falkner Page,English,small,2014,"Kindle Store, Kindle eBooks, Literature & Fiction",Free,"Kindle Store, Kindle eBooks, Literature & Fict..."
12432,5,A3IQ0P3M39IY8U,B004BLJ9IS,The You I&ve Always Dreamed Of (Finding Famil...,Visit Amazon's Alison Kent Page,English,small,2010,"Kindle Store, Kindle eBooks, Literature & Fiction",Paid,"Kindle Store, Kindle eBooks, Literature & Fict..."
438646,5,AIQWMQ4JWKZ3T,B00FQAQQ9I,Wyoming Wildflowers: The Beginning: A Prequel ...,Visit Amazon's Patricia McLinn Page,English,small,2014,"Kindle Store, Kindle eBooks, Literature & Fiction",Free,"Kindle Store, Kindle eBooks, Literature & Fict..."
620082,4,A3CMIEYL0TJLC2,B00K31DCD8,Finding My Prince Charming (The Prince Charmin...,Visit Amazon's J. S. Cooper Page,English,small,2014,"Kindle Store, Kindle eBooks, Literature & Fiction",Paid,"Kindle Store, Kindle eBooks, Literature & Fict..."
590077,5,A1L9WQBSQ5SEFE,B00JD5BO8K,Cowboy Wedding Mix-up - Kindle edition,Visit Amazon's JT Schultz Page,English,small,2014,"Kindle Store, Kindle eBooks, Romance",Paid,"Kindle Store, Kindle eBooks, Romance Visit Am..."


In [7]:
# used in case for content analysis

def preprocess_text(text):
    # lowercasing
    lowercased_text = text.lower()

    # cleaning 
    import re 
    remove_punctuation = re.sub(r'[^\w\s]', '', lowercased_text)
    remove_white_space = remove_punctuation.strip()

    # Tokenization = Breaking down each sentence into an array
    from nltk.tokenize import word_tokenize
    tokenized_text = word_tokenize(remove_white_space)

    # Stop Words/filtering = Removing irrelevant words
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    stopwords_removed = [word for word in tokenized_text if word not in stopwords]

    # Stemming = Transforming words into their base form
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    stemmed_text = [ps.stem(word) for word in stopwords_removed]
    
    return stemmed_text  # Return only the stemmed text


# Ensure the 'book_info' column is treated as strings
data_preprocessed['book_info'] = data_preprocessed['book_info'].astype(str)

# Apply preprocess_text function to book_info
data_preprocessed['book_info'] = data_preprocessed['book_info'].apply(preprocess_text)

print(data_preprocessed['book_info'])

527748     [kindl, store, kindl, ebook, literatur, fictio...
12432      [kindl, store, kindl, ebook, literatur, fictio...
438646     [kindl, store, kindl, ebook, literatur, fictio...
620082     [kindl, store, kindl, ebook, literatur, fictio...
590077     [kindl, store, kindl, ebook, romanc, visit, am...
                                 ...                        
215730     [kindl, store, kindl, ebook, literatur, fictio...
1133251    [kindl, store, kindl, ebook, literatur, fictio...
1092850    [kindl, store, kindl, ebook, romanc, visit, am...
283061     [kindl, store, kindl, ebook, romanc, visit, am...
1171136    [kindl, store, kindl, ebook, literatur, fictio...
Name: book_info, Length: 11639, dtype: object


In [8]:
#create subset of data_preprocessing to create new subset of columns
data_contentBased = data_preprocessed[["asin", "book_info"]]

In [9]:
data_contentBased['book_info'] = data_contentBased['book_info'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_contentBased['book_info'] = data_contentBased['book_info'].apply(lambda x: ' '.join(x))


In [10]:
data_contentBased

Unnamed: 0,asin,book_info
527748,B00HY2KN04,kindl store kindl ebook literatur fiction visi...
12432,B004BLJ9IS,kindl store kindl ebook literatur fiction visi...
438646,B00FQAQQ9I,kindl store kindl ebook literatur fiction visi...
620082,B00K31DCD8,kindl store kindl ebook literatur fiction visi...
590077,B00JD5BO8K,kindl store kindl ebook romanc visit amazon jt...
...,...,...
215730,B00AMOOE0Q,kindl store kindl ebook literatur fiction visi...
1133251,B0196TRK8K,kindl store kindl ebook literatur fiction visi...
1092850,B0173QJGLW,kindl store kindl ebook romanc visit amazon ta...
283061,B00C16YL10,kindl store kindl ebook romanc visit amazon wi...


# Content Recommender System

### With TF-IDF Verctorizer and Cosine Similarity

In [11]:
# Function to get top 10 similar products for each product in the dataframe
def tf_idf_recommender(data_preprocessed):
    n_recommendations=10
    
    # Compute TF-IDF matrix
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(data_preprocessed['book_info'])
    
    # Calculate cosine similarity matrix
    cosine_sim_matrix = cosine_similarity(tfidf_matrix)
    
    # Create a DataFrame for cosine similarity matrix for better readability
    cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=data_preprocessed.index, columns=data_preprocessed.index)

    # Recommendation
    # Create a dictionary to store similar ASINs for each ASIN
    similar_asins_dict = {}
    
    # Iterate through each ASIN
    for i, asin in enumerate(data_preprocessed['asin']):
        # Find index of current ASIN
        asin_index = data_preprocessed.index[i]
        
        # Find top similar ASINs based on similarity scores
        similar_asins_indices = cosine_sim_df[asin_index].argsort()[:-n_recommendations-1:-1]  # Top n similar ASINs
        similar_asins = data_preprocessed.iloc[similar_asins_indices]['asin'].tolist()
        
        # Store similar ASINs in the dictionary
        similar_asins_dict[asin] = similar_asins

    # Create a new dictionary for filtered recommendations
    filtered_recommendations = {}
    
    # Iterate over the original recommendations
    for key, value in similar_asins_dict.items():
        # Filter out values that are similar to the key
        filtered_values = [v for v in value if v != key]
        # Add to the filtered recommendations if there are any remaining values
        if filtered_values:
            filtered_recommendations[key] = filtered_values

    # Iterate over the dictionary and remove duplicate values
    for key, value in filtered_recommendations.items():
        filtered_recommendations[key] = list(set(value))
    
    # Print or use similar_asins_dict as needed
    return filtered_recommendations

In [12]:
tf_idf_recommendations = tf_idf_recommender(data_contentBased)

In [13]:
tf_idf_recommendations

{'B00HY2KN04': ['B00JORD84K', 'B00HVRDDQ8'],
 'B004BLJ9IS': ['B00MJOVIDI', 'B00DMR0NTQ'],
 'B00FQAQQ9I': ['B00457VKIA'],
 'B00K31DCD8': ['B00HNWEA9K', 'B00N8G9PFO'],
 'B00JD5BO8K': ['B00IHQAYYQ', 'B019UH78JY', 'B00HY0GTYK'],
 'B005OY0JFI': ['B00DD88VJI', 'B006S3FAR6'],
 'B00CDZU7SU': ['B00HE4Z51Y',
  'B00BL6CDCA',
  'B0090W7ZB6',
  'B00C2L7N4G',
  'B00P32ZUAK',
  'B0069FJE4S',
  'B005LVV6DI'],
 'B01CIWXAQI': ['B01HFUF1GK'],
 'B00UKKMKD0': ['B00V8CETHO'],
 'B00IU087O8': ['B00FVDMDOC', 'B00EJXIO7U'],
 'B014HGM9YM': ['B00UDQTS40'],
 'B00ZDHZKKA': ['B01DTBHY46', 'B01FP256UQ'],
 'B00BR3AT9G': ['B00L9Q1Y7C'],
 'B01FEI4OAE': ['B01EYW7SAE', 'B00H0NX3KE', 'B01FSDI7H6', 'B004WSQNBG'],
 'B00N05R8GG': ['B005S1XTEA', 'B00H2ZK6FU'],
 'B00QOWH556': ['B00J9YWGTQ', 'B00R1O4KWW'],
 'B00JNELJPY': ['B00JNK590Y', 'B00MZI379Q', 'B00KBN0F3S'],
 'B00E4T0P3U': ['B004W0C520', 'B01EJ2BV0W'],
 'B00KNGAX0I': ['B00O3GPDKO', 'B00MQDVWL0', 'B00HGI16I4'],
 'B00DFIE9NI': ['B00CNQ7BG0', 'B00QE1J5HI', 'B00IWTSKBW', 'B00B

In [14]:
tf_idf_recommendations.get('B00CDZU7SU')

['B00HE4Z51Y',
 'B00BL6CDCA',
 'B0090W7ZB6',
 'B00C2L7N4G',
 'B00P32ZUAK',
 'B0069FJE4S',
 'B005LVV6DI']

In [15]:
# Create a dictionary to map ASINs to titles
asin_to_title = dict(zip(data_preprocessed['asin'], data_preprocessed['title']))

recommendations_with_titles = {}

# Iterate over the dictionary with ASIN values and replace both keys and values with titles
for key, value in tf_idf_recommendations.items():
    key_title = asin_to_title[key]  # Get the title corresponding to the key (ASIN)
    value_titles = [asin_to_title[asin] for asin in value]  # Get the titles corresponding to the values (ASINs)
    recommendations_with_titles[key_title] = value_titles


In [16]:
print("Top Recommendations based on the book 'Ward of the Vampire (Ward of the Vampire Serial Book 1) - Kindle edition':\n")

for i in recommendations_with_titles.get('Ward of the Vampire (Ward of the Vampire Serial Book 1) - Kindle edition'):
    print(i)

Top Recommendations based on the book 'Ward of the Vampire (Ward of the Vampire Serial Book 1) - Kindle edition':

A Twitch of Tail (The Wiccan-Were-Bear Series Book 6) - Kindle edition
Loving Lachlyn (Ashland Pride Two) - Kindle edition
Seducing Samantha (Ashland Pride One) - Kindle edition
Every Dawn Forever (Hyena Heat Two) - Kindle edition


# Hybrid Approach

In [17]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np

### Create artificially a colds-start Problem
The cold start problem in recommender systems refers to the challenge of providing accurate recommendations for new users or items with limited interaction history.

In [18]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
import pandas as pd

# Convert the DataFrame to a Surprise Dataset
reader = Reader(rating_scale=(1, 5))
data_ground_truth = Dataset.load_from_df(data_preprocessed[['reviewerID', 'asin', 'rating']], reader)

# Extract 10% of users and their entries
unique_users = data_preprocessed['reviewerID'].unique()
num_test_users = int(0.1 * len(unique_users))
test_users = np.random.choice(unique_users, num_test_users, replace=False)
test_entries = data_preprocessed[data_preprocessed['reviewerID'].isin(test_users)]

# Remove test users and their entries from the dataset
data_remaining = data_preprocessed[~data_preprocessed['reviewerID'].isin(test_users)]
#print(data_remaining['reviewerID'])

# Convert the DataFrame to a Surprise Dataset
data = Dataset.load_from_df(data_remaining[['reviewerID', 'asin', 'rating']], reader)

# Split the data into train and test sets using Surprise's train_test_split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Convert test_entries to the same format as testset
test_entries_tuples = [tuple(x) for x in test_entries[['reviewerID', 'asin', 'rating']].values]


# Load the additional rows into the Surprise Dataset object
testset+= test_entries_tuples

In [19]:
# Initialize empty sets to collect unique reviewer IDs
unique_users_train = set()
unique_users_test = set()

# Iterate through the trainset ratings and collect unique reviewer IDs
for (user_id, _, _) in trainset.all_ratings():
    unique_users_train.add(user_id)

# Iterate through the testset ratings and collect unique reviewer IDs
for (user_id, _, _) in testset:
    unique_users_test.add(user_id)

# Get the number of unique reviewer IDs in train and test sets
num_unique_users_train = len(unique_users_train)
num_unique_users_test = len(unique_users_test)
test_only_users_count = len(unique_users_test) - len(unique_users_train)
test_only_users = unique_users_test - unique_users_train

print("Number of unique reviewer IDs in train set:", num_unique_users_train)
print("Number of unique reviewer IDs in test set:", num_unique_users_test)
print("Users in test set but not in train set:", test_only_users_count)
print("Top ten users in test set but not in train set:")
for i, user_id in enumerate(test_only_users):
    if i < 20:
        print(user_id)
    else:
        break

Number of unique reviewer IDs in train set: 314
Number of unique reviewer IDs in test set: 348
Users in test set but not in train set: 34
Top ten users in test set but not in train set:
A1VF3B672MSQ4C
A2W3RQOSC87P4A
AR6WIPHVS5G3I
A2C7W167DVNUKH
A39N24TTLP6I48
A376U5S8TKOE69
A2JPS7EMNFKXOV
AIQWMQ4JWKZ3T
A3O7EX5CU264Y1
A3QVW8NFY4C4I1
A13LERNQ8R7267
A3A7FF87LEVCQ1
A36PA4XPATJSKX
A13JQT6036JF
A1SBQ0F7FSLWB0
A1JLU5H1CCENWX
A2UNMDJYXPEQZ3
A1I96OYAUJ3HQE
A916DXE9W36GF
AV4HVQ5WUQ1Z1


Here we can see, that trainset contains user ids, that are not in the trainset. Therefore a cold-start problem will occur for memory-based approaches

## Hybrid Model
This model consists of a Collaborative Filtering Model of choice, in this case, we used the User-Based Filtering with KNNWithMeans. The other recommendation are the top most popular books based on rating and number of reviews. The hybrid apporach aims to solve cold-start problem, meaning that when there is no data known about a user, the most popular books are going to be recommended. 

In [20]:
def user_based_KNNWithMeans_recommender_system(trainset, testset, data):
    
    #Choose best algorithm based on grid search
    algo = KNNWithMeans(k=1, sim_options={'name': 'pearson', 'user_based': False})
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)

    # Calculate evaluation metrics
    print("User-based Model with KNNWithMeans: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)

    return test_pred

In [21]:
prediction_memory_based = user_based_KNNWithMeans_recommender_system(trainset,testset, data_ground_truth)

Computing the pearson similarity matrix...
Done computing similarity matrix.
User-based Model with KNNWithMeans: Test Set
RMSE: 0.9608
MAE:  0.7442


In [32]:
# Define the user ID to filter
target_user_id = 'A3QVW8NFY4C4I1'

# Filter predictions for the target user ID
user_predictions = [pred for pred in prediction_memory_based if pred.uid == target_user_id]

# Print the filtered predictions
for pred in user_predictions:
    print(pred)


user: A3QVW8NFY4C4I1 item: B00K6S71RG r_ui = 5.00   est = 4.27   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: A3QVW8NFY4C4I1 item: B00BBQ2JPQ r_ui = 5.00   est = 4.27   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: A3QVW8NFY4C4I1 item: B00BPB9L3U r_ui = 5.00   est = 4.27   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: A3QVW8NFY4C4I1 item: B00DE50LTS r_ui = 5.00   est = 4.27   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: A3QVW8NFY4C4I1 item: B00KBH7JDI r_ui = 5.00   est = 4.27   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: A3QVW8NFY4C4I1 item: B00IMNWM3K r_ui = 5.00   est = 4.27   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: A3QVW8NFY4C4I1 item: B00K40M50E r_ui = 5.00   est = 4.27   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: A3QVW8NFY4C4I1 item: B004EYT0UO r_ui = 4.00   est = 4.27   {'was_impos

In this output we can see: Cold-start problem leads to poor prediction

In [33]:
def top_popular_books(data_preprocessed, n=5):
    # Group by 'asin' and calculate the average rating and count of ratings for each book
    popular_books = data_preprocessed.groupby('asin').agg({'rating': ['mean', 'count']})
    popular_books.columns = ['avg_rating', 'rating_count']
    
    # Sort the books based on rating count and average rating
    popular_books = popular_books.sort_values(by=['rating_count', 'avg_rating'], ascending=False)
    
    # Get the top n popular books
    top_books = popular_books.head(n)
    
    # Return the top books as a list
    return top_books.index.tolist()

# Example usage:
top_popular_books = top_popular_books(data_preprocessed)

In [42]:
top_popular_books

['B00J0CAR16', 'B00EC197UC', 'B00D9CMLNA', 'B00M0DQIEC', 'B00EAHZR5W']

In [53]:
from surprise import Prediction
from collections import defaultdict

def hybrid_recommendation(prediction_memory_based, top_popular_books):
    # Dictionary to store the count of deleted predictions for each user
    deleted_predictions_count = defaultdict(int)
    
    # New list to store valid predictions
    valid_predictions = []
    
    # Loop through all predictions
    for prediction in prediction_memory_based:
        # Check if the prediction was impossible
        if prediction.details['was_impossible']:
            # Increment the count of deleted predictions for this user
            deleted_predictions_count[prediction.uid] += 1
        else:
            # Add valid predictions to the new list
            valid_predictions.append(prediction)
    
    # Loop through deleted predictions count for each user
    for user_id, count in deleted_predictions_count.items():
        # Add new predictions based on top popular books for this user
        for i in range(min(count, 5)):
            if i < len(top_popular_books):
                new_prediction = Prediction(uid=user_id, iid=top_popular_books[i], r_ui=None, est=5, details={'actual_k': 0, 'was_impossible': False})
                valid_predictions.append(new_prediction)
    
    return valid_predictions


In [54]:
predictions = hybrid_recommendation(prediction_memory_based, top_popular_books)

In [56]:
# Define the user ID to filter
#target_user_id = 'A1JLU5H1CCENWX' using the same target id

# Filter predictions for the target user ID
user_predictions_hybrid = [pred for pred in predictions if pred.uid == target_user_id]

# Print the filtered predictions
for pred in user_predictions_hybrid:
    print(pred)

user: A3QVW8NFY4C4I1 item: B00J0CAR16 r_ui = None   est = 5.00   {'actual_k': 0, 'was_impossible': False}
user: A3QVW8NFY4C4I1 item: B00EC197UC r_ui = None   est = 5.00   {'actual_k': 0, 'was_impossible': False}
user: A3QVW8NFY4C4I1 item: B00D9CMLNA r_ui = None   est = 5.00   {'actual_k': 0, 'was_impossible': False}
user: A3QVW8NFY4C4I1 item: B00M0DQIEC r_ui = None   est = 5.00   {'actual_k': 0, 'was_impossible': False}
user: A3QVW8NFY4C4I1 item: B00EAHZR5W r_ui = None   est = 5.00   {'actual_k': 0, 'was_impossible': False}
user: A3QVW8NFY4C4I1 item: B00J0CAR16 r_ui = None   est = 5.00   {'actual_k': 0, 'was_impossible': False}
user: A3QVW8NFY4C4I1 item: B00EC197UC r_ui = None   est = 5.00   {'actual_k': 0, 'was_impossible': False}
user: A3QVW8NFY4C4I1 item: B00D9CMLNA r_ui = None   est = 5.00   {'actual_k': 0, 'was_impossible': False}
user: A3QVW8NFY4C4I1 item: B00M0DQIEC r_ui = None   est = 5.00   {'actual_k': 0, 'was_impossible': False}
user: A3QVW8NFY4C4I1 item: B00EAHZR5W r_ui = N

In [57]:
import json
def serialize_predictions(predictions, filename):
    """
    Serialize a list of Prediction objects to JSON and save it to a file.

    Args:
    - predictions (list): List of Prediction objects.
    - filename (str): Name of the file to save the serialized predictions.

    Returns:
    - None
    """
    serialized_predictions = []
    for pred in predictions:
        serialized_prediction = {
            "uid": pred.uid,
            "iid": pred.iid,
            "r_ui": pred.r_ui,
            "est": pred.est,
            "details": pred.details
        }
        serialized_predictions.append(serialized_prediction)

    with open(filename, 'w') as json_file:
        json.dump(serialized_predictions, json_file)

In [58]:
serialize_predictions(predictions, '../data/hybrid_recommender_system.json')