# Content-Based Filtering
with following Model:
- TF-IDF with Cosine Similarity

### Import

In [1]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from surprise import KNNWithMeans
from surprise.dataset import DatasetAutoFolds
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import accuracy
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('stopwords')
from collections import namedtuple

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leonie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Preparation

In [97]:
path = os.path.expanduser('../data/data_kindle_preprocessed_smaller.xlsx')
data_preprocessed = pd.read_excel(path, index_col=[0], dtype={'publication_year': str})

In [98]:
data_preprocessed['book_info']

367733     Kindle Store, Kindle eBooks, Romance  Visit Am...
853589     Kindle Store, Kindle eBooks, Literature & Fict...
1034265    Kindle Store, Kindle eBooks, Literature & Fict...
450140     Kindle Store, Kindle eBooks, Literature & Fict...
558308     Kindle Store, Kindle eBooks, Literature & Fict...
                                 ...                        
864078     Kindle Store, Kindle eBooks, Literature & Fict...
97789      Kindle Store, Kindle eBooks, Mystery, Thriller...
248284     Kindle Store, Kindle eBooks, Literature & Fict...
619502     Kindle Store, Kindle eBooks, Literature & Fict...
901171     Kindle Store, Kindle eBooks, Literature & Fict...
Name: book_info, Length: 33036, dtype: object

In [99]:
# used in case for content analysis

def preprocess_text(text):
    # lowercasing
    lowercased_text = text.lower()

    # cleaning 
    import re 
    remove_punctuation = re.sub(r'[^\w\s]', '', lowercased_text)
    remove_white_space = remove_punctuation.strip()

    # Tokenization = Breaking down each sentence into an array
    from nltk.tokenize import word_tokenize
    tokenized_text = word_tokenize(remove_white_space)

    # Stop Words/filtering = Removing irrelevant words
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    stopwords_removed = [word for word in tokenized_text if word not in stopwords]

    # Stemming = Transforming words into their base form
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    stemmed_text = [ps.stem(word) for word in stopwords_removed]
    
    return stemmed_text  # Return only the stemmed text


# Apply preprocess_text function to book_info
data_preprocessed['book_info'] = data_preprocessed['book_info'].apply(preprocess_text)

In [100]:
#create subset of data_preprocessing to create new subset of columns
data_contentBased = data_preprocessed[["reviewerID", "asin", "rating", "book_info"]]

In [101]:
data_contentBased['book_info'] = data_contentBased['book_info'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_contentBased['book_info'] = data_contentBased['book_info'].apply(lambda x: ' '.join(x))


In [102]:
data_contentBased

Unnamed: 0,reviewerID,asin,rating,book_info
367733,A1P44JJW47E6QN,B00DVWRDN6,5,kindl store kindl ebook romanc visit amazon sh...
853589,A1BOK1EC36Q0WV,B00RE0O2OO,5,kindl store kindl ebook literatur fiction visi...
1034265,A1ZV5XBKG7RVZU,B013GZOXPS,5,kindl store kindl ebook literatur fiction visi...
450140,A1MXW3BGAZW44D,B00G5IG3RK,4,kindl store kindl ebook literatur fiction visi...
558308,A1VDXZASTPFIAE,B00IL9ZCBY,4,kindl store kindl ebook literatur fiction visi...
...,...,...,...,...
864078,A39YNUAPSSOD82,B00RY2IAMM,4,kindl store kindl ebook literatur fiction visi...
97789,A1AMO4S4I575LQ,B006LWJ75K,4,kindl store kindl ebook mysteri thriller suspe...
248284,A1UVH2I7WHDWLD,B00BBQ2JPQ,5,kindl store kindl ebook literatur fiction kell...
619502,A2B9C3FMYW18UN,B00K31DCD8,3,kindl store kindl ebook literatur fiction visi...


In [103]:
from sklearn.model_selection import train_test_split
# Splitting the data into training and testing sets
X = data_contentBased['book_info']  # Independent variable - contains category_string, brand, paid_free, print_length_category, publication_year, language
y = data_contentBased['asin']  # Dependent variable (product ID)

# split training and testdata. random_state 42 is the same as for the split of the collaborative filtering models, to make sure to use the same training/testsplit
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.25, random_state=42)

# Content Recommender System

### With TF-IDF Verctorizer and Cosine Similarity

In [104]:
def tf_idf_recommender(data_contentBased):
    
    # Define the number of similar ASINs to retrieve for each ASIN
    n_recommendations = 10
    
    # Feature Extraction
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    X_tfidf = tfidf_vectorizer.fit_transform(data_contentBased['book_info'])
    
    # Similarity Calculation
    similarity_matrix = cosine_similarity(X_tfidf)
    
    # Recommendation
    # Create a dictionary to store similar ASINs for each ASIN
    similar_asins_dict = {}
    
    # Iterate through each ASIN
    for i, asin in enumerate(data_contentBased['asin']):
        # Find index of current ASIN
        asin_index = data_contentBased.index[data_contentBased['asin'] == asin].tolist()[0]
        
        # Find top similar ASINs based on similarity scores
        similar_asins_indices = similarity_matrix[asin_index].argsort()[:-n_recommendations-1:-1]  # Top n similar ASINs
        similar_asins = data_contentBased.iloc[similar_asins_indices]['asin'].tolist()
        
        # Store similar ASINs in the dictionary
        similar_asins_dict[asin] = similar_asins
    
    # Print or use similar_asins_dict as needed
    return similar_asins_dict


In [None]:
[Prediction(uid='AI38GSFB90M4L', iid='B00QZABE5O', r_ui=2.0, est=2.875, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='A1OC44Q6V9TT0G', iid='B007XYIGR0', r_ui=5.0, est=4.571428571428571, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='A2Z7HILLWSTWML', iid='B00DMQ8IJY', r_ui=4.0, est=3.6363636363636362, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='A2HZ6MGQ6TPNO0', iid='B0070JOQHM', r_ui=5.0, est=4.165094339622642, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='A2BOKBB1IZA13L', iid='B00LS6MCMY', r_ui=5.0, est=5, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='A3MTU1QAMYEAX3', iid='B00FFLB6MU', r_ui=5.0, est=3.6845090393477484, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='A293W2MU6STRGP', iid='B00C7HG9KY', r_ui=4.0, est=4.966911764705882, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='A3HN8JIDYGZVVH', iid='B017JA1SFO', r_ui=5.0, est=5, details={'actual_k': 0, 'was_impossible': False}),
 Prediction(uid='A1UPRXXTKHCA5', iid='B00M0DQIEC', r_ui=4.0, est=4.057317359642941, details={'actual_k': 1, 'was_impossible': Fa


i have this prediction of user-based recommendation called 'prediction_memory_based '. walk through all predictions. as soon as a prediction has k of 0, then another approach is implemented for these predictions. otherwise, if k >0, prediction can be kept. 




i have another prediction called 'prediction_tf_idf'. this consists of a dict, where books are key and the value are similar books as a list.

{asin:[asin1, asin2...], ...}




In [111]:
prediction_tf_idf = tf_idf_recommender(data_contentBased)

IndexError: index 367733 is out of bounds for axis 0 with size 33036

In [None]:
#prediction_tf_idf

# Hybrid Approach

### User-Based Collaborative Filtering + Content-Based Filtering
Run after these predictions are calculated

In [34]:
# Create a dummy algorithm class that does nothing for the DummyAlgorithm class 
# that initializes the trainset attribute with a dummy value. Here's how you can modify the code:

class DummyAlgorithm(AlgoBase):
    def __init__(self):
        pass

    def fit(self, trainset):
        self.trainset = trainset  # Initialize trainset attribute with a dummy value
        pass

    def estimate(self, u, i):
        return 5  # Return a dummy rating of 5 for all predictions


def hybrid_recommender_system(prediction, data_contentBased, data):
    # Convert prediction to a dictionary with user IDs as keys
    user_item_recs = {}
    for pred in prediction:
        user_id = pred.uid
        item_id = pred.iid
        if user_id not in user_item_recs:
            user_item_recs[user_id] = []
        user_item_recs[user_id].append(item_id)

    # Initialize dictionary for hybrid recommendations
    hybrid_recs = {}

    # For each user, generate recommendations
    for user_id, items in user_item_recs.items():
        # Initialize list to store recommendations for the user
        user_recommendations = items.copy()  # Store original recommendations

        # For each item recommended to the user
        for item in items:
            # Get similar item recommendations from TF-IDF
            similar_items = data_contentBased.get(item)

            # Check if similar_items is not None before iterating
            if similar_items is not None:
                # Extend user_recommendations with similar items (excluding already recommended items)
                user_recommendations.extend([item for item in similar_items if item not in items])

        # Count occurrences of each item ID (ASIN) in user_recommendations
        item_counts = {}
        for item in user_recommendations:
            if item in item_counts:
                item_counts[item] += 1
            else:
                item_counts[item] = 1

        # Sort items by count (descending order) and prioritize original recommendations if counts are equal
        sorted_items = sorted(item_counts.items(), key=lambda x: (-x[1], x[0]))

        # Take the top ten ASINs from sorted_items
        top_ten_asins = [item[0] for item in sorted_items[:10]]

        # Add user recommendations to the hybrid recommendations
        hybrid_recs[user_id] = top_ten_asins

    # Cross-validate the hybrid recommender system
    dummy_algo = DummyAlgorithm()
    cross_val_results = cross_validate(dummy_algo, data, measures=['mae', 'rmse'], cv=5, verbose=True)
    avg_mae = np.mean(cross_val_results['test_mae'])
    avg_rmse = np.mean(cross_val_results['test_rmse'])

    # Calculate MAE and RMSE on the test data
    actual_ratings = []
    predicted_ratings = []
    for user_id, items in hybrid_recs.items():
        for item in items:
            actual_rating = [rating for (uid, iid, rating, _) in data.raw_ratings if uid == user_id and iid == item]
            if actual_rating:  # Check if actual rating exists
                actual_ratings.append(actual_rating[0])
                predicted_ratings.append(5)  # Assuming all predicted ratings are 5 (can be replaced with actual predictions)

    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

    # Print MAE and RMSE
    print("MAE:", mae)
    print("RMSE:", rmse)

    return hybrid_recs, avg_mae, avg_rmse, mae, rmse

# Call the hybrid recommender system function
prediction_hybrid, avg_mae, avg_rmse, mae, rmse = hybrid_recommender_system(prediction_user_based_KNNWithMeans, pred_content_based_recommender_system, data)


Evaluating MAE, RMSE of algorithm DummyAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.6498  0.6231  0.6362  0.6226  0.6094  0.6282  0.0137  
RMSE (testset)    1.1467  1.1137  1.1173  1.1089  1.0936  1.1161  0.0173  
Fit time          0.00    0.00    0.01    0.01    0.01    0.00    0.00    
Test time         0.05    0.06    0.06    0.06    0.05    0.06    0.00    
MAE: 0.6052631578947368
RMSE: 1.038723913473187


# Hybrid Approach

In [105]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
import pandas as pd
import numpy as np

### Create artificially a colds-start Problem

In [106]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
import pandas as pd

# Convert the DataFrame to a Surprise Dataset
reader = Reader(rating_scale=(1, 5))
data_ground_truth = Dataset.load_from_df(data_preprocessed[['reviewerID', 'asin', 'rating']], reader)

# Extract 10% of users and their entries
unique_users = data_preprocessed['reviewerID'].unique()
num_test_users = int(0.1 * len(unique_users))
test_users = np.random.choice(unique_users, num_test_users, replace=False)
test_entries = data_preprocessed[data_preprocessed['reviewerID'].isin(test_users)]

# Remove test users and their entries from the dataset
data_remaining = data_preprocessed[~data_preprocessed['reviewerID'].isin(test_users)]
#print(data_remaining['reviewerID'])

# Convert the DataFrame to a Surprise Dataset
data = Dataset.load_from_df(data_remaining[['reviewerID', 'asin', 'rating']], reader)

# Split the data into train and test sets using Surprise's train_test_split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Convert test_entries to the same format as testset
test_entries_tuples = [tuple(x) for x in test_entries[['reviewerID', 'asin', 'rating']].values]


# Load the additional rows into the Surprise Dataset object
testset+= test_entries_tuples

In [107]:
# Initialize empty sets to collect unique reviewer IDs
unique_users_train = set()
unique_users_test = set()

# Iterate through the trainset ratings and collect unique reviewer IDs
for (user_id, _, _) in trainset.all_ratings():
    unique_users_train.add(user_id)

# Iterate through the testset ratings and collect unique reviewer IDs
for (user_id, _, _) in testset:
    unique_users_test.add(user_id)

# Get the number of unique reviewer IDs in train and test sets
num_unique_users_train = len(unique_users_train)
num_unique_users_test = len(unique_users_test)
test_only_users_count = len(unique_users_test) - len(unique_users_train)
test_only_users = unique_users_test - unique_users_train

print("Number of unique reviewer IDs in train set:", num_unique_users_train)
print("Number of unique reviewer IDs in test set:", num_unique_users_test)
print("Users in test set but not in train set:", test_only_users_count)
print("Top ten users in test set but not in train set:")
for i, user_id in enumerate(test_only_users):
    if i < 10:
        print(user_id)
    else:
        break

Number of unique reviewer IDs in train set: 812
Number of unique reviewer IDs in test set: 902
Users in test set but not in train set: 90
Top ten users in test set but not in train set:
A29A1CPI3AOYSD
A2D9S6V1AFXOAO
ACATGMB8JTGU9
A73IK4VT0XZS6
A1DC00T32U26HC
AYXKCV0BGC0PU
A2W3RQOSC87P4A
A2UNMDJYXPEQZ3
A2MIR34MGFG3EU
A1XNUZ8KUU5UYX


Here we can see, that trainset contains user ids, that are not in the trainset. Therefore a cold-start problem will occur for memory-based approaches

In [108]:
def user_based_KNNWithMeans_recommender_system(trainset, testset, data):
    
    #Choose best algorithm based on grid search
    algo = KNNWithMeans(k=1, sim_options={'name': 'pearson', 'user_based': True})
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)

    # Calculate evaluation metrics
    print("User-based Model with KNNWithMeans: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)

    return test_pred

In [109]:
prediction_memory_based = user_based_KNNWithMeans_recommender_system(trainset,testset, data_ground_truth)

Computing the pearson similarity matrix...
Done computing similarity matrix.
User-based Model with KNNWithMeans: Test Set
RMSE: 0.8764
MAE:  0.6396


In [110]:
# Define the user ID to filter
target_user_id = 'A2XNID09ZCM4CP'

# Filter predictions for the target user ID
user_predictions = [pred for pred in prediction_memory_based if pred.uid == target_user_id]

# Print the filtered predictions
for pred in user_predictions:
    print(pred)


In this output we can see: Cold-start problem leads to poor prediction