# Content-Based Filtering
with following Model:
- TF-IDF with Cosine Similarity

### Import

In [29]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from surprise import KNNWithMeans
from surprise.dataset import DatasetAutoFolds
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import accuracy
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('stopwords')
from collections import namedtuple

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leonie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data Preparation

In [2]:
path = os.path.expanduser('../data/data_kindle_preprocessed.xlsx')
data_preprocessed = pd.read_excel(path, index_col=[0], dtype={'publication_year': str})

In [3]:
data_preprocessed['book_info']

0        Kindle Store, Kindle eBooks, Literature & Fict...
1        Kindle Store, Kindle eBooks, History  King of ...
2        Kindle Store, Kindle eBooks, Romance  Leanne B...
3        Kindle Store, Kindle eBooks, Romance  Leanne B...
4        Kindle Store, Kindle eBooks, Romance  Visit Am...
                               ...                        
19563    Kindle Store, Kindle eBooks, Literature & Fict...
19564    Kindle Store, Kindle eBooks, Literature & Fict...
19565    Kindle Store, Kindle eBooks, Science Fiction &...
19566    Kindle Store, Kindle eBooks, Literature & Fict...
19567    Kindle Store, Kindle eBooks, Teen & Young Adul...
Name: book_info, Length: 19568, dtype: object

In [6]:
# used in case for content analysis

def preprocess_text(text):
    # lowercasing
    lowercased_text = text.lower()

    # cleaning 
    import re 
    remove_punctuation = re.sub(r'[^\w\s]', '', lowercased_text)
    remove_white_space = remove_punctuation.strip()

    # Tokenization = Breaking down each sentence into an array
    from nltk.tokenize import word_tokenize
    tokenized_text = word_tokenize(remove_white_space)

    # Stop Words/filtering = Removing irrelevant words
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    stopwords_removed = [word for word in tokenized_text if word not in stopwords]

    # Stemming = Transforming words into their base form
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    stemmed_text = [ps.stem(word) for word in stopwords_removed]
    
    return stemmed_text  # Return only the stemmed text


# Apply preprocess_text function to book_info
data_preprocessed['book_info'] = data_preprocessed['book_info'].apply(preprocess_text)

In [14]:
#create subset of data_preprocessing to create new subset of columns
data_contentBased = data_preprocessed[["reviewerID", "asin", "rating", "book_info"]]

In [16]:
data_contentBased['book_info'] = data_contentBased['book_info'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_contentBased['book_info'] = data_contentBased['book_info'].apply(lambda x: ' '.join(x))


In [17]:
data_contentBased

Unnamed: 0,reviewerID,asin,rating,book_info
0,A3OC8ZG1S3OAVA,B0015Z7VFQ,1.0,kindl store kindl ebook literatur fiction visi...
1,A2U8YWPP1PYHJM,B0017HNV1U,4.0,kindl store kindl ebook histori king babylon h...
2,A3361XGKYF17S3,B001892EI8,3.0,kindl store kindl ebook romanc leann bank paid...
3,AVGYENZU56KBR,B001892EI8,4.0,kindl store kindl ebook romanc leann bank paid...
4,A3361XGKYF17S3,B001892DGG,3.0,kindl store kindl ebook romanc visit amazon da...
...,...,...,...,...
19563,A1EQY74OFGE4NE,B01HIGNUGE,4.0,kindl store kindl ebook literatur fiction visi...
19564,A1EQY74OFGE4NE,B01HINH1WQ,3.0,kindl store kindl ebook literatur fiction visi...
19565,A1SVA69J57MX2A,B01HIOR0S0,5.0,kindl store kindl ebook scienc fiction fantasi...
19566,A3FVMG7SWNF7QR,B01HIULQXY,5.0,kindl store kindl ebook literatur fiction visi...


In [18]:
from sklearn.model_selection import train_test_split
# Splitting the data into training and testing sets
X = data_contentBased['book_info']  # Independent variable - contains category_string, brand, paid_free, print_length_category, publication_year, language
y = data_contentBased['asin']  # Dependent variable (product ID)

# split training and testdata. random_state 42 is the same as for the split of the collaborative filtering models, to make sure to use the same training/testsplit
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.25, random_state=42)

# Content Recommender System

### With TF-IDF Verctorizer and Cosine Similarity

In [25]:
def tf_idf_recommender(data_contentBased):
    
    # Define the number of similar ASINs to retrieve for each ASIN
    n_recommendations = 10
    
    # 1. Feature Extraction
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    X_tfidf = tfidf_vectorizer.fit_transform(data_contentBased['book_info'])
    
    # 2. Similarity Calculation
    similarity_matrix = cosine_similarity(X_tfidf)
    
    # 3. Recommendation
    # Create a dictionary to store similar ASINs for each ASIN
    similar_asins_dict = {}
    
    # Iterate through each ASIN
    for i, asin in enumerate(data_contentBased['asin']):
        # Find index of current ASIN
        asin_index = data_contentBased.index[data_contentBased['asin'] == asin].tolist()[0]
        
        # Find top similar ASINs based on similarity scores
        similar_asins_indices = similarity_matrix[asin_index].argsort()[:-n_recommendations-1:-1]  # Top n similar ASINs
        similar_asins = data_contentBased.iloc[similar_asins_indices]['asin'].tolist()
        
        # Store similar ASINs in the dictionary
        similar_asins_dict[asin] = similar_asins
    
    # Print or use similar_asins_dict as needed
    return similar_asins_dict


In [26]:
prediction_tf_idf = tf_idf_recommender(data_contentBased)

In [30]:
#prediction_tf_idf

In [31]:
# format the hybrid prediction to match the other predictions

# Define the Prediction namedtuple
Prediction = namedtuple('Prediction', ['uid', 'iid', 'r_ui', 'est', 'details'])

# Initialize an empty list to store Prediction objects
prediction_hybrid_formatted = []

# Iterate over each user ID and their corresponding list of item IDs
for user_id, item_ids in prediction_hybrid.items():
    for item_id in item_ids:
        # Create a Prediction object with empty values for r_ui, est, and details
        prediction = Prediction(uid=user_id, iid=item_id, r_ui=None, est=None, details=None)
        # Append the Prediction object to the list
        prediction_hybrid_formatted.append(prediction)

NameError: name 'prediction_hybrid' is not defined

# Other Approaches

## Popularity-based Recommendation

In [22]:
from collections import defaultdict
from sklearn.utils import shuffle

def generate_predictions_sklearn(data_preprocessed):
    """
    Generates predictions using scikit-learn based on book popularity.

    Parameters:
    - data_preprocessed (DataFrame): Preprocessed DataFrame containing reviewer IDs and book IDs.

    Returns:
    - predictions (list of dicts): Predictions for each user with the recommended books.
    """
    # Step 1: Shuffle the data_preprocessed DataFrame to randomize the order of ratings
    data_preprocessed_shuffled = shuffle(data_preprocessed)

    # Step 2: Calculate the popularity of each book
    book_popularity = defaultdict(int)
    for rating in data_preprocessed_shuffled['asin']:
        book_popularity[rating] += 1

    # Step 3: Sort the books based on their popularity
    sorted_books = sorted(book_popularity.items(), key=lambda x: x[1], reverse=True)

    # Step 4: Get recommendations for each reviewer ID
    recommendations = {}
    for reviewer_id in data_preprocessed_shuffled['reviewerID'].unique():
        top_books = [book[0] for book in sorted_books[:10]]
        recommendations[reviewer_id] = top_books

    # Step 5: Create predictions for each user with the recommended books
    predictions = []
    for reviewer_id, recommended_books in recommendations.items():
        for book in recommended_books:
            # Create a prediction object
            prediction = {'uid': reviewer_id, 'iid': book, 'r_ui': None, 'est': 5, 'details': {'was_impossible': False, 'reason': ''}}
            predictions.append(prediction)

    return predictions

# Example usage:

predictions = generate_predictions_sklearn(data_contentBased)
for prediction in predictions:
    print(prediction)


{'uid': 'A1RKD07ST2BLOQ', 'iid': 'B00DOFHHFO', 'r_ui': None, 'est': 5, 'details': {'was_impossible': False, 'reason': ''}}
{'uid': 'A1RKD07ST2BLOQ', 'iid': 'B00629ZTOU', 'r_ui': None, 'est': 5, 'details': {'was_impossible': False, 'reason': ''}}
{'uid': 'A1RKD07ST2BLOQ', 'iid': 'B00JNON0LU', 'r_ui': None, 'est': 5, 'details': {'was_impossible': False, 'reason': ''}}
{'uid': 'A1RKD07ST2BLOQ', 'iid': 'B005C5YZ86', 'r_ui': None, 'est': 5, 'details': {'was_impossible': False, 'reason': ''}}
{'uid': 'A1RKD07ST2BLOQ', 'iid': 'B00EEPZHI2', 'r_ui': None, 'est': 5, 'details': {'was_impossible': False, 'reason': ''}}
{'uid': 'A1RKD07ST2BLOQ', 'iid': 'B00I52PPT6', 'r_ui': None, 'est': 5, 'details': {'was_impossible': False, 'reason': ''}}
{'uid': 'A1RKD07ST2BLOQ', 'iid': 'B00SPA1Z5Q', 'r_ui': None, 'est': 5, 'details': {'was_impossible': False, 'reason': ''}}
{'uid': 'A1RKD07ST2BLOQ', 'iid': 'B00EVSB3N0', 'r_ui': None, 'est': 5, 'details': {'was_impossible': False, 'reason': ''}}
{'uid': 'A1RKD07

# Hybrid Approach

### User-Based Collaborative Filtering + Content-Based Filtering
Run after these predictions are calculated

In [34]:
# Create a dummy algorithm class that does nothing for the DummyAlgorithm class 
# that initializes the trainset attribute with a dummy value. Here's how you can modify the code:

class DummyAlgorithm(AlgoBase):
    def __init__(self):
        pass

    def fit(self, trainset):
        self.trainset = trainset  # Initialize trainset attribute with a dummy value
        pass

    def estimate(self, u, i):
        return 5  # Return a dummy rating of 5 for all predictions


def hybrid_recommender_system(prediction, data_contentBased, data):
    # Convert prediction to a dictionary with user IDs as keys
    user_item_recs = {}
    for pred in prediction:
        user_id = pred.uid
        item_id = pred.iid
        if user_id not in user_item_recs:
            user_item_recs[user_id] = []
        user_item_recs[user_id].append(item_id)

    # Initialize dictionary for hybrid recommendations
    hybrid_recs = {}

    # For each user, generate recommendations
    for user_id, items in user_item_recs.items():
        # Initialize list to store recommendations for the user
        user_recommendations = items.copy()  # Store original recommendations

        # For each item recommended to the user
        for item in items:
            # Get similar item recommendations from TF-IDF
            similar_items = data_contentBased.get(item)

            # Check if similar_items is not None before iterating
            if similar_items is not None:
                # Extend user_recommendations with similar items (excluding already recommended items)
                user_recommendations.extend([item for item in similar_items if item not in items])

        # Count occurrences of each item ID (ASIN) in user_recommendations
        item_counts = {}
        for item in user_recommendations:
            if item in item_counts:
                item_counts[item] += 1
            else:
                item_counts[item] = 1

        # Sort items by count (descending order) and prioritize original recommendations if counts are equal
        sorted_items = sorted(item_counts.items(), key=lambda x: (-x[1], x[0]))

        # Take the top ten ASINs from sorted_items
        top_ten_asins = [item[0] for item in sorted_items[:10]]

        # Add user recommendations to the hybrid recommendations
        hybrid_recs[user_id] = top_ten_asins

    # Cross-validate the hybrid recommender system
    dummy_algo = DummyAlgorithm()
    cross_val_results = cross_validate(dummy_algo, data, measures=['mae', 'rmse'], cv=5, verbose=True)
    avg_mae = np.mean(cross_val_results['test_mae'])
    avg_rmse = np.mean(cross_val_results['test_rmse'])

    # Calculate MAE and RMSE on the test data
    actual_ratings = []
    predicted_ratings = []
    for user_id, items in hybrid_recs.items():
        for item in items:
            actual_rating = [rating for (uid, iid, rating, _) in data.raw_ratings if uid == user_id and iid == item]
            if actual_rating:  # Check if actual rating exists
                actual_ratings.append(actual_rating[0])
                predicted_ratings.append(5)  # Assuming all predicted ratings are 5 (can be replaced with actual predictions)

    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

    # Print MAE and RMSE
    print("MAE:", mae)
    print("RMSE:", rmse)

    return hybrid_recs, avg_mae, avg_rmse, mae, rmse

# Call the hybrid recommender system function
prediction_hybrid, avg_mae, avg_rmse, mae, rmse = hybrid_recommender_system(prediction_user_based_KNNWithMeans, pred_content_based_recommender_system, data)


Evaluating MAE, RMSE of algorithm DummyAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.6498  0.6231  0.6362  0.6226  0.6094  0.6282  0.0137  
RMSE (testset)    1.1467  1.1137  1.1173  1.1089  1.0936  1.1161  0.0173  
Fit time          0.00    0.00    0.01    0.01    0.01    0.00    0.00    
Test time         0.05    0.06    0.06    0.06    0.05    0.06    0.00    
MAE: 0.6052631578947368
RMSE: 1.038723913473187


# Interface

In [36]:
# format the hybrid prediction to match the other predictions

# Define the Prediction namedtuple
Prediction = namedtuple('Prediction', ['uid', 'iid', 'r_ui', 'est', 'details'])

# Initialize an empty list to store Prediction objects
prediction_hybrid_formatted = []

# Iterate over each user ID and their corresponding list of item IDs
for user_id, item_ids in prediction_hybrid.items():
    for item_id in item_ids:
        # Create a Prediction object with empty values for r_ui, est, and details
        prediction = Prediction(uid=user_id, iid=item_id, r_ui=None, est=None, details=None)
        # Append the Prediction object to the list
        prediction_hybrid_formatted.append(prediction)


In [37]:
rec_sys_dict = {'Item-based KNNWithMeans': prediction_item_based_KNNWithMeans, 
                'Item-based KNNBasic': prediction_item_based_KNNBasic,
                'User-based KNNWithMeans': prediction_user_based_KNNWithMeans,
                'User-based KNNBasic': prediction_user_based_KNNBasic,
                'Singular Value Decomposition (SVD)': prediction_SVD,
                'Hybrid Approach' : prediction_hybrid_formatted}

In [38]:
import tkinter as tk
from tkinter import ttk

def create_recommendation_interface(rec_sys_dict):
    # Function to create GUI for book recommendation system
    global notebook, text_results, comboboxes, data_preprocessed, current_userID

    current_userID = 'Enter User ID'
    
    # Create the root window
    root = tk.Tk()
    root.title("Book Recommendation System")
    root.tk_setPalette(background='#f0f0f0', foreground='#2e3440', activeBackground='#ff8c00', activeForeground='#2e3440')

    # Create a notebook (tabs) to switch between recommender systems
    notebook = ttk.Notebook(root)
    notebook.pack(fill='both', expand=True)

    # Event handler for tab selection
    notebook.bind("<<NotebookTabChanged>>", on_tab_selected)
    
    # Dictionary to store comboboxes for each tab
    comboboxes = {}
    text_results = {}

    for system_name, system_pred in rec_sys_dict.items():
        # Create a frame for the current tab
        frame = tk.Frame(notebook)
        frame.configure(background='#e0e0e0')  # Set a slightly darker grey for the frame
        notebook.add(frame, text=system_name)
        
        # Create a label and combobox for entering the user ID
        label_user_id = tk.Label(frame, text="Enter User ID:", font=("Helvetica", 12), background='#e0e0e0', foreground='#ff8c00')
        label_user_id.pack(pady=5)
        
        combobox_user_id = ttk.Combobox(frame, font=("Helvetica", 12))
        combobox_user_id.pack(pady=5)
        
        # Store the combobox in the dictionary
        comboboxes[system_name] = combobox_user_id

        # Get the unique user IDs for the current tab's system
        user_ids = set([pred.uid for pred in system_pred])
        
        # Update the combobox with the user IDs
        combobox_user_id['values'] = sorted(user_ids)

        # Set the initial value of combobox to current_userID
        combobox_user_id.set(current_userID)

        # Create a search button
        button_search = tk.Button(frame, text="Search", command=lambda system_name=system_name: search_books(system_name), font=("Helvetica", 12), bg="#ff8c00", fg="#2e3440", activebackground="#ffa31a", activeforeground="#2e3440")
        button_search.pack(pady=5)

        # Create a text widget to display results
        text_results[system_name] = tk.Text(frame, height=15, width=130 , font=("Helvetica", 12), bg="#f0f0f0", fg="#2e3440", selectbackground="#ff8c00", selectforeground="#2e3440")
        text_results[system_name].pack(pady=10, padx=10)
        
        # Insert default message
        text_results[system_name].insert(tk.END, "Please select a user ID and click 'Search' to display results.\n")

    # Run the main event loop
    root.mainloop()

def on_tab_selected(event):
    # Event handler for tab selection
    global comboboxes, notebook, rec_sys_dict, current_userID

    #print('changed')

    # Get the system name of the currently selected tab
    system_name = notebook.tab(notebook.select(), "text")
    
    # Get the combobox for the current tab's system
    combobox_user_id = comboboxes[system_name]

    # Set combobox to last used userID
    combobox_user_id.set(current_userID)

    # Call search_books directly after setting the combobox
    search_books(system_name)

def search_books(system_name):

    # Function to search books for a given user ID
    global text_results, comboboxes, current_userID  # Declare global variables
    
    # Get the user ID entered by the user
    user_id = comboboxes[system_name].get()
    
    #print('Search')
    #print(user_id)
    
    # Check if the user ID is valid
    if user_id.strip() == '':
        #print('Error: Please enter a valid User ID.')
        messagebox.showerror("Error", "Please enter a valid User ID.")
        return
    
    # Clear the current contents of the text widget
    text_results[system_name].delete('1.0', tk.END)
    
    #print(system_name)
    
    # Get the predictions for the current tab's system
    selected_pred = rec_sys_dict[system_name]

    if (system_name != 'Hybrid Approach'):

        # Initialize an empty list to store predictions for the target UID
        predictions_for_uid = []
        
        # Iterate through each prediction in the selected prediction
        for prediction in selected_pred:
            # Check if the UID of the prediction matches the target UID
            if prediction.uid == user_id:
                # If it matches, append the prediction to the list
                predictions_for_uid.append(prediction)
    
        # Sort the predictions_for_uid list based on the estimated rating (est)
        predictions_for_uid_sorted = sorted(predictions_for_uid, key=lambda x: x.est, reverse=True)
    
        # Get the top ten predictions
        top_ten_predictions = predictions_for_uid_sorted[:10]
        #print(top_ten_predictions)
    
        # Insert the top ten predictions into the text widget
        text_results[system_name].insert(tk.END, f"Top Ten Predictions for User ID: {user_id}\n\n")
        for i, prediction in enumerate(top_ten_predictions, 1):
            # Get the title corresponding to the item ID (ASIN)
            title = data_preprocessed[data_preprocessed['asin'] == prediction.iid]['title'].values[0]
            text_results[system_name].insert(tk.END, f"{i}. Title: {title}\n")#, Item ID: {prediction.iid}, Estimated Rating: {prediction.est}\n")        
    else:
        #print(user_id)

        # Iterate over each Prediction object in selected_pred and get top ten predictions
        for prediction in selected_pred:            
            # Get the top ten predictions for the current user ID
            top_ten_predictions = [p for p in selected_pred if p.uid == user_id][:10]

        # Insert the top ten predictions into the text widget
        text_results[system_name].insert(tk.END, f"Top Ten Predictions for User ID: {user_id}\n\n")
        # Iterate over the top ten predictions for the current user ID
        for i, prediction in enumerate(top_ten_predictions, 1):
            # Get the title corresponding to the item ID (iid)
            title = data_preprocessed[data_preprocessed['asin'] == prediction.iid]['title'].values[0]
            text_results[system_name].insert(tk.END, f"{i}. Title: {title}\n")#, Item ID: {prediction.iid}, Estimated Rating: {prediction.est}\n")        
        
    #save current user for changing tabs
    current_userID = comboboxes[system_name].get()

# Example usage
create_recommendation_interface(rec_sys_dict)
