## Import

In [1]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from surprise import KNNWithMeans
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import accuracy
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from surprise import AlgoBase

# Load Data

In [2]:
path = os.path.expanduser('~/Documents/Studium/Master/Web Mining/Project/data_kindle_preprocessed.xlsx')
data_preprocessed = pd.read_excel(path, index_col=[0])

## Data Preparation for Memory-Based and Model-Based

In [3]:
#Reading the dataset
#df_filtered = data.filter(['reviewerID', 'asin', 'rating'], axis=1)
#reader = Reader(rating_scale=(1, 5))
#data_filtered = Dataset.load_from_df(df_filtered,reader)
#Splitting the dataset
#trainset, testset = train_test_split(data_filtered, test_size=0.3,random_state=10)

In [4]:
# Define the reader
reader = Reader(rating_scale=(1, 5))

# Load the dataset from DataFrame
data = Dataset.load_from_df(data_preprocessed[["reviewerID", "asin", "rating"]], reader)

# Get the full training set
trainset = data.build_full_trainset()

# Create test set
testset = trainset.build_testset()

## Data Preparation for Content-Based

In [5]:
data_contentBased = data_preprocessed.copy()

In [6]:
# Replace non-finite values with NaN
data_contentBased['publication_year'] = data_contentBased['publication_year'].replace([np.inf, -np.inf], np.nan)

# Convert NaN to a placeholder value (e.g., -1)
data_contentBased['publication_year'].fillna(-1, inplace=True)

# Convert the column to integers
data_contentBased['publication_year'] = data_contentBased['publication_year'].astype(int).astype(str)

#preprare the textfiled 'book_info' to make vector
data_contentBased["book_info"] =  data_contentBased['category_string'] + '  ' + data_contentBased['brand'] + '  ' + data_contentBased['paid_free']+ ' ' + data_contentBased['print_length_category'] + ' ' + data_contentBased['publication_year'] + '  ' + data_contentBased['language'] 
data_contentBased.drop(['rating', 'brand', 'reviewerID', 'language','print_length_category', 'publication_year', 'category_string', 'paid_free'],axis=1,inplace=True)
data_contentBased.drop_duplicates(subset=['asin', 'title'], inplace=True)
data_contentBased.head()

Unnamed: 0,asin,title,book_info
0,B0015Z7VFQ,Look What Santa Brought (The Perfect Gift) - K...,"Kindle Store, Kindle eBooks, Literature & Fict..."
1,B0017HNV1U,Babylonian Laws- The Oldest Code of Laws in th...,"Kindle Store, Kindle eBooks, History King of ..."
2,B001892EI8,The Billionaire&s Baby (Harlequin Mini # 19) -...,"Kindle Store, Kindle eBooks, Romance Leanne B..."
4,B001892DGG,The Wallflower (Halle Puma Book 1) - Kindle ed...,"Kindle Store, Kindle eBooks, Romance Visit Am..."
5,B001BRD238,Secrets: a PsyCop Novel - Kindle edition,"Kindle Store, Kindle eBooks, Romance Visit Am..."


# Collaborative Filtering Recommender System (Memory-Based)

## 1. Item-based collaborative filtering with k-NN
KNNWithMeans with cosine, c-fold:5
- Mean RMSE-Validation: 0.8901
- RMSE-Test: 0.8879494024817297

In [7]:
def item_based_KNNWithMeans_recommender_system(trainset, testset, data):

    # Define the parameter grid with an increased range of k and more CV folds
    param_grid = {'k': [3, 5, 10, 15, 20, 30, 35, 40],
                  'sim_options': {'name': ['cosine'],
                                  'user_based': [False]}
                  }
    
    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    
    # Train the best model with the new parameters
    algo = KNNWithMeans(k=gs.best_params['rmse']['k'], sim_options={'name': 'cosine', 'user_based': False})
    algo.fit(trainset)
    
    # Evaluate the trained model on the test set
    test_pred = algo.test(testset)
    print("Item-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)
    
    # run the trained model against the testset
    test_pred = algo.test(testset)
    
    return test_pred

In [8]:
def item_based_KNNWithMeans_recommender_system_test(trainset, testset, data):

    # Define the parameter grid with an increased range of k and more CV folds
    param_grid = {'k': [3, 5, 10, 15, 20, 30, 35, 40],
                  'sim_options': {'name': ['cosine'],
                                  'user_based': [False]}
                  }
    
    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    
    # Train the best model with the new parameters
    # raus lassen -> algo = KNNWithMeans(k=gs.best_params['rmse']['k'], sim_options={'name': 'cosine', 'user_based': False})
    best_algo.fit(trainset) # best algo statt algo
    
    # Evaluate the trained model on the test set
    test_pred = best_algo.test(testset)
    print("Item-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)
    
    return test_pred

## 2. Item-based collaborative filtering with k-NN
KNNBasic with cosine or pearson, c-fold:5
Best Parameters: {'k': 40, 'sim_options': {'name': 'cosine', 'user_based': False}}
- Mean RMSE-Validation: 0.7899
- RMSE-Test: 0.7978694214800277

In [9]:
def item_based_KNNBasic_recommender_system(trainset, testset, data):

    # Define the parameter grid with an increased range of k and more CV folds
    param_grid = {'k': [3, 5, 10, 15, 20, 30, 35, 40],
                  'sim_options': {'name': ['cosine', 'pearson'],
                                  'user_based': [False]}
                  }
    
    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    
    # Train the best model with the new parameters
    algo = KNNBasic(k=gs.best_params['rmse']['k'], sim_options={'name': 'cosine', 'user_based': False})
    algo.fit(trainset)
    
    # Evaluate the trained model on the test set
    test_pred = algo.test(testset)
    print("Item-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)

    # run the trained model against the testset
    test_pred = algo.test(testset)
    
    return test_pred

## 1. User-Based Collaborative Filtering with Knn 
KNNBasic with pearson, cv-fold:5
- RMSE Validation: 0.9869196859331819
- RMSE Test: 0.4940767427403984}}

In [10]:
def user_based_KNNBasic_pearson_recommender_system(trainset, testset, data):
    
    param_grid = {'k': [1,3,10, 20, 25, 30, 35, 40],
              'sim_options': {'name': ['pearson'],
                              'user_based': [True]}
              }

    #'bsl_options': {'method': ['als'],
    #                              'n_epochs': [1, 3, 5, 10],
    #                              'reg_u': [15, 20, 25],
    #                              'reg_i': [25, 30, 35, 40]
    #                              },
    #sim_options = {'name': 'pearson', 'user_based': False, 'min_support': 1}
    #knn =  KNNWithMeans(sim_options = sim_options)

    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']  # pass the best model to algo
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    #Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # Train the best model with the new parameters
    algo = KNNBasic(k=gs.best_params['rmse']['k'], sim_options={'name': 'cosine', 'user_based': True})
    algo.fit(trainset)

    # Evaluate the trained model on the test set
    test_pred = algo.test(testset)
    print("Item-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)

    # run the trained model against the testset
    test_pred = algo.test(testset)
    
    return test_pred

## 3. User-Based Collaborative Filtering with Knn
KNNBasic with pearson or cosine, cv: 5

Best Parameters: {'k': 10, 'sim_options': {'name': 'pearson', 'user_based': True}}

- Mean RMSE-Validation: 0.7897
- RMSE-Test: 0.2329391001651323

In [11]:
def user_based_KNNBasic_pearson_cosine_recommender_system(trainset, testset, data):
    
    param_grid = {'k': [1, 3, 10, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['pearson', 'cosine'],
                                  'user_based': [True]}
                  }

    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5, refit='rmse')
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']  # pass the best model to algo
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])

    #Evaluate the best model using cross-validation
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # Train the best model with the new parameters
    algo = KNNBasic(k=gs.best_params['rmse']['k'], sim_options={'name': 'cosine', 'user_based': True})
    algo.fit(trainset)

    # Evaluate the trained model on the test set
    test_pred = algo.test(testset)
    print("User-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)

    # run the trained model against the testset
    test_pred = algo.test(testset)
    
    return test_pred

# Model-Based Recommender System

### Content-based filtering system

In [12]:
def SVD_recommender_system(trainset, testset, data):
    """
    Trains an SVD-based recommender system using grid search for hyperparameter tuning,
    and evaluates it on the provided test set.
    
    Parameters:
    - trainset (surprise.Trainset): Training set for the recommender system.
    - testset (list of tuples): Test set for evaluation.
    - data (surprise.Dataset): Full dataset for grid search.
    
    Returns:
    - test_pred (list of Prediction objects): Predictions on the test set.
    """
    
    # Define the parameter grid for grid search
    param_grid = {
        "n_factors": [25, 40, 55],
        "n_epochs": [10, 20],
        "lr_all": [0.005, 0.025, 0.125],
        "reg_all": [0.08, 0.16, 0.32],
        "random_state": [0],
    }
    
    # Perform grid search
    grid_search = GridSearchCV(
        SVD,
        param_grid,
        measures=["rmse", "mae"],
        cv=3,
        refit=True,
        n_jobs=-1,
        joblib_verbose=2
    )
    
    grid_search.fit(data)
    
    # Store grid search results for analysis
    results_df = pd.DataFrame.from_dict(grid_search.cv_results)
    
    # Print and return best parameters
    best_rmse = grid_search.best_score['rmse']
    best_params = grid_search.best_params['rmse']
    print("Best RMSE:", best_rmse)
    print("Best Parameters:", best_params)
    
    # Perform cross-validation on the best estimator
    cross_validate(grid_search.best_estimator['rmse'], data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    
    # Test the best model on the provided test set
    test_pred = grid_search.best_estimator['rmse'].test(testset)
    print("Content-based Model: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    
    return test_pred


# Content Recommender System

### TF-IDF Verctorizer and Cosine Similarity

In [13]:
def tfidf_recommender_system(data_contentBased):
    # Vectorize the text data using TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data_contentBased['book_info'])
    
    # Split the data into train, validation, and test sets
    train_data, test_data = train_test_split(data_contentBased, test_size=0.2, random_state=42)
    train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
    
    # Debugging print
    print("Train Data Shape:", train_data.shape)
    print("Validation Data Shape:", val_data.shape)
    print("Test Data Shape:", test_data.shape)
    
    # Calculate cosine similarity matrices
    tfidf_matrix_train = tfidf_vectorizer.transform(train_data['book_info'])
    tfidf_matrix_val = tfidf_vectorizer.transform(val_data['book_info'])
    tfidf_matrix_test = tfidf_vectorizer.transform(test_data['book_info'])
    
    # Calculate cosine similarity matrices with correct dimensions
    cosine_sim_train = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
    cosine_sim_val = cosine_similarity(tfidf_matrix_val, tfidf_matrix_train)
    cosine_sim_test = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)
    
    # Debugging print
    print("Cosine similarity matrix size (Train):", cosine_sim_train.shape)
    print("Cosine similarity matrix size (Validation):", cosine_sim_val.shape)
    print("Cosine similarity matrix size (Test):", cosine_sim_test.shape)
    
    # Implement function to recommend books
    def recommend(asin, cosine_sim_matrix, train_data):
        indices = train_data[train_data['asin'] == asin].index.tolist()
        if not indices:
            return []
        
        recommended_books = []
        for idx in indices:
            try:
                sim_scores = list(enumerate(cosine_sim_matrix[idx]))
                sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
                sim_scores = sim_scores[1:11]  # Top 10 similar items
                book_indices = [i[0] for i in sim_scores]
                recommended_books.extend(train_data['asin'].iloc[book_indices].tolist())
            except IndexError:
                continue
        
        return recommended_books if recommended_books else []  # Return empty list if no recommendations found
    
    # Pre-calculate recommendations for all books where recommendations are available
    all_asins = train_data['asin'].unique()
    pred_content_based_recommender_system = {asin: recommend(asin, cosine_sim_train, train_data)[:10] for asin in all_asins}
    
    # Evaluate the recommender system
    def evaluate_recommender(test_data, cosine_sim_matrix, train_data):
        y_true = []
        y_pred = []
        for asin in test_data['asin']:
            true_books = set(test_data[test_data['asin'] == asin]['asin'])
            recommended_books = set(pred_content_based_recommender_system.get(asin, []))  # Get precalculated recommendations
            y_true.append(len(true_books))
            y_pred.append(len(recommended_books))
    
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        return mae, rmse
    
    # Evaluate the recommender system
    mae_val, rmse_val = evaluate_recommender(val_data, cosine_sim_val, train_data)
    mae_test, rmse_test = evaluate_recommender(test_data, cosine_sim_test, train_data)
    
    print("Validation MAE:", mae_val)
    print("Validation RMSE:", rmse_val)
    print("Test MAE:", mae_test)
    print("Test RMSE:", rmse_test)

    return pred_content_based_recommender_system


# Run All predictions - Compare RMSE and MAE

In [14]:
prediction_SVD = SVD_recommender_system(trainset, testset, data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   11.5s finished


Best RMSE: 0.7484365727024492
Best Parameters: {'n_factors': 25, 'n_epochs': 20, 'lr_all': 0.005, 'reg_all': 0.08, 'random_state': 0}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7338  0.7485  0.7339  0.7567  0.7651  0.7476  0.0124  
MAE (testset)     0.5253  0.5297  0.5241  0.5357  0.5400  0.5310  0.0061  
Fit time          0.11    0.09    0.11    0.10    0.10    0.10    0.01    
Test time         0.02    0.10    0.02    0.01    0.02    0.03    0.03    
Content-based Model: Test Set
RMSE: 0.6735


In [15]:
prediction_item_based_KNNWithMeans =  item_based_KNNWithMeans_recommender_system(trainset, testset, data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [16]:
prediction_item_based_KNNWithMeans_test = item_based_KNNWithMeans_recommender_system_test(trainset, testset, data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [17]:
prediction_item_based_KNNBasic =  item_based_KNNBasic_recommender_system(trainset, testset, data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computi

In [18]:
prediction_user_based_KNNBasic_pearson = user_based_KNNBasic_pearson_recommender_system(trainset, testset, data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

In [19]:
pred_content_based_recommender_system = tfidf_recommender_system(data_contentBased)

Train Data Shape: (8700, 3)
Validation Data Shape: (2900, 3)
Test Data Shape: (2901, 3)
Cosine similarity matrix size (Train): (8700, 8700)
Cosine similarity matrix size (Validation): (2900, 8700)
Cosine similarity matrix size (Test): (2901, 8700)
Validation MAE: 1.0
Validation RMSE: 1.0
Test MAE: 1.0
Test RMSE: 1.0


# Hybrid Approach

### User-Based Collaborative Filtering + Content-Based Filtering
Run after these predictions are calculated

In [39]:
# Create a dummy algorithm class that does nothing for the DummyAlgorithm class 
# that initializes the trainset attribute with a dummy value. Here's how you can modify the code:

class DummyAlgorithm(AlgoBase):
    def __init__(self):
        pass

    def fit(self, trainset):
        self.trainset = trainset  # Initialize trainset attribute with a dummy value
        pass

    def estimate(self, u, i):
        return 5  # Return a dummy rating of 5 for all predictions


def hybrid_recommender_system(prediction, data_contentBased, data):
    # Convert prediction to a dictionary with user IDs as keys
    user_item_recs = {}
    for pred in prediction:
        user_id = pred.uid
        item_id = pred.iid
        if user_id not in user_item_recs:
            user_item_recs[user_id] = []
        user_item_recs[user_id].append(item_id)

    # Initialize dictionary for hybrid recommendations
    hybrid_recs = {}

    # For each user, generate recommendations
    for user_id, items in user_item_recs.items():
        # Initialize list to store recommendations for the user
        user_recommendations = items.copy()  # Store original recommendations

        # For each item recommended to the user
        for item in items:
            # Get similar item recommendations from TF-IDF
            similar_items = data_contentBased.get(item)

            # Check if similar_items is not None before iterating
            if similar_items is not None:
                # Extend user_recommendations with similar items (excluding already recommended items)
                user_recommendations.extend([item for item in similar_items if item not in items])

        # Count occurrences of each item ID (ASIN) in user_recommendations
        item_counts = {}
        for item in user_recommendations:
            if item in item_counts:
                item_counts[item] += 1
            else:
                item_counts[item] = 1

        # Sort items by count (descending order) and prioritize original recommendations if counts are equal
        sorted_items = sorted(item_counts.items(), key=lambda x: (-x[1], x[0]))

        # Take the top ten ASINs from sorted_items
        top_ten_asins = [item[0] for item in sorted_items[:10]]

        # Add user recommendations to the hybrid recommendations
        hybrid_recs[user_id] = top_ten_asins

    # Cross-validate the hybrid recommender system
    dummy_algo = DummyAlgorithm()
    cross_val_results = cross_validate(dummy_algo, data, measures=['mae', 'rmse'], cv=5, verbose=True)
    avg_mae = np.mean(cross_val_results['test_mae'])
    avg_rmse = np.mean(cross_val_results['test_rmse'])

    # Calculate MAE and RMSE on the test data
    actual_ratings = []
    predicted_ratings = []
    for user_id, items in hybrid_recs.items():
        for item in items:
            actual_rating = [rating for (uid, iid, rating, _) in data.raw_ratings if uid == user_id and iid == item]
            if actual_rating:  # Check if actual rating exists
                actual_ratings.append(actual_rating[0])
                predicted_ratings.append(5)  # Assuming all predicted ratings are 5 (can be replaced with actual predictions)

    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

    # Print MAE and RMSE
    print("MAE:", mae)
    print("RMSE:", rmse)

    return hybrid_recs, avg_mae, avg_rmse, mae, rmse

# Call the hybrid recommender system function
prediction_hybrid, avg_mae, avg_rmse, mae, rmse = hybrid_recommender_system(prediction_item_based_KNNWithMeans_test, pred_content_based_recommender_system, data)


Evaluating MAE, RMSE of algorithm DummyAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.6375  0.6538  0.6175  0.6106  0.6216  0.6282  0.0156  
RMSE (testset)    1.1371  1.1445  1.0971  1.0857  1.1153  1.1160  0.0225  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.01    0.01    0.01    0.02    0.01    0.01    0.00    
MAE: 0.6
RMSE: 1.02469507659596


In [40]:
prediction_hybrid

{'A3OC8ZG1S3OAVA': ['B00GW6S0DU',
  'B00HND20QY',
  'B00K1RB5JW',
  'B00RC83ATG',
  'B00KSND3LW',
  'B00M3GWXHW',
  'B00MAX4ZPQ',
  'B015TYIW42',
  'B019PJ5LI2',
  'B01AKXRKO0'],
 'A2U8YWPP1PYHJM': ['B00ND67XBC',
  'B00A946MNQ',
  'B00L2X37QS',
  'B00N6CY244',
  'B00IHN8U0Y',
  'B00NLKJ4O4',
  'B00O0A6Y74',
  'B00Q10EP3Q',
  'B00UR3GQ1M',
  'B00V5AP3OW'],
 'A3361XGKYF17S3': ['B00M3GWXHW',
  'B00MAX4ZPQ',
  'B00HND20QY',
  'B00QSTMEOW',
  'B00UR3GQ1M',
  'B00YDI785U',
  'B015A9WZOE',
  'B015KXNYRM',
  'B019FZFFK0',
  'B007WNXQS6'],
 'AVGYENZU56KBR': ['B00HND20QY',
  'B00D2XOTN6',
  'B00JD5B6A6',
  'B004UWU9IM',
  'B00F5FBRD4',
  'B00FPTE110',
  'B005WVPKMU',
  'B00D01DLRK',
  'B00E6ZL3D8',
  'B00ENU0NDM'],
 'A13Q7A1UWMNUU6': ['B00D2XOTN6',
  'B00HKTD5RY',
  'B00MZAIT6K',
  'B00W05SHZS',
  'B00WRBQPE0',
  'B016ARHVIU',
  'B019DZKCTQ',
  'B0019BI4XE',
  'B001BRD238',
  'B002HE1GS2'],
 'A310KT1UQC5UNU': ['B00MAX4ZPQ',
  'B00HND20QY',
  'B00M3GWXHW',
  'B01AKXRKO0',
  'B01D3C6IAW',
  'B00M7

# Interface

In [27]:
prediction_user_based_KNNBasic_pearson

[Prediction(uid='A3OC8ZG1S3OAVA', iid='B0015Z7VFQ', r_ui=1.0, est=1, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='A3OC8ZG1S3OAVA', iid='B003BEDV08', r_ui=3.0, est=3.0000000000000004, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='A3OC8ZG1S3OAVA', iid='B00405R608', r_ui=3.0, est=3.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='A3OC8ZG1S3OAVA', iid='B0043GX2HU', r_ui=3.0, est=3.0, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='A3OC8ZG1S3OAVA', iid='B004C44556', r_ui=4.0, est=2.858804453446679, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='A3OC8ZG1S3OAVA', iid='B004EYT9CS', r_ui=5.0, est=5, details={'actual_k': 1, 'was_impossible': False}),
 Prediction(uid='A3OC8ZG1S3OAVA', iid='B004UC6H7O', r_ui=5.0, est=4.999999999999999, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='A3OC8ZG1S3OAVA', iid='B004W0C520', r_ui=4.0, est=3.9991865250332443, details={'actual_k': 

In [44]:
from collections import namedtuple

# Define the Prediction namedtuple
Prediction = namedtuple('Prediction', ['uid', 'iid', 'r_ui', 'est', 'details'])

# Initialize an empty list to store Prediction objects
prediction_hybrid_formatted = []

# Iterate over each user ID and their corresponding list of item IDs
for user_id, item_ids in prediction_hybrid.items():
    for item_id in item_ids:
        # Create a Prediction object with empty values for r_ui, est, and details
        prediction = Prediction(uid=user_id, iid=item_id, r_ui=None, est=None, details=None)
        # Append the Prediction object to the list
        prediction_hybrid_formatted.append(prediction)

# Print the first few elements of the formatted predictions
for prediction in prediction_hybrid_formatted:
    print(prediction)


Prediction(uid='A3OC8ZG1S3OAVA', iid='B00GW6S0DU', r_ui=None, est=None, details=None)
Prediction(uid='A3OC8ZG1S3OAVA', iid='B00HND20QY', r_ui=None, est=None, details=None)
Prediction(uid='A3OC8ZG1S3OAVA', iid='B00K1RB5JW', r_ui=None, est=None, details=None)
Prediction(uid='A3OC8ZG1S3OAVA', iid='B00RC83ATG', r_ui=None, est=None, details=None)
Prediction(uid='A3OC8ZG1S3OAVA', iid='B00KSND3LW', r_ui=None, est=None, details=None)
Prediction(uid='A3OC8ZG1S3OAVA', iid='B00M3GWXHW', r_ui=None, est=None, details=None)
Prediction(uid='A3OC8ZG1S3OAVA', iid='B00MAX4ZPQ', r_ui=None, est=None, details=None)
Prediction(uid='A3OC8ZG1S3OAVA', iid='B015TYIW42', r_ui=None, est=None, details=None)
Prediction(uid='A3OC8ZG1S3OAVA', iid='B019PJ5LI2', r_ui=None, est=None, details=None)
Prediction(uid='A3OC8ZG1S3OAVA', iid='B01AKXRKO0', r_ui=None, est=None, details=None)
Prediction(uid='A2U8YWPP1PYHJM', iid='B00ND67XBC', r_ui=None, est=None, details=None)
Prediction(uid='A2U8YWPP1PYHJM', iid='B00A946MNQ', r_u

In [45]:
rec_sys_dict = {'SVD': prediction_SVD, 
                'Item-based KNNWithMeans': prediction_item_based_KNNWithMeans, 
                'Item-based KNNBasic': prediction_item_based_KNNBasic,
                'User-based KNNBasic': prediction_user_based_KNNBasic_pearson,
                'Hybrid Approach' : prediction_hybrid_formatted}

In [None]:
import tkinter as tk
from tkinter import ttk

def create_recommendation_interface(rec_sys_dict):
    # Function to create GUI for book recommendation system
    global notebook, text_results, comboboxes, data_preprocessed, current_userID

    current_userID = 'Enter User ID'
    
    # Create the root window
    root = tk.Tk()
    root.title("Book Recommendation System")
    root.tk_setPalette(background='#f0f0f0', foreground='#2e3440', activeBackground='#ff8c00', activeForeground='#2e3440')

    # Create a notebook (tabs) to switch between recommender systems
    notebook = ttk.Notebook(root)
    notebook.pack(fill='both', expand=True)

    # Event handler for tab selection
    notebook.bind("<<NotebookTabChanged>>", on_tab_selected)
    
    # Dictionary to store comboboxes for each tab
    comboboxes = {}
    text_results = {}

    for system_name, system_pred in rec_sys_dict.items():
        # Create a frame for the current tab
        frame = tk.Frame(notebook)
        frame.configure(background='#e0e0e0')  # Set a slightly darker grey for the frame
        notebook.add(frame, text=system_name)
        
        # Create a label and combobox for entering the user ID
        label_user_id = tk.Label(frame, text="Enter User ID:", font=("Helvetica", 12), background='#e0e0e0', foreground='#ff8c00')
        label_user_id.pack(pady=5)
        
        combobox_user_id = ttk.Combobox(frame, font=("Helvetica", 12))
        combobox_user_id.pack(pady=5)
        
        # Store the combobox in the dictionary
        comboboxes[system_name] = combobox_user_id

        # Get the unique user IDs for the current tab's system
        user_ids = set([pred.uid for pred in system_pred])
        
        # Update the combobox with the user IDs
        combobox_user_id['values'] = sorted(user_ids)

        # Set the initial value of combobox to current_userID
        combobox_user_id.set(current_userID)

        # Create a search button
        button_search = tk.Button(frame, text="Search", command=lambda system_name=system_name: search_books(system_name), font=("Helvetica", 12), bg="#ff8c00", fg="#2e3440", activebackground="#ffa31a", activeforeground="#2e3440")
        button_search.pack(pady=5)

        # Create a text widget to display results
        text_results[system_name] = tk.Text(frame, height=15, width=130 , font=("Helvetica", 12), bg="#f0f0f0", fg="#2e3440", selectbackground="#ff8c00", selectforeground="#2e3440")
        text_results[system_name].pack(pady=10, padx=10)
        
        # Insert default message
        text_results[system_name].insert(tk.END, "Please select a user ID and click 'Search' to display results.\n")

    # Run the main event loop
    root.mainloop()

def on_tab_selected(event):
    # Event handler for tab selection
    global comboboxes, notebook, rec_sys_dict, current_userID

    print('changed')

    # Get the system name of the currently selected tab
    system_name = notebook.tab(notebook.select(), "text")
    
    # Get the combobox for the current tab's system
    combobox_user_id = comboboxes[system_name]

    # Set combobox to last used userID
    combobox_user_id.set(current_userID)

    # Call search_books directly after setting the combobox
    search_books(system_name)

def search_books(system_name):

    # Function to search books for a given user ID
    global text_results, comboboxes, current_userID  # Declare global variables
    
    # Get the user ID entered by the user
    user_id = comboboxes[system_name].get()
    
    print('Search')
    print(user_id)
    
    # Check if the user ID is valid
    if user_id.strip() == '':
        #print('Error: Please enter a valid User ID.')
        messagebox.showerror("Error", "Please enter a valid User ID.")
        return
    
    # Clear the current contents of the text widget
    text_results[system_name].delete('1.0', tk.END)
    
    print(system_name)
    
    # Get the predictions for the current tab's system
    selected_pred = rec_sys_dict[system_name]

    if (system_name != 'Hybrid Approach'):

        # Initialize an empty list to store predictions for the target UID
        predictions_for_uid = []
        
        # Iterate through each prediction in the selected prediction
        for prediction in selected_pred:
            # Check if the UID of the prediction matches the target UID
            if prediction.uid == user_id:
                # If it matches, append the prediction to the list
                predictions_for_uid.append(prediction)
    
        # Sort the predictions_for_uid list based on the estimated rating (est)
        predictions_for_uid_sorted = sorted(predictions_for_uid, key=lambda x: x.est, reverse=True)
    
        # Get the top ten predictions
        top_ten_predictions = predictions_for_uid_sorted[:10]
        #print(top_ten_predictions)
    
        # Insert the top ten predictions into the text widget
        text_results[system_name].insert(tk.END, f"Top Ten Predictions for User ID: {user_id}\n\n")
        for i, prediction in enumerate(top_ten_predictions, 1):
            # Get the title corresponding to the item ID (ASIN)
            title = data_preprocessed[data_preprocessed['asin'] == prediction.iid]['title'].values[0]
            text_results[system_name].insert(tk.END, f"{i}. Title: {title}\n")#, Item ID: {prediction.iid}, Estimated Rating: {prediction.est}\n")        
    else:
        print(user_id)

        # Iterate over each Prediction object in selected_pred and get top ten predictions
        for prediction in selected_pred:            
            # Get the top ten predictions for the current user ID
            top_ten_predictions = [p for p in selected_pred if p.uid == user_id][:10]

        # Insert the top ten predictions into the text widget
        text_results[system_name].insert(tk.END, f"Top Ten Predictions for User ID: {user_id}\n\n")
        # Iterate over the top ten predictions for the current user ID
        for i, prediction in enumerate(top_ten_predictions, 1):
            # Get the title corresponding to the item ID (iid)
            title = data_preprocessed[data_preprocessed['asin'] == prediction.iid]['title'].values[0]
            text_results[system_name].insert(tk.END, f"{i}. Title: {title}\n")#, Item ID: {prediction.iid}, Estimated Rating: {prediction.est}\n")        
        
    #save current user for changing tabs
    current_userID = comboboxes[system_name].get()

# Example usage
create_recommendation_interface(rec_sys_dict)


changed
Search
Enter User ID
SVD
changed
Search
Enter User ID
Hybrid Approach
Enter User ID
Top Ten Predictions for User ID: A3OC8ZG1S3OAVA
1. Title: Alpha Wolf (Black Mesa Wolves Book 2) - Kindle edition
2. Title:  Shadow of Doubt - Part 1 eBook
3. Title: Hollywood Blood: A Hollywood Alphabet Series Thriller - Kindle edition
4. Title: Wild Wolf (Black Mesa Wolves Book 4) - Kindle edition
5. Title: Hollywood Crazy: A Holllywood Alphabet Series Thriller (A Hollywood Alphabet Series Thriller Book 3) - Kindle edition
6. Title: The Billionaire&s Christmas: A Sinclair Novella (The Sinclairs) - Kindle edition
7. Title:  Hunting Wolf (Black Mesa Wolves Book 3) eBook
8. Title: Hollywood Murder: A Hollywood Alphabet Series Thriller - Kindle edition
9. Title: Ruled
10. Title: Reunion - Kindle edition
Top Ten Predictions for User ID: A3OC8ZG1S3OAVA
1. Title: Alpha Wolf (Black Mesa Wolves Book 2) - Kindle edition
2. Title:  Shadow of Doubt - Part 1 eBook
3. Title: Hollywood Blood: A Hollywood Alph