## Import

In [48]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from surprise import KNNWithMeans
from surprise.dataset import DatasetAutoFolds
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import accuracy
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from surprise import AlgoBase
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Load Data

In [63]:
path = os.path.expanduser('../data/data_kindle_preprocessed.xlsx')
data_preprocessed = pd.read_excel(path, index_col=[0], dtype={'publication_year': str})

## Data Preparation for Memory-Based and Model-Based

In [64]:
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1,5)) # rating scale range

# Load the dataset from DataFrame
data = Dataset.load_from_df(data_preprocessed[["reviewerID", "asin", "rating"]], reader)

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

## Data Preparation for Content-Based

In [65]:
data_preprocessed['book_info']

0        Kindle Store, Kindle eBooks, Literature & Fict...
1        Kindle Store, Kindle eBooks, History  King of ...
2        Kindle Store, Kindle eBooks, Romance  Leanne B...
3        Kindle Store, Kindle eBooks, Romance  Leanne B...
4        Kindle Store, Kindle eBooks, Romance  Visit Am...
                               ...                        
19563    Kindle Store, Kindle eBooks, Literature & Fict...
19564    Kindle Store, Kindle eBooks, Literature & Fict...
19565    Kindle Store, Kindle eBooks, Science Fiction &...
19566    Kindle Store, Kindle eBooks, Literature & Fict...
19567    Kindle Store, Kindle eBooks, Teen & Young Adul...
Name: book_info, Length: 19568, dtype: object

In [90]:
#create subset of data_preprocessing to create new subset of columns
data_contentBased = data_preprocessed[["reviewerID", "asin", "rating", "book_info"]]

In [91]:
data_contentBased

Unnamed: 0,reviewerID,asin,rating,book_info
0,A3OC8ZG1S3OAVA,B0015Z7VFQ,1.0,"Kindle Store, Kindle eBooks, Literature & Fict..."
1,A2U8YWPP1PYHJM,B0017HNV1U,4.0,"Kindle Store, Kindle eBooks, History King of ..."
2,A3361XGKYF17S3,B001892EI8,3.0,"Kindle Store, Kindle eBooks, Romance Leanne B..."
3,AVGYENZU56KBR,B001892EI8,4.0,"Kindle Store, Kindle eBooks, Romance Leanne B..."
4,A3361XGKYF17S3,B001892DGG,3.0,"Kindle Store, Kindle eBooks, Romance Visit Am..."
...,...,...,...,...
19563,A1EQY74OFGE4NE,B01HIGNUGE,4.0,"Kindle Store, Kindle eBooks, Literature & Fict..."
19564,A1EQY74OFGE4NE,B01HINH1WQ,3.0,"Kindle Store, Kindle eBooks, Literature & Fict..."
19565,A1SVA69J57MX2A,B01HIOR0S0,5.0,"Kindle Store, Kindle eBooks, Science Fiction &..."
19566,A3FVMG7SWNF7QR,B01HIULQXY,5.0,"Kindle Store, Kindle eBooks, Literature & Fict..."


# Collaborative Filtering Recommender System (Memory-Based)

## 1. Item-based collaborative filtering with KNNWithMeans

In [5]:
def item_based_KNNWithMeans_recommender_system(trainset, testset, data):

    # Define the parameter grid with an increased range of k and more CV folds
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['cosine', 'pearson'],
                                  'user_based': [False]}
                  }
    
    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNWithMeans(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)

    # Calculate evaluation metrics
    print("Item-based Model with KNNWithMeans: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    
    # run the trained model against the testset
    test_pred = algo.test(testset)
    
    return test_pred

## 2. Item-based collaborative filtering with KNNBasic

In [7]:
def item_based_KNNBasic_recommender_system(trainset, testset, data):

    # Define the parameter grid with an increased range of k and more CV folds
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['cosine', 'pearson'],
                                  'user_based': [False]}
                  }
    
    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNBasic(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)

    # Calculate evaluation metrics
    print("Item-based Model with KNNBasic: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    
    # run the trained model against the testset
    test_pred = algo.test(testset)
    
    return test_pred

## 3. User-Based Collaborative Filtering with KNNWithMeans

In [8]:
def user_based_KNNWithMeans_recommender_system(trainset, testset, data):
    
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
                  'sim_options': {'name': ['pearson', 'cosine'],
                                  'user_based': [True]}
                 }


    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)
    
    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNWithMeans(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)

    # Calculate evaluation metrics
    print("User-based Model with KNNWithMeans: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    
    # run the trained model against the testset
    test_pred = algo.test(testset)
    
    return test_pred

## 4. User-Based Collaborative Filtering with KNNBasic

In [9]:
def user_based_KNNBasic_recommender_system(trainset, testset, data):
    
    param_grid = {'k': [1, 3, 10, 15, 20, 25, 30, 35, 40],
              'sim_options': {'name': ['pearson', 'cosine'],
                              'user_based': [True]}
              }

    # Use GridSearchCV to find the best model
    gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5)
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = KNNBasic(k=gs.best_params['rmse']['k'], sim_options=gs.best_params['rmse']['sim_options'])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)

    # Calculate evaluation metrics
    print("User-based Model with KNNBasic: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    
    # run the trained model against the testset
    test_pred = algo.test(testset)
    
    return test_pred

# Model-Based Recommender System

### Singular Value Decomposition (SVD)
 SVD is a matrix factorization technique commonly used in collaborative filtering-based recommender systems. 

In [16]:
def SVD_recommender_system(trainset, testset, data):
    """
    Trains an SVD-based recommender system using grid search for hyperparameter tuning,
    and evaluates it on the provided test set.
    
    Parameters:
    - trainset (surprise.Trainset): Training set for the recommender system.
    - testset (list of tuples): Test set for evaluation.
    - data (surprise.Dataset): Full dataset for grid search.
    
    Returns:
    - test_pred (list of Prediction objects): Predictions on the test set.
    """
    
    # Define the parameter grid for grid search
    param_grid = {
        "n_factors": [25, 30, 40, 55, 60, 75, 90, 100, 110],  # Number of latent factors in SVD model
        "n_epochs": [10, 20, 30], # Number of training epochs
        "lr_all": [0.005, 0.025, 0.125], # Learning rate for all parameters
        "reg_all": [0.08, 0.16, 0.32], # Regularization term for all parameters
        "random_state": [0], # Seed for reproducibility
    }

    # Perform grid search
    gs = GridSearchCV(SVD, 
                      param_grid, 
                      measures=["rmse", "mae"], 
                      cv=5, 
                      refit=True, # Refit on the entire dataset using best params
                      n_jobs=-1,  # Use all available CPUs for parallel processing
                      joblib_verbose=2 # Verbosity level for parallel processing
                     )
    gs.fit(data)

    # Get the best model from grid search
    best_algo = gs.best_estimator['rmse']
    params = gs.best_params['rmse']
    print("Best RMSE:", gs.best_score['rmse'])
    print("Best Parameters:", gs.best_params['rmse'])
    
    # Evaluate the best model using cross-validation
    cross_validate(best_algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    #Choose best algorithm based on grid search
    algo = SVD(n_epochs = params['n_epochs'], 
               lr_all = params['lr_all'], 
               n_factors = params['n_factors'],
               reg_all = params['reg_all'],
               random_state = params["random_state"])
    
    # Train the best model with the new parameters and evaluate the trained model on the test set
    test_pred = algo.fit(trainset).test(testset)

    # Calculate evaluation metrics
    print("SVD: Test Set")
    accuracy.rmse(test_pred, verbose=True)
    accuracy.mae(test_pred, verbose=True)
    
    # run the trained model against the testset
    test_pred = algo.test(testset)
    
    return test_pred


# Content Recommender System

### With TF-IDF Verctorizer and Cosine Similarity

In [74]:
# imports used
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline # using pipeline so different 


# split data random state 0 and test_size 0.25 default as you did not give the test_size

# Splitting the data into training and testing sets
X = data_contentBased['book_info']  # Independent variable - contains category_string, brand, paid_free, print_length_category, publication_year, language
y = data_contentBased['asin']  # Dependent variable (product ID)

# split training and testdata. random_state 42 is the same as for the split of the collaborative filtering models, to make sure to use the same training/testsplit
X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.25, random_state=42)




from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid_full = {
    'tfidf__sublinear_tf': [True, False],
    'tfidf__min_df': [1, 5, 10],
    'tfidf__norm': ['l1', 'l2'],
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'clf__alpha': [0.1, 0.5, 1.0]
}

# Define the parameter grid
param_grid = {
    'tfidf__sublinear_tf': [True],
    'tfidf__min_df': [5],
    'tfidf__norm': ['l2'],
    'tfidf__ngram_range': [(1, 2)]
    #'clf__alpha': [0.1, 0.5, 1.0]
}

# Initialize the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB()) #classifier
])

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

from sklearn.model_selection import cross_val_score

# Get the best estimator from the grid search
best_estimator = grid_search.best_estimator_
cross_val = cross_val_score(best_estimator, X_train, y_train, cv=3, scoring='accuracy')

# Print the cross-validation scores
print("Cross-validation scores:", cross_val)

# Train the Best Model
best_estimator.fit(X_train, y_train)

# Evaluate the Trained Model
# You can use accuracy_score, classification_report, confusion_matrix, etc., depending on your evaluation needs
from sklearn.metrics import accuracy_score

# Predictions on the test set
y_pred = best_estimator.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test set:", accuracy)

y_pred




Best parameters found: {'tfidf__min_df': 5, 'tfidf__ngram_range': (1, 2), 'tfidf__norm': 'l2', 'tfidf__sublinear_tf': True}
Best cross-validation score: 0.044562551103843016


In [97]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the number of similar ASINs to retrieve for each ASIN
n_recommendations = 10

# 1. Feature Extraction
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(data_contentBased['book_info'])

# 2. Similarity Calculation
similarity_matrix = cosine_similarity(X_tfidf)

# 3. Recommendation
# Create a dictionary to store similar ASINs for each ASIN
similar_asins_dict = {}

# Iterate through each ASIN
for i, asin in enumerate(data_contentBased['asin']):
    # Find index of current ASIN
    asin_index = data_contentBased.index[data_contentBased['asin'] == asin].tolist()[0]
    
    # Find top similar ASINs based on similarity scores
    similar_asins_indices = similarity_matrix[asin_index].argsort()[:-n_recommendations-1:-1]  # Top n similar ASINs
    similar_asins = data_contentBased.iloc[similar_asins_indices]['asin'].tolist()
    
    # Store similar ASINs in the dictionary
    similar_asins_dict[asin] = similar_asins

# Print or use similar_asins_dict as needed
print(similar_asins_dict)


{'B0015Z7VFQ': ['B0015Z7VFQ', 'B00LIAZ9H0', 'B00LIAZ9H0', 'B00SG1JF7Y', 'B00MNHZ49K', 'B00MNHZ49K', 'B002TLTMH0', 'B002TG4MIY', 'B002TG4MIY', 'B002TG4MPM'], 'B0017HNV1U': ['B0017HNV1U', 'B00MS0F4RY', 'B00PNU8Z86', 'B00PNU8Z86', 'B002HE1GS2', 'B00JWTGGVC', 'B00JWTGGVC', 'B00PKFVO0U', 'B00FX8SV50', 'B00KZ41KGY'], 'B001892EI8': ['B001892EI8', 'B001892EI8', 'B00CS74GE4', 'B00CS74GE4', 'B00CS74GE4', 'B00CS74GE4', 'B00K0KHCMO', 'B00HY3HN7E', 'B00HY3HN7E', 'B002HE1GS2'], 'B001892DGG': ['B001892DGG', 'B00HZMFD0I', 'B002HE1GS2', 'B00MLG8Y8Q', 'B00AR63B62', 'B00MGRTU8I', 'B00KAQXYL6', 'B00IWUGYQY', 'B00K0JQAAK', 'B00JPFOWZA'], 'B001BRD238': ['B001BRD238', 'B0019BI4XE', 'B00LH06W5O', 'B009YKYK1M', 'B015SD1EHQ', 'B00ON5NGXG', 'B00VGVHYQU', 'B002HE1GS2', 'B00F9BRVAC', 'B00F9UOK7U'], 'B001GPOO64': ['B001GPOO64', 'B00J48G2G0', 'B00O5A1BVS', 'B00ICC50TO', 'B00ICC50TO', 'B01CTH5H0Y', 'B002HE1GS2', 'B004LLIDMQ', 'B0015DRP7M', 'B006TXN540'], 'B001V9KG4E': ['B001V9KG4E', 'B001V9KG4E', 'B001V9KG4E', 'B00O1

In [105]:
sample_user_id = 'B0017HNV1U'
ground_truth = data_contentBased[data_contentBased['reviewerID'] == sample_user_id]['asin'].tolist()
recommendation_list = similar_asins_dict.get(sample_user_id, [])
print("Ground Truth:", ground_truth)
print("Recommendations:", recommendation_list)


Ground Truth: []
Recommendations: ['B0017HNV1U', 'B00MS0F4RY', 'B00PNU8Z86', 'B00PNU8Z86', 'B002HE1GS2', 'B00JWTGGVC', 'B00JWTGGVC', 'B00PKFVO0U', 'B00FX8SV50', 'B00KZ41KGY']


In [102]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import ndcg_score

def calculate_map_ndcg(data_contentBased, recommendations):
    user_ids = data_contentBased['reviewerID'].unique()
    map_scores = []
    ndcg_scores = []

    for user_id in user_ids:
        ground_truth = data_contentBased[data_contentBased['reviewerID'] == user_id]['asin'].tolist()
        recommendation_list = recommendations.get(user_id, [])

        # Compute Average Precision
        relevance = [1 if item in ground_truth else 0 for item in recommendation_list]
        ap_score = average_precision_score(relevance, range(1, len(relevance) + 1))
        map_scores.append(ap_score)

        # Compute NDCG
        relevance_dict = {item: 1 if item in ground_truth else 0 for item in recommendation_list}
        relevance_array = [relevance_dict[item] for item in recommendation_list]
        ndcg_score = ndcg_score([relevance_array])
        ndcg_scores.append(ndcg_score)

    mean_ap = sum(map_scores) / len(map_scores)
    mean_ndcg = sum(ndcg_scores) / len(ndcg_scores)

    return mean_ap, mean_ndcg

# Calculate MAP and NDCG
map_score, ndcg_score = calculate_map_ndcg(data_contentBased, similar_asins_dict)
print("Mean Average Precision (MAP):", map_score)
print("Normalized Discounted Cumulative Gain (NDCG):", ndcg_score)


IndexError: cannot do a non-empty take from an empty axes.

In [32]:
def tfidf_recommender_system(data):

    """
    A content-based recommender system based on TF-IDF (Term Frequency-Inverse Document Frequency) vectorization 
    and cosine similarity. It takes a pandas DataFrame containing book information, including titles and descriptions,
    and returns a dictionary mapping ASINs to recommended ASINs based on their textual similarities.

    Args:
        data (pandas.DataFrame): DataFrame containing book information with columns 'asin', 'title', and 'book_info'.

    Returns:
        dict: A dictionary mapping ASINs to recommended ASINs based on textual similarities.
    """
    
    # Vectorize the text data using TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['book_info'])
    
    # Split the data into train, validation, and test sets
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
    
    # Debugging print
    print("Train Data Shape:", train_data.shape)
    print("Validation Data Shape:", val_data.shape)
    print("Test Data Shape:", test_data.shape)
    
    # Calculate cosine similarity matrices
    tfidf_matrix_train = tfidf_vectorizer.transform(train_data['book_info'])
    tfidf_matrix_val = tfidf_vectorizer.transform(val_data['book_info'])
    tfidf_matrix_test = tfidf_vectorizer.transform(test_data['book_info'])
    
    # Calculate cosine similarity matrices with correct dimensions
    cosine_sim_train = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
    cosine_sim_val = cosine_similarity(tfidf_matrix_val, tfidf_matrix_train)
    cosine_sim_test = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)
    
    # Debugging print
    print("Cosine similarity matrix size (Train):", cosine_sim_train.shape)
    print("Cosine similarity matrix size (Validation):", cosine_sim_val.shape)
    print("Cosine similarity matrix size (Test):", cosine_sim_test.shape)
    
    # Implement function to recommend books
    def recommend(title, cosine_sim_matrix, train_data):
        indices = train_data[train_data['title'] == title].index.tolist()
        if not indices:
            return []
        
        recommended_books = []
        for idx in indices:
            try:
                sim_scores = list(enumerate(cosine_sim_matrix[idx]))
                sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
                sim_scores = sim_scores[1:11]  # Top 10 similar items
                book_indices = [i[0] for i in sim_scores]
                recommended_books.extend(train_data['title'].iloc[book_indices].tolist())
            except IndexError:
                continue
        
        return recommended_books if recommended_books else []  # Return empty list if no recommendations found
    
    # Pre-calculate recommendations for all books where recommendations are available
    all_titles = train_data['title'].unique()
    pred_content_based_recommender_system = {title: recommend(title, cosine_sim_train, train_data)[:10] for title in all_titles}
    
    # Evaluate the recommender system
    def evaluate_recommender(test_data, cosine_sim_matrix, train_data):
        y_true = []
        y_pred = []
        for title in test_data['title']:
            true_books = set(test_data[test_data['title'] == title]['title'])
            recommended_books = set(pred_content_based_recommender_system.get(title, []))  # Get precalculated recommendations
            y_true.append(len(true_books))
            y_pred.append(len(recommended_books))
    
        #print(y_true)
        #print(y_pred)
        
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        return mae, rmse
    
    # Evaluate the recommender system
    mae_val, rmse_val = evaluate_recommender(val_data, cosine_sim_val, train_data)
    mae_test, rmse_test = evaluate_recommender(test_data, cosine_sim_test, train_data)
    
    print("Validation MAE:", mae_val)
    print("Validation RMSE:", rmse_val)
    print("Test MAE:", mae_test)
    print("Test RMSE:", rmse_test)

    # Create a dictionary mapping titles to ASINs
    title_to_asin = data.set_index('title')['asin'].to_dict()
    
    # Replace titles with ASINs in the original dictionary
    asin_based_recommender_system = {}
    for title, related_titles in pred_content_based_recommender_system.items():
        asin = title_to_asin.get(title, None)
        if asin is not None:
            asin_based_recommender_system[asin] = [title_to_asin.get(t, None) for t in related_titles]
    
    # Print the result
    #print(asin_based_recommender_system)

    return asin_based_recommender_system

# Run All predictions - Compare RMSE and MAE

In [6]:
prediction_item_based_KNNWithMeans = item_based_KNNWithMeans_recommender_system(trainset, testset, data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computi

In [10]:
prediction_item_based_KNNBasic = item_based_KNNBasic_recommender_system(trainset, testset, data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computi

In [11]:
prediction_user_based_KNNWithMeans = user_based_KNNWithMeans_recommender_system(trainset, testset, data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Comp

In [12]:
prediction_user_based_KNNBasic = user_based_KNNBasic_recommender_system(trainset, testset, data)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Comp

In [17]:
prediction_SVD = SVD_recommender_system(trainset, testset, data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    4.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed:  2.1min finished


Best RMSE: 0.7481124313304917
Best Parameters: {'n_factors': 40, 'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.08, 'random_state': 0}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7271  0.7475  0.7678  0.7539  0.7435  0.7480  0.0133  
MAE (testset)     0.5191  0.5334  0.5470  0.5330  0.5269  0.5319  0.0092  
Fit time          0.23    0.25    0.27    0.25    0.27    0.25    0.01    
Test time         0.07    0.07    0.09    0.08    0.09    0.08    0.01    
SVD: Test Set
RMSE: 0.7588
MAE:  0.5375


In [33]:
pred_content_based_recommender_system = tfidf_recommender_system(data_contentBased)

Train Data Shape: (8700, 3)
Validation Data Shape: (2900, 3)
Test Data Shape: (2901, 3)
Cosine similarity matrix size (Train): (8700, 8700)
Cosine similarity matrix size (Validation): (2900, 8700)
Cosine similarity matrix size (Test): (2901, 8700)
Validation MAE: 1.0882758620689654
Validation RMSE: 1.3721365167830988
Test MAE: 1.0827300930713548
Test RMSE: 1.3517769530190797


# Hybrid Approach

### User-Based Collaborative Filtering + Content-Based Filtering
Run after these predictions are calculated

In [34]:
# Create a dummy algorithm class that does nothing for the DummyAlgorithm class 
# that initializes the trainset attribute with a dummy value. Here's how you can modify the code:

class DummyAlgorithm(AlgoBase):
    def __init__(self):
        pass

    def fit(self, trainset):
        self.trainset = trainset  # Initialize trainset attribute with a dummy value
        pass

    def estimate(self, u, i):
        return 5  # Return a dummy rating of 5 for all predictions


def hybrid_recommender_system(prediction, data_contentBased, data):
    # Convert prediction to a dictionary with user IDs as keys
    user_item_recs = {}
    for pred in prediction:
        user_id = pred.uid
        item_id = pred.iid
        if user_id not in user_item_recs:
            user_item_recs[user_id] = []
        user_item_recs[user_id].append(item_id)

    # Initialize dictionary for hybrid recommendations
    hybrid_recs = {}

    # For each user, generate recommendations
    for user_id, items in user_item_recs.items():
        # Initialize list to store recommendations for the user
        user_recommendations = items.copy()  # Store original recommendations

        # For each item recommended to the user
        for item in items:
            # Get similar item recommendations from TF-IDF
            similar_items = data_contentBased.get(item)

            # Check if similar_items is not None before iterating
            if similar_items is not None:
                # Extend user_recommendations with similar items (excluding already recommended items)
                user_recommendations.extend([item for item in similar_items if item not in items])

        # Count occurrences of each item ID (ASIN) in user_recommendations
        item_counts = {}
        for item in user_recommendations:
            if item in item_counts:
                item_counts[item] += 1
            else:
                item_counts[item] = 1

        # Sort items by count (descending order) and prioritize original recommendations if counts are equal
        sorted_items = sorted(item_counts.items(), key=lambda x: (-x[1], x[0]))

        # Take the top ten ASINs from sorted_items
        top_ten_asins = [item[0] for item in sorted_items[:10]]

        # Add user recommendations to the hybrid recommendations
        hybrid_recs[user_id] = top_ten_asins

    # Cross-validate the hybrid recommender system
    dummy_algo = DummyAlgorithm()
    cross_val_results = cross_validate(dummy_algo, data, measures=['mae', 'rmse'], cv=5, verbose=True)
    avg_mae = np.mean(cross_val_results['test_mae'])
    avg_rmse = np.mean(cross_val_results['test_rmse'])

    # Calculate MAE and RMSE on the test data
    actual_ratings = []
    predicted_ratings = []
    for user_id, items in hybrid_recs.items():
        for item in items:
            actual_rating = [rating for (uid, iid, rating, _) in data.raw_ratings if uid == user_id and iid == item]
            if actual_rating:  # Check if actual rating exists
                actual_ratings.append(actual_rating[0])
                predicted_ratings.append(5)  # Assuming all predicted ratings are 5 (can be replaced with actual predictions)

    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))

    # Print MAE and RMSE
    print("MAE:", mae)
    print("RMSE:", rmse)

    return hybrid_recs, avg_mae, avg_rmse, mae, rmse

# Call the hybrid recommender system function
prediction_hybrid, avg_mae, avg_rmse, mae, rmse = hybrid_recommender_system(prediction_user_based_KNNWithMeans, pred_content_based_recommender_system, data)


Evaluating MAE, RMSE of algorithm DummyAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.6498  0.6231  0.6362  0.6226  0.6094  0.6282  0.0137  
RMSE (testset)    1.1467  1.1137  1.1173  1.1089  1.0936  1.1161  0.0173  
Fit time          0.00    0.00    0.01    0.01    0.01    0.00    0.00    
Test time         0.05    0.06    0.06    0.06    0.05    0.06    0.00    
MAE: 0.6052631578947368
RMSE: 1.038723913473187


# Interface

In [36]:
# format the hybrid prediction to match the other predictions

from collections import namedtuple

# Define the Prediction namedtuple
Prediction = namedtuple('Prediction', ['uid', 'iid', 'r_ui', 'est', 'details'])

# Initialize an empty list to store Prediction objects
prediction_hybrid_formatted = []

# Iterate over each user ID and their corresponding list of item IDs
for user_id, item_ids in prediction_hybrid.items():
    for item_id in item_ids:
        # Create a Prediction object with empty values for r_ui, est, and details
        prediction = Prediction(uid=user_id, iid=item_id, r_ui=None, est=None, details=None)
        # Append the Prediction object to the list
        prediction_hybrid_formatted.append(prediction)


In [37]:
rec_sys_dict = {'Item-based KNNWithMeans': prediction_item_based_KNNWithMeans, 
                'Item-based KNNBasic': prediction_item_based_KNNBasic,
                'User-based KNNWithMeans': prediction_user_based_KNNWithMeans,
                'User-based KNNBasic': prediction_user_based_KNNBasic,
                'Singular Value Decomposition (SVD)': prediction_SVD,
                'Hybrid Approach' : prediction_hybrid_formatted}

In [38]:
import tkinter as tk
from tkinter import ttk

def create_recommendation_interface(rec_sys_dict):
    # Function to create GUI for book recommendation system
    global notebook, text_results, comboboxes, data_preprocessed, current_userID

    current_userID = 'Enter User ID'
    
    # Create the root window
    root = tk.Tk()
    root.title("Book Recommendation System")
    root.tk_setPalette(background='#f0f0f0', foreground='#2e3440', activeBackground='#ff8c00', activeForeground='#2e3440')

    # Create a notebook (tabs) to switch between recommender systems
    notebook = ttk.Notebook(root)
    notebook.pack(fill='both', expand=True)

    # Event handler for tab selection
    notebook.bind("<<NotebookTabChanged>>", on_tab_selected)
    
    # Dictionary to store comboboxes for each tab
    comboboxes = {}
    text_results = {}

    for system_name, system_pred in rec_sys_dict.items():
        # Create a frame for the current tab
        frame = tk.Frame(notebook)
        frame.configure(background='#e0e0e0')  # Set a slightly darker grey for the frame
        notebook.add(frame, text=system_name)
        
        # Create a label and combobox for entering the user ID
        label_user_id = tk.Label(frame, text="Enter User ID:", font=("Helvetica", 12), background='#e0e0e0', foreground='#ff8c00')
        label_user_id.pack(pady=5)
        
        combobox_user_id = ttk.Combobox(frame, font=("Helvetica", 12))
        combobox_user_id.pack(pady=5)
        
        # Store the combobox in the dictionary
        comboboxes[system_name] = combobox_user_id

        # Get the unique user IDs for the current tab's system
        user_ids = set([pred.uid for pred in system_pred])
        
        # Update the combobox with the user IDs
        combobox_user_id['values'] = sorted(user_ids)

        # Set the initial value of combobox to current_userID
        combobox_user_id.set(current_userID)

        # Create a search button
        button_search = tk.Button(frame, text="Search", command=lambda system_name=system_name: search_books(system_name), font=("Helvetica", 12), bg="#ff8c00", fg="#2e3440", activebackground="#ffa31a", activeforeground="#2e3440")
        button_search.pack(pady=5)

        # Create a text widget to display results
        text_results[system_name] = tk.Text(frame, height=15, width=130 , font=("Helvetica", 12), bg="#f0f0f0", fg="#2e3440", selectbackground="#ff8c00", selectforeground="#2e3440")
        text_results[system_name].pack(pady=10, padx=10)
        
        # Insert default message
        text_results[system_name].insert(tk.END, "Please select a user ID and click 'Search' to display results.\n")

    # Run the main event loop
    root.mainloop()

def on_tab_selected(event):
    # Event handler for tab selection
    global comboboxes, notebook, rec_sys_dict, current_userID

    #print('changed')

    # Get the system name of the currently selected tab
    system_name = notebook.tab(notebook.select(), "text")
    
    # Get the combobox for the current tab's system
    combobox_user_id = comboboxes[system_name]

    # Set combobox to last used userID
    combobox_user_id.set(current_userID)

    # Call search_books directly after setting the combobox
    search_books(system_name)

def search_books(system_name):

    # Function to search books for a given user ID
    global text_results, comboboxes, current_userID  # Declare global variables
    
    # Get the user ID entered by the user
    user_id = comboboxes[system_name].get()
    
    #print('Search')
    #print(user_id)
    
    # Check if the user ID is valid
    if user_id.strip() == '':
        #print('Error: Please enter a valid User ID.')
        messagebox.showerror("Error", "Please enter a valid User ID.")
        return
    
    # Clear the current contents of the text widget
    text_results[system_name].delete('1.0', tk.END)
    
    #print(system_name)
    
    # Get the predictions for the current tab's system
    selected_pred = rec_sys_dict[system_name]

    if (system_name != 'Hybrid Approach'):

        # Initialize an empty list to store predictions for the target UID
        predictions_for_uid = []
        
        # Iterate through each prediction in the selected prediction
        for prediction in selected_pred:
            # Check if the UID of the prediction matches the target UID
            if prediction.uid == user_id:
                # If it matches, append the prediction to the list
                predictions_for_uid.append(prediction)
    
        # Sort the predictions_for_uid list based on the estimated rating (est)
        predictions_for_uid_sorted = sorted(predictions_for_uid, key=lambda x: x.est, reverse=True)
    
        # Get the top ten predictions
        top_ten_predictions = predictions_for_uid_sorted[:10]
        #print(top_ten_predictions)
    
        # Insert the top ten predictions into the text widget
        text_results[system_name].insert(tk.END, f"Top Ten Predictions for User ID: {user_id}\n\n")
        for i, prediction in enumerate(top_ten_predictions, 1):
            # Get the title corresponding to the item ID (ASIN)
            title = data_preprocessed[data_preprocessed['asin'] == prediction.iid]['title'].values[0]
            text_results[system_name].insert(tk.END, f"{i}. Title: {title}\n")#, Item ID: {prediction.iid}, Estimated Rating: {prediction.est}\n")        
    else:
        #print(user_id)

        # Iterate over each Prediction object in selected_pred and get top ten predictions
        for prediction in selected_pred:            
            # Get the top ten predictions for the current user ID
            top_ten_predictions = [p for p in selected_pred if p.uid == user_id][:10]

        # Insert the top ten predictions into the text widget
        text_results[system_name].insert(tk.END, f"Top Ten Predictions for User ID: {user_id}\n\n")
        # Iterate over the top ten predictions for the current user ID
        for i, prediction in enumerate(top_ten_predictions, 1):
            # Get the title corresponding to the item ID (iid)
            title = data_preprocessed[data_preprocessed['asin'] == prediction.iid]['title'].values[0]
            text_results[system_name].insert(tk.END, f"{i}. Title: {title}\n")#, Item ID: {prediction.iid}, Estimated Rating: {prediction.est}\n")        
        
    #save current user for changing tabs
    current_userID = comboboxes[system_name].get()

# Example usage
create_recommendation_interface(rec_sys_dict)
