In [1]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import SVD
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
path = os.path.expanduser('~/Documents/Studium/Master/Web Mining/Project/data_kindle_preprocessed.xlsx')
data_preprocessed = pd.read_excel(path, index_col=[0])

In [33]:
data = data_preprocessed.copy()

# Replace non-finite values with NaN
data['publication_year'] = data['publication_year'].replace([np.inf, -np.inf], np.nan)

# Convert NaN to a placeholder value (e.g., -1)
data['publication_year'].fillna(-1, inplace=True)

# Convert the column to integers
data['publication_year'] = data['publication_year'].astype(int).astype(str)

data["book_info"] =  data['category_string'] + '  ' + data['brand'] + '  ' + data['paid_free']+ ' ' + data['print_length_category'] + ' ' + data['publication_year'] + '  ' + data['language'] 
data.drop(['rating', 'brand', 'reviewerID', 'language','print_length_category', 'publication_year', 'category_string', 'paid_free'],axis=1,inplace=True)
data.drop_duplicates(subset=['asin', 'title'], inplace=True)
data.head()

Unnamed: 0,asin,title,book_info
0,B0015Z7VFQ,Look What Santa Brought (The Perfect Gift) - K...,"Kindle Store, Kindle eBooks, Literature & Fict..."
1,B0017HNV1U,Babylonian Laws- The Oldest Code of Laws in th...,"Kindle Store, Kindle eBooks, History King of ..."
2,B001892EI8,The Billionaire&s Baby (Harlequin Mini # 19) -...,"Kindle Store, Kindle eBooks, Romance Leanne B..."
4,B001892DGG,The Wallflower (Halle Puma Book 1) - Kindle ed...,"Kindle Store, Kindle eBooks, Romance Visit Am..."
5,B001BRD238,Secrets: a PsyCop Novel - Kindle edition,"Kindle Store, Kindle eBooks, Romance Visit Am..."


In [38]:
def tfidf_recommender_system(data):
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error, mean_squared_error
    import numpy as np
    
    # Vectorize the text data using TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(data['book_info'])
    
    # Split the data into train, validation, and test sets
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)
    
    # Debugging print
    print("Train Data Shape:", train_data.shape)
    print("Validation Data Shape:", val_data.shape)
    print("Test Data Shape:", test_data.shape)
    
    # Calculate cosine similarity matrices
    tfidf_matrix_train = tfidf_vectorizer.transform(train_data['book_info'])
    tfidf_matrix_val = tfidf_vectorizer.transform(val_data['book_info'])
    tfidf_matrix_test = tfidf_vectorizer.transform(test_data['book_info'])
    
    # Calculate cosine similarity matrices with correct dimensions
    cosine_sim_train = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
    cosine_sim_val = cosine_similarity(tfidf_matrix_val, tfidf_matrix_train)
    cosine_sim_test = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)
    
    # Debugging print
    print("Cosine similarity matrix size (Train):", cosine_sim_train.shape)
    print("Cosine similarity matrix size (Validation):", cosine_sim_val.shape)
    print("Cosine similarity matrix size (Test):", cosine_sim_test.shape)
    
    # Implement function to recommend books
    def recommend(title, cosine_sim_matrix, train_data):
        indices = train_data[train_data['title'] == title].index.tolist()
        if not indices:
            return []
        
        recommended_books = []
        for idx in indices:
            try:
                sim_scores = list(enumerate(cosine_sim_matrix[idx]))
                sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
                sim_scores = sim_scores[1:11]  # Top 10 similar items
                book_indices = [i[0] for i in sim_scores]
                recommended_books.extend(train_data['title'].iloc[book_indices].tolist())
            except IndexError:
                continue
        
        return recommended_books if recommended_books else []  # Return empty list if no recommendations found
    
    # Pre-calculate recommendations for all books where recommendations are available
    all_titles = train_data['title'].unique()
    pred_content_based_recommender_system = {title: recommend(title, cosine_sim_train, train_data)[:10] for title in all_titles}
    
    # Evaluate the recommender system
    def evaluate_recommender(test_data, cosine_sim_matrix, train_data):
        y_true = []
        y_pred = []
        for title in test_data['title']:
            true_books = set(test_data[test_data['title'] == title]['title'])
            recommended_books = set(pred_content_based_recommender_system.get(title, []))  # Get precalculated recommendations
            y_true.append(len(true_books))
            y_pred.append(len(recommended_books))
    
        print(y_true)
        print(y_pred)
    
        
        mae = mean_absolute_error(y_true, y_pred)
        print('mae')
        print(mae)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        print('rmse')
        print(rmse)
        
        mae = mean_absolute_error(y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        return mae, rmse
    
    # Evaluate the recommender system
    mae_val, rmse_val = evaluate_recommender(val_data, cosine_sim_val, train_data)
    mae_test, rmse_test = evaluate_recommender(test_data, cosine_sim_test, train_data)
    
    print("Validation MAE:", mae_val)
    print("Validation RMSE:", rmse_val)
    print("Test MAE:", mae_test)
    print("Test RMSE:", rmse_test)


    return pred_content_based_recommender_system


In [39]:
red_content_based_recommender_system = tfidf_recommender_system(data)

Train Data Shape: (8700, 3)
Validation Data Shape: (2900, 3)
Test Data Shape: (2901, 3)
Cosine similarity matrix size (Train): (8700, 8700)
Cosine similarity matrix size (Validation): (2900, 8700)
Cosine similarity matrix size (Test): (2901, 8700)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [36]:
red_content_based_recommender_system

{' Powerlifting Navigation Guide - Sure Way to Fast Strength Building (Raw and Natural Muscle Power Training Book 1) eBook': [],
 'Second Touch (Emma&s Arabian Nights, #2) - Kindle edition': ['Spark (Ruin Outlaws MC series Book 2) - Kindle edition',
  'Candy Man - Kindle edition',
  'Christmas with the Billionaire (Holiday Encounters Book 1) - Kindle edition',
  ' Make-Believe Wife (Destiny Bay Romances',
  'The Apprenticeship of Julian St. Albans (Consulting Magic Book 2) - Kindle edition',
  'Turkey in the Snow - Kindle edition',
  'Puppy, Car, and Snow - Kindle edition',
  'A Brand New Step: A Taboo Short (Sexy Household Secrets: Man of the House Book 2) - Kindle edition',
  'Bitter Taffy (Candy Man Book 2) - Kindle edition',
  'Getting Lucky with the Rock Star (Holiday Encounters Book 3) - Kindle edition'],
 'Biker Faith: The Lost Souls MC Series Book 2 - Kindle edition': ['Bayou Bounty: No matter the twists and turns, the trail always leads back to the heart - Kindle edition',
  '

In [37]:
# Create a dictionary mapping titles to ASINs
title_to_asin = data.set_index('title')['asin'].to_dict()

# Replace titles with ASINs in the original dictionary
asin_based_recommender_system = {}
for title, related_titles in red_content_based_recommender_system.items():
    asin = title_to_asin.get(title, None)
    if asin is not None:
        asin_based_recommender_system[asin] = [title_to_asin.get(t, None) for t in related_titles]

# Print the result
print(asin_based_recommender_system)


{'B00ITYTPYQ': [], 'B00HMTTAQM': ['B00KO11PTA', 'B00PNPOQ4S', 'B00QL14R88', 'B00C9RRTFQ', 'B00N2HQW1Y', 'B00AGGKQ16', 'B006OIRC5S', 'B00UDINSN0', 'B012EBM6XG', 'B00YSP8WMG'], 'B00KOZTEXA': ['B016TEMQ9S', 'B00G60VU2U', 'B00FPYHJ9Q', 'B00PO8EHSO', 'B00JU84UD6', 'B00LCAV4B6', 'B011SB1Z8A', 'B0167ZY4NK', 'B00LNM8L6E', 'B01G5SRJGS'], 'B00P37QLBW': [], 'B00HDFDVGK': [], 'B00XWFRCX8': [], 'B00KUHD3M0': ['B01FOSUOIA', 'B00KZKZTNI', 'B01BW83B72', 'B0184SY5IM', 'B00S6ROWI0', 'B014CGOUOY', 'B00RSRKCGA', 'B00ND67XBC', 'B00EW9HWJM', 'B01DILPIRW'], 'B01EM1UHO6': [], 'B00R04OXC0': [], 'B018FKLJV0': [], 'B00ASZC4JW': ['B019M34ALK', 'B01CHHHSL2', 'B019PJ5LI2', 'B00K1RB5JW', 'B01DL6EQC2', 'B00RC83ATG', 'B00GW6S0DU', 'B019MCXVS4', 'B00KSND3LW', 'B015TYIW42'], 'B00WZML6RC': [], 'B00951DMYQ': ['B00IMSGR04', 'B00G61DDDI', 'B00D71EQRW', 'B00L4RE3O2', 'B00XIMO26K', 'B013RZCL6U', 'B00RD4J9EE', 'B00BUT8MIW', 'B00U4CW5W0', 'B007HHNYK2'], 'B00IWDTRJC': ['B0058EU1Z2', 'B00D2XOTN6', 'B00BDD52I8', 'B00GW9DLFE', 'B00

## New try

In [31]:
data = data_preprocessed.copy()

# Replace non-finite values with NaN
data['publication_year'] = data['publication_year'].replace([np.inf, -np.inf], np.nan)

# Convert NaN to a placeholder value (e.g., -1)
data['publication_year'].fillna(-1, inplace=True)

# Convert the column to integers
data['publication_year'] = data['publication_year'].astype(int).astype(str)

data["book_info"] =  data['title'] + '  ' + data['category_string'] + '  ' + data['brand'] + '  ' + data['paid_free']+ ' ' + data['print_length_category'] + ' ' + data['publication_year'] + '  ' + data['language'] 
data.drop(['title', 'rating', 'brand', 'reviewerID', 'language','print_length_category', 'publication_year', 'category_string', 'paid_free'],axis=1,inplace=True)
data.drop_duplicates(subset=['asin'], inplace=True)
data.head()

Unnamed: 0,asin,book_info
0,B0015Z7VFQ,Look What Santa Brought (The Perfect Gift) - K...
1,B0017HNV1U,Babylonian Laws- The Oldest Code of Laws in th...
2,B001892EI8,The Billionaire&s Baby (Harlequin Mini # 19) -...
4,B001892DGG,The Wallflower (Halle Puma Book 1) - Kindle ed...
5,B001BRD238,Secrets: a PsyCop Novel - Kindle edition Kind...


In [32]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Vectorize the text data using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['book_info'])

# Split the data into train, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)

# Debugging print
print("Train Data Shape:", train_data.shape)
print("Validation Data Shape:", val_data.shape)
print("Test Data Shape:", test_data.shape)

# Calculate cosine similarity matrices
tfidf_matrix_train = tfidf_vectorizer.transform(train_data['book_info'])
tfidf_matrix_val = tfidf_vectorizer.transform(val_data['book_info'])
tfidf_matrix_test = tfidf_vectorizer.transform(test_data['book_info'])

# Calculate cosine similarity matrices with correct dimensions
cosine_sim_train = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
cosine_sim_val = cosine_similarity(tfidf_matrix_val, tfidf_matrix_train)
cosine_sim_test = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)

# Debugging print
print("Cosine similarity matrix size (Train):", cosine_sim_train.shape)
print("Cosine similarity matrix size (Validation):", cosine_sim_val.shape)
print("Cosine similarity matrix size (Test):", cosine_sim_test.shape)

# Implement function to recommend books
def recommend(asin, cosine_sim_matrix, train_data):
    indices = train_data[train_data['asin'] == asin	].index.tolist()
    if not indices:
        return []
    
    recommended_books = []
    for idx in indices:
        try:
            sim_scores = list(enumerate(cosine_sim_matrix[idx]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            sim_scores = sim_scores[1:11]  # Top 10 similar items
            book_indices = [i[0] for i in sim_scores]
            recommended_books.extend(train_data['asin'].iloc[book_indices].tolist())
        except IndexError:
            continue
    
    return recommended_books if recommended_books else []  # Return empty list if no recommendations found

# Pre-calculate recommendations for all books where recommendations are available
all_asins = train_data['asin'].unique()
pred_content_based_recommender_system = {asin: recommend(asin, cosine_sim_train, train_data)[:10] for asin in all_asins}

def evaluate_recommender(test_data, cosine_sim_matrix, train_data):
    y_true = []
    y_pred = []
    for idx, asin in enumerate(test_data['asin']):
        true_asin = asin
        true_vector = cosine_sim_matrix[idx]
        recommended_asins = pred_content_based_recommender_system.get(asin, [])
        recommended_vectors = cosine_sim_matrix[train_data.index.isin(recommended_asins)]
        
        # Calculate similarity scores between true item and recommended items
        true_similarities = np.dot(recommended_vectors, true_vector)
        
        # RMSE and MAE calculations
        y_true.extend([1] * len(recommended_asins))  # Assuming all recommended items are relevant
        y_pred.extend(true_similarities.tolist())

    # Calculate RMSE and MAE
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print('RMSE:', rmse)
    mae = mean_absolute_error(y_true, y_pred)
    print('MAE:', mae)

    return rmse, mae


# Evaluate the recommender system
mae_val, rmse_val = evaluate_recommender(val_data, cosine_sim_val, train_data)
mae_test, rmse_test = evaluate_recommender(test_data, cosine_sim_test, train_data)

print("Validation MAE:", mae_val)
print("Validation RMSE:", rmse_val)
print("Test MAE:", mae_test)
print("Test RMSE:", rmse_test)


Train Data Shape: (8700, 2)
Validation Data Shape: (2900, 2)
Test Data Shape: (2901, 2)
Cosine similarity matrix size (Train): (8700, 8700)
Cosine similarity matrix size (Validation): (2900, 8700)
Cosine similarity matrix size (Test): (2901, 8700)


IndexError: boolean index did not match indexed array along dimension 0; dimension is 2900 but corresponding boolean dimension is 8700

## For testing without train/validate/test

In [40]:
# vectorizing the book info column using TFidf Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


tf = TfidfVectorizer(analyzer="word", ngram_range=(1,2), min_df=1, stop_words='english')

tfidf_matrix = tf.fit_transform(data['book_info'])

cosine_sim =  cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

indices = pd.Series(data['title'])
indices[:5]

def recommend(title, cosine_sim = cosine_sim):
    if title not in indices.values:
        return "Title not found in the database."
    recommended_books = []
    idx = indices[indices == title].index[0]   # to get the index of book name matching the input book_name
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)   # similarity scores in descending order
    top_10_indices = list(score_series.iloc[1:11].index)   # to get the indices of top 10 most similar books
    # [1:11] to exclude 0 (index 0 is the input book itself)
    
    for i in top_10_indices:   # to append the titles of top 10 similar booksto the recommended_books list
        recommended_books.append(list(data['title'])[i])
        
    return recommended_books


#to output the recommendations.
recommend('Reagan&s Revenge and Ending Emily&s Engagement (The Reed Brothers Series) - Kindle edition')

[[1.         0.0190001  0.02312506 ... 0.04199093 0.0742936  0.03085186]
 [0.0190001  1.         0.26348392 ... 0.02212988 0.02601486 0.01829713]
 [0.02312506 0.26348392 1.         ... 0.02693433 0.03166275 0.02226948]
 ...
 [0.04199093 0.02212988 0.02693433 ... 1.         0.1441296  0.09686777]
 [0.0742936  0.02601486 0.03166275 ... 0.1441296  1.         0.11387327]
 [0.03085186 0.01829713 0.02226948 ... 0.09686777 0.11387327 1.        ]]


['Big Rock (Big Rock Book 1) - Kindle edition',
 'First Night: (Seductive Nights: Julia and Clay Book #0.5) - Kindle edition',
 'Every Second With You (No Regrets Book 2) - Kindle edition',
 'Caught Up in Her (Caught Up in Love) - Kindle edition',
 'The Start of Us (No Regrets) - Kindle edition',
 'Sinful Nights Bundle: Books 1-3 - Kindle edition',
 'Tempted By A Rogue - Kindle edition',
 'Blurred Lines (Love Unexpectedly) - Kindle edition',
 'Surviving For Us - Kindle edition',
 'ARENA (An Artemus Newton Thriller Book 2) - Kindle edition']