In [1]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise import KNNBasic
from surprise import SVD
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
path = os.path.expanduser('~/Documents/Studium/Master/Web Mining/Project/data_kindle_preprocessed.xlsx')
data_preprocessed = pd.read_excel(path, index_col=[0])

In [27]:
data = data_preprocessed.copy()

In [28]:
# Replace non-finite values with NaN
data['publication_year'] = data['publication_year'].replace([np.inf, -np.inf], np.nan)

# Convert NaN to a placeholder value (e.g., -1)
data['publication_year'].fillna(-1, inplace=True)

# Convert the column to integers
data['publication_year'] = data['publication_year'].astype(int).astype(str)

data["book_info"] =  data['category_string'] + '  ' + data['paid_free']+ ' ' + data['print_length_category'] + ' ' + data['publication_year'] + '  ' + data['language'] 
data.drop(['language','print_length_category', 'publication_year', 'category_string', 'paid_free'],axis=1,inplace=True)

data.head()

Unnamed: 0,rating,reviewerID,asin,title,book_info
0,1.0,A3OC8ZG1S3OAVA,B0015Z7VFQ,Look What Santa Brought (The Perfect Gift) - K...,"Kindle Store, Kindle eBooks, Literature & Fict..."
1,4.0,A2U8YWPP1PYHJM,B0017HNV1U,Babylonian Laws- The Oldest Code of Laws in th...,"Kindle Store, Kindle eBooks, History Paid sma..."
2,3.0,A3361XGKYF17S3,B001892EI8,The Billionaire&s Baby (Harlequin Mini # 19) -...,"Kindle Store, Kindle eBooks, Romance Paid sma..."
3,4.0,AVGYENZU56KBR,B001892EI8,The Billionaire&s Baby (Harlequin Mini # 19) -...,"Kindle Store, Kindle eBooks, Romance Paid sma..."
4,3.0,A3361XGKYF17S3,B001892DGG,The Wallflower (Halle Puma Book 1) - Kindle ed...,"Kindle Store, Kindle eBooks, Romance Paid sma..."


In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
from scipy.sparse import hstack

# Convert numerical ratings to text labels
def convert_rating_to_text(rating):
    if rating < 3:
        return 'low_rating'
    elif rating < 5:
        return 'medium_rating'
    else:
        return 'high_rating'

# Create a new column for textual representation of ratings
data['rating_text'] = data['rating'].apply(convert_rating_to_text)

# Combine user's interactions and book information into a single feature for content-based recommendation
data['user_book_info'] = data['rating_text'] + ' ' + data['book_info']

# Vectorize the text data using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['user_book_info'])

# Split the data into train, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)

# Debugging print
print("Train Data Shape:", train_data.shape)
print("Validation Data Shape:", val_data.shape)
print("Test Data Shape:", test_data.shape)

# Calculate cosine similarity matrices
tfidf_matrix_train = tfidf_vectorizer.transform(train_data['user_book_info'])
tfidf_matrix_val = tfidf_vectorizer.transform(val_data['user_book_info'])
tfidf_matrix_test = tfidf_vectorizer.transform(test_data['user_book_info'])

# Calculate cosine similarity matrices with correct dimensions
cosine_sim_train = cosine_similarity(tfidf_matrix_train, tfidf_matrix_train)
cosine_sim_val = cosine_similarity(tfidf_matrix_val, tfidf_matrix_train)
cosine_sim_test = cosine_similarity(tfidf_matrix_test, tfidf_matrix_train)

# Debugging print
print("Cosine similarity matrix size (Train):", cosine_sim_train.shape)
print("Cosine similarity matrix size (Validation):", cosine_sim_val.shape)
print("Cosine similarity matrix size (Test):", cosine_sim_test.shape)

# Implement function to recommend books based on user interactions
def recommend_books_for_user(reviewer_id, cosine_sim_matrix, train_data, top_n=10):
    user_data = train_data[train_data['reviewerID'] == reviewer_id]
    if user_data.empty:
        return []
    
    recommended_books = []
    for _, row in user_data.iterrows():
        idx = row.name
        try:
            sim_scores = list(enumerate(cosine_sim_matrix[idx]))
            sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
            sim_scores = sim_scores[1:top_n+1]  # Top N similar items
            book_indices = [i[0] for i in sim_scores]
            recommended_books.extend(train_data['title'].iloc[book_indices].tolist())
        except IndexError:
            continue
    
    return recommended_books[:top_n]  # Return only the top N recommendations

# Evaluate the recommender system
def evaluate_recommender(test_data, cosine_sim_matrix, train_data):
    y_true = []
    y_pred = []
    for reviewer_id in test_data['reviewerID'].unique():
        true_books = set(test_data[test_data['reviewerID'] == reviewer_id]['title'])
        recommended_books = set(pred_content_based_recommender_system.get(reviewer_id, []))  # Get precalculated recommendations
        y_true.append(len(true_books))
        y_pred.append(len(recommended_books))

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return mae, rmse

# Evaluate the recommender system
mae_val, rmse_val = evaluate_recommender(val_data, cosine_sim_val, train_data)
mae_test, rmse_test = evaluate_recommender(test_data, cosine_sim_test, train_data)

print("Validation MAE:", mae_val)
print("Validation RMSE:", rmse_val)
print("Test MAE:", mae_test)
print("Test RMSE:", rmse_test)

Train Data Shape: (11742, 8)
Validation Data Shape: (3914, 8)
Test Data Shape: (3915, 8)
Cosine similarity matrix size (Train): (11742, 11742)
Cosine similarity matrix size (Validation): (3914, 11742)
Cosine similarity matrix size (Test): (3915, 11742)
Validation MAE: 22.305785123966942
Validation RMSE: 22.983465130860655
Test MAE: 22.322314049586776
Test RMSE: 23.01005888556905


In [32]:
reviewer_id = 'A11P853U6FIKAM'
recommended_books = recommend_books_for_user(reviewer_id, cosine_sim_train, train_data)

print("Recommended books for reviewer", reviewer_id, ":")
for book in recommended_books:
    print(book)


Recommended books for reviewer A11P853U6FIKAM :
Survival Pantry: Proven Tips for Storing Food &amp; Surviving In Dire Situations - Kindle edition
Gun Control The 2nd Amendment Ghost Dance - Kindle edition
Radioactive: A Dirty Bomb Prepper Survival Story - Kindle edition
Economic Jihad: Putting the Kibosh on Antiquated Social Axioms Defining Us - Kindle edition
Dating:The Way Of The Alpha Male 2nd Edition - Real Talk on How to be King of the Jungle (dating, How to attract woman, attract woman, law of attraction, confidence ... men, how to be a success, weight training) - Kindle edition
What He REALLY Means When He Says... - The Ultimate Guide to Understanding Men, Knowing What They REALLY Think and How to Read Their Minds in Every Situation - Kindle edition
Superheroes: The Best of Philosophy and Pop Culture (The Blackwell Philosophy and Pop Culture Series) - Kindle edition
President Barack Obama: The Kindle Singles Interview (Kindle Single) - Kindle edition
Hawaiian Folk Tales A Collec