In [14]:
# The list of packages and modules used
import gzip
import json
import re
import os
import sys
import nltk
import numpy as np
import pandas as pd
import random
import tensorflow as tf
from surprise import NormalPredictor
from surprise.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the list of English words as provided by nltk
nltk.download("words")

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\hashi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [15]:
# Code to load the data, which came from the dataset's Github page
def load_data(file_name, head = 500):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            if (head is not None) and (count > head):
                break
    return data

# Set the path for where the zipped files are stored
path = "Datasets"

# Load the poetry data, as outlined in the Github
# poetry_data = load_data(path + "\goodreads_books_poetry.json.gz")
# reviews_data = load_data(path + "\goodreads_reviews_poetry.json.gz")
# interactions_data = load_data(path + "\goodreads_interactions_poetry.json.gz")

# To print sample records
# print(" == sample record (poetry book) ==")
# print(np.random.choice(poetry_data))
# print(" == sample record (reviews) ==")
# print(np.random.choice(reviews_data))
# print(" == sample record (interactions) ==")
# print(np.random.choice(interactions_data))
# print(poetry.sample(n=1))
# print(reviews.sample(n=1))
# print(interactions.sample(n=1))

# Define the maximum amount of data to load, used when programming
maximum = 4000

# Load the poetry data with pandas
poetry_original = pd.read_json(path + "\goodreads_books_poetry.json.gz", lines=True, nrows=maximum)
#poetry_headers = list(poetry.columns.values)
poetry = poetry_original[["country_code", "language_code", "popular_shelves", "average_rating", "description", "authors", "image_url", "book_id", "ratings_count", "title"]]
poetry_old = poetry_original[["country_code", "language_code", "popular_shelves", "average_rating", "description", "authors", "image_url", "book_id", "ratings_count", "title"]]

# Load the review data with pandas
reviews_original = pd.read_json(path + "\goodreads_reviews_poetry.json.gz", lines=True, nrows=maximum)
#reviews_headers = list(reviews_original.columns.values)
reviews = reviews_original[["user_id", "book_id", "review_id", "rating", "review_text", "date_added"]]

# Load the interactions data with pandas (larger file)
# interactions_original = pd.read_json(path + "\goodreads_interactions_poetry.json.gz", lines=True, chunksize=100000)
# pd.DataFrame.from_dict(interactions_data, chunksize=100)
interactions_concat = pd.read_json(path + "\goodreads_interactions_poetry.json.gz", lines=True, nrows=maximum)
# Store the first 100000 interactions
# interactions_concat = pd.concat(interactions_original)
# Add all the interactions to an array
#interactions_concat = pd.concat(interactions_original, ignore_index=True)
interactions_true = interactions_concat.loc[interactions_concat["is_read"] == True]
interactions = interactions_true[["user_id", "book_id", "review_id", "rating", "date_added"]]

In [16]:
# User table
# Merge to create a user-interaction table, ensuirng we keep all the matched and unmatched rows, but no repeats of data
users = pd.merge(interactions, reviews, how="outer")

# Create the contextual data for the users table, including the month and the decade of reviews
# users["day"] = users["date_added"].apply(lambda x: 1 if 6<int(x.strftime("%H"))<20 else 0)
# users = users.drop("date_added", axis=1)

In [None]:
# Create the books-features vector space model

# To create a list for the unique features within a column
def unique_features(feature, column_name):
    # Go through the column features for each item in the table
    for i in poetry[column_name]:
        # The feature must be unique and must not be empty
        if i not in feature and i != "":
            # Adds the features to a list
            feature.append(i)

# To create the products-feature matrices
def feature_matrix(feature, column_name):
# Creates a column for each feature
    for item in feature:
        # Add a one to each row where the book shares the feature, otherwise add a zero
        poetry[item] = poetry[column_name].apply(lambda x: 1 if item in x else 0)

# Create the genre/"shelves" part of the book-feature matrix
# Single words within the English language
words = set(nltk.corpus.words.words())
# Irrelevant and obvious tags
not_allowed = ["poem", "poet", "to-read"]
tags = []
# Keep track of the index to later update values
index = -1
# Search through the popular shelf tags for each poem
for i in poetry["popular_shelves"]:
    num = 0
    index += 1
    for j in i:
        # Keep the tags that are not within the irrelevant words list
        if not any(unncessary in j["name"] for unncessary in not_allowed):
            num += 1
            # Ensure only the most popular tag is kept if it is not already in the list
            if j["name"] in words and num < 3 and j["name"] not in tags:
                # Add the words to the list
                tags.append(j["name"])
            # Update the book-features table cell when there is an instance of a tag
            if num < 3:
                poetry.at[index, j["name"]] = 1

# Creates the genre-feature matrix
#index = 0
#for i in poetry["popular_shelves"]:
#    name = []
#    for j in i:
#        name.append(j["name"])
#    poetry.iloc[index]["popular_shelves"] = name
#    index += 1
# feature_matrix(tags, "popular_shelves")
#Find the column with this name and change it to 1 in the right place

# The unique country of origin array
# country = []
# Creates the country of origin matrix
# unique_features(country, "country_code")

# The unique book language array
# language = []
# Create the book language matrix
# unique_features(language, "language_code")

# The unique authors array
authors = []
# Keep track of the index to update values later
index = -1
# Iterates through the list of authors
for i in poetry["authors"]:
    for j in i:
        # Creates the unique authors array
        if j["author_id"] not in authors:
            authors.append("author_" + str(j["author_id"]))
        # Update the book-features table cell when there is an instance of an author
        poetry.at[index, j["author_id"]] = 1

# Creates the authors of the book matrix
# feature_matrix(authors, "authors")

# print(poetry.iloc[:, 10:])
# np.count_nonzero(poetry)

# Create the keyword analysis matrix, from the poetry description

In [None]:
# Create the user-book matrix by combining the user and books tables

# Ensure ratings are placed for the correct user and the book they have rated
tmp = users.copy()
user_poetry = tmp.pivot_table(index="user_id", columns="book_id", values="rating")

# Look up the book_id and replace it with the book name
#user_poetry.columns = user_poetry.columns.map(poetry.set_index("book_id")["title"])
#user_poetry = user_poetry.loc[:, user_poetry.columns.notna()]
# Store the book names in an array
#print(len(user_poetry.columns.values))

# Ensure the interactions contain books within the books list
# for i in users.columns.values:
#    if i not in poetry["authors"]:
        # Remove all the books that are not within the book list
#        users.drop(columns=[i])

# Place NaN values where there is an intersection but no rating
missing = list(set(user_poetry.index) - set(user_poetry.columns))
for value in missing:
    user_poetry[value] = np.nan

# Perform normalisation as the weighting scheme
min_max_scaler = preprocessing.MinMaxScaler()
# Scale the data between zero and one
user_poetry[:] = min_max_scaler.fit_transform(user_poetry.values)
user_poetry = pd.DataFrame(user_poetry[:])

In [19]:
# Start of the interface - find or create the new user for the system

def login():
    # Allow the user to enter their username
    username = input("Enter your username:")
    # Determine whether the user is already in the system
    if str(username) in str(user_poetry.index):
        # If the user is in the system, set the user index as their username
        user_index = int(username)
        return user_index
    else:
        # If the user is not in the system, add them to the user-books matrix
        row = len(user_poetry)
        # user_poetry.loc[row] = np.NaN
        # Store the new user index
        user_index = 0
        # user_poetry.append(pd.Series(name=user_index))
        # Tell the user their username, as they are a new user
        print("You're a new user, your username is: " + str(user_poetry.index[0]))
        return user_index

# print(len(user_poetry.columns.values))
# print(len(poetry_old.index))
# print(poetry.iloc[:, 10:])
# np.count_nonzero(poetry.iloc[:, 10:])

In [20]:
# For the content-based recommender system
poetry = poetry.drop_duplicates(subset=["title"], keep="first")
feature_table = poetry.iloc[:, 10:].fillna(0)

# Apply cosine similariity to the new poetry table as we are seeing how similar/different the features are for each film
# Use the cosine similarity metric to match how close a rated item (from the user profile vector) is to an unrated item
cos_sim = pd.DataFrame(cosine_similarity(feature_table))
# print(cos_sim)
# np.count_nonzero(cos_sim)

# Input the name --> look up the id --> find the index of the id

# The function for the content recommender system
def content_recommender(search, user_index):
    # When the user input is found to be a poem in the system
    if str(search) in poetry_old["title"].values:
        # Lookup the index for the input
        search_index = poetry.index[poetry["title"] == search].tolist()
        print(search + ": " + str(poetry_old.iloc[search_index]["description"]))

        rating = input("Would you like to rate this item? Please type 'y' or 'n'")
        if rating == "y":
            rating_continued = input("What is your rating out of 5?")
            # Update the user rating in the table
            # for i in user_index:
            #    user_poetry.at[i, search_index] = float(rating_continued/5)

        # Find the similar books to the input
        similar = list(enumerate(cos_sim[search_index]))
        # Sort them based on the most similar ones first
        sorted_similar = sorted(similar, key=lambda x:x[1], reverse=True)
        # Notify the user if no simialr books were found
        if similar == []:
            print("No similar books were found.")
        if similar != []:
            # Print the first few (up to five) similar books based on the search
            for i in sorted_similar[:4]:
                # print(sorted_similar[:4], i[1])
                #output_index = poetry.index[poetry["book_id"] == i[1]].tolist()
                #print(output_index)
                print("Similar books include: ")
                print(poetry_old.iloc[i[1]]["title"])
    # Output when the search does not find a result
    else:
        print("The book is not in the system.")

In [21]:
# Content-based evalation (personalised vs non-personalised)

# Split the user-books table into a test set and a training set
# print(user_poetry)

# 80% of the data goes to the training set, 20% goes to the test set
train, test = train_test_split(user_poetry, test_size=0.2)

# print(train)

i = 1
content_train = train.iloc[i].to_frame(name="y")
content_test = test.iloc[i].to_frame(name="y")

# Keep the matching book ids in the two tables
# for i in train.columns.values:
#    if i not in user_poetry.columns.values:
#        train.drop(columns=[i])

# for i in user_poetry.columns.values:
#    if i not in train.columns.values:
#        user_poetry.drop(columns=[i])

# remove = list(set(user_poetry.columns.values).symmetric_difference(train.columns.values))
# print(remove)

# usr_ft = tf.matmul(usr, prd)
# weights = usr_ft / tf.reduce_sum(usr_ft, axis=1, keepdims=True)
# pred = tf.matmul(weights, prd.T)

# user_poetry = user_poetry.fillna(0)
# similarity_matrix = linear_kernel(user_poetry, user_poetry)
# print(similarity_matrix)

# mapping = pd.Series(poetry_old.index, index = poetry_old["title"])
# title = mapping[search]

# Use the k-nearest neighbour method to compare unrated items to all stored items and assign neighbour classes
# similarity = list(enumerate(user_poetry[]))

In [None]:
# Collaborative-based (model-based) - using other user information
# Transpose the matrix to perform a dot product
transposed = user_poetry.T.fillna(0)

# Decompose the matrix with SVD
SVD = TruncatedSVD(n_components=12, random_state=17)
# Fit the matrix to the model
matrix = SVD.fit_transform(transposed)

# param_grid = {'n_factors':[50,100,150],'n_epochs':[20,30],  'lr_all':[0.005,0.01],'reg_all':[0.02,0.1]}
# gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
# gs.fit(transposed)
# print(gs.best_score['rmse'])
# print(gs.best_score['mae'])
# params = gs.best_params['rmse']
# print(params)

# Create a correlation matrix between all the user book ratings
correlation = np.corrcoef(matrix)

# To know the mappings between the ids and the index values
book_ids = list(user_poetry.columns)

# The function for the collaborative recommender system
def collaborative_recommender(search_collab, user_index):
    # When the user input is found to be a poem in the system
    if str(search_collab) in poetry["title"].values:
        # Lookup the index for the input
        search_index = poetry.index[poetry["title"] == search_collab].tolist()

        rating = input("Would you like to rate this item? Please type 'y' or 'n'")
        if rating == "y":
            rating_continued = input("What is your rating out of 5?")
            # Update the user rating in the table
            # for i in user_index:
            #    user_poetry.at[i, search_index] = float(rating_continued/5)

        # Display the first five items (maximum), that are most similar to the input
        for i in search_index:
            similar = list(poetry_old.columns[(correlation[i]<1.0) & (correlation[i]>0.5)])
            print(search_collab + ": " + str(poetry_old.iloc[i]["description"]))
        # Notify the user if no similar books were found
        if similar == []:
            print("No similar books were found.")
        if similar != []:
            for i in similar[:4]:
                # We get ids instead, look-up the index in the book_ids array
                position = book_ids.index(i)
                print("Similar books include: ")
                print(poetry_old.iloc[position]["title"])
        # Output when the search does not find a result
    else:
        print("The book is not in the system or no similar books were found.")


In [24]:
# Collaborative-based evaluation (personalised vs non-personalised)

# Perform parameter tuning
# transposed = user_poetry.T

# Create a correlation matrix between all the books
# correlation = np.corrcoef(matrix)
# book_ids = list(user_poetry.columns)

In [25]:
# Non-personalised functions and calculations
# Store the most popular books on the system
popular_books = []
# Keep track of the item index
index = -1
for i in poetry_old["popular_shelves"]:
    index += 1
    for j in i:
        # Store all the books with a count over 100 and those marked as "to-read"
        if int(j["count"]) > 100 and str(j["name"]) == "to-read":
            popular_books.append(poetry_old.iloc[index]["title"])

def popular_books_recommendation():
    # Print a random five books from the most popular books list
    print("Popular books:")
    for i in range(5):
        print(random.choice(popular_books), end = "\n")
    print("\n")

In [30]:
# User interface
# Allow the user to choose between the non-personalised and personalised recommender systems
print("When using this system, contextual data and data based on rating you make will be collected. Please continue if you consent.")
start = input("This is a poetry collection hybrid recommender system, using content and model-based collaborative filtering. \nPlease type 'np' to use the non-personalised recommender system or 'p' to use the personalised recommender system.")

# Non-personalised recommender
if start == "np":
    refresh = input("Type in 'n' to see new recommendations, or 'q' to quit the system.")
    # Loop until the user quits
    while refresh != "q":
        # Provide the most popular books
        popular_books_recommendation()
        # Allow the user to see new recommendations or quit the system
        refresh = input("Type in 'n' to see new recommendations, or 'q' to quit the system.")
    
    # High average ratings
    
# Personalised recommender
if start == "p":
    # Allow the user to login to add ratings
    user_index = login()
    # Allow the user to search books, see new recommendations or quit the system
    action = input("Type in 's' to search a book title (to learn more about it, find similar books and rate it) or 'q' to quit the system.")
    iteration = 0
    # Loop until the user quits
    while action != "q":
        # Store the number of ratings the user has made
        number_of_ratings = user_poetry.iloc[user_index].sum()
        iteration += 1
        # Hybrid approach
        # Use switching and change recommender when there is enough interaction data
        # Primarly use content-based filtering to begin with, to avoid the cold-start problem and to deal with the spare data
        if (action == "s" and number_of_ratings < 20):
            # Use the content-based recommender when there are few user ratings
            content_recommender(input("Enter the name of a poetry collection to find similar titles."), user_index)
            action = input("Type in 's' to search a book title (to learn more about it, find similar books and rate it) or 'q' to quit the system.")
        if action == "s" and number_of_ratings > 20 or iteration == 2:        
            # Use the collaborative-based recommender when there are enough user ratings or when the system has been used long enough
            collaborative_recommender(input("Enter the name of a poetry collection to find similar titles."), user_index)
            action = input("Type in 's' to search a book title (to learn more about it, find similar books and rate it) or 'q' to quit the system.")

else:
    # If anything else is typed in, notify the user to restart the system
    print("\nPlease restart the system.")

When using this system, contextual data and data based on rating you make will be collected. Please continue if you consent.
You're a new user, your username is: 004d5e96c8a318aeb006af50f8cc949c
Into Temptation: 6    Into Temptation is the debut collection of poe...
Name: description, dtype: object
Similar books include: 
Into Temptation
