In [2]:
#import libraries
import numpy as np
import pprint
import scipy
import scipy.linalg
import pandas as pd
import random
from collections import OrderedDict
from scipy.linalg import svd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import json
from tqdm import tqdm
from collections import defaultdict
from surprise import Reader, Dataset, accuracy
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms import SVD, KNNBasic, KNNBaseline
from surprise.model_selection import train_test_split
np.set_printoptions(threshold=20)
pd.options.mode.copy_on_write = True

In [3]:
#FUNCTION TO CONVERT CSV TO DATAFRAME
#converts and reduces dataframe to usable data
def csv_to_df(string1, string2):
    """Load the ratings CSV and reduce it to the columns used downstream.

    Parameters
    ----------
    string1 : str or path-like
        Path to the ratings CSV. It must contain ``Title``, ``User_id`` and
        every column listed in ``unused_columns`` below.
    string2 : str or path-like
        Path to the book-metadata CSV. Kept only so existing callers keep
        working: nothing from that file contributes to either return value,
        so it is not read.

    Returns
    -------
    books_ratings_cleaned : pandas.DataFrame
        The ratings without the review-text/metadata columns and without
        ``Title``, restricted to rows whose ``User_id`` is not missing.
    book_dict : dict
        Maps the value of the first remaining column of every rating row to
        ``[value of the second remaining column, number of rows seen]``.
        Presumably that is ``book id -> [title, review count]``; this relies
        on the column order of the CSV -- TODO confirm against the data.
        Rows with a missing ``User_id`` are included in the count.
    """
    ratings_raw = pd.read_csv(string1)

    # Columns that are never used by the recommender.
    unused_columns = ["Price", "profileName", "review/time", "review/summary",
                      "review/helpfulness", "review/text"]
    ratings_with_title = ratings_raw.drop(columns=unused_columns)

    # Build the lookup while the title column is still present. Access is
    # positional (first column = key, second column = stored label).
    book_dict = dict()
    for row in ratings_with_title.itertuples(index=False, name=None):
        key = row[0]
        if key not in book_dict:
            book_dict[key] = [row[1], 0]
        book_dict[key][1] += 1

    # The title is only needed for the lookup above; drop it and discard
    # ratings that cannot be attributed to a user.
    books_ratings_cleaned = ratings_with_title.drop(columns=["Title"])
    has_user = books_ratings_cleaned['User_id'].notna()
    books_ratings_cleaned = books_ratings_cleaned[has_user]

    return books_ratings_cleaned, book_dict

In [4]:
#function to convert a nested dict of ratings into a zero-filled pandas DataFrame (one row per outer key)
def Convert_to_Matrix(matrix):
    """Turn a dict-like of columns into a dense, zero-filled DataFrame.

    ``matrix`` is handed to ``pandas.DataFrame.from_dict`` (outer keys become
    columns), the frame is transposed so the outer keys end up as rows, and
    every missing entry is replaced by 0.

    Returns the transposed, zero-filled ``pandas.DataFrame``.
    """
    # Outer keys -> columns, inner keys -> index.
    keyed_by_column = pd.DataFrame.from_dict(matrix)

    # Flip so that outer keys index the rows, then fill the gaps with zeroes.
    return keyed_by_column.T.fillna(0)

#function to perform LU factorization on numpy matrix
def LU_Factorization(matrix):
    """Compute the pivoted LU decomposition ``matrix = P @ L @ U``.

    The factors are multiplied back together and the result of comparing that
    product with ``matrix`` via ``numpy.allclose`` is printed as a sanity
    check.

    Returns the tuple ``(P, L, U)`` produced by ``scipy.linalg.lu``.
    """
    perm, lower, upper = scipy.linalg.lu(matrix)

    # Rebuild the input from its factors and report whether it matches.
    rebuilt = perm @ (lower @ upper)
    print(np.allclose(matrix, rebuilt))

    return perm, lower, upper
    

In [5]:
#COLLABORATIVE FILTERING COSINE SIMILARITY FUNCTION
#finding similar users by using cosine similarity algorithm
def find_similar(user_1, k, ratings=None):
    """Return the ``k`` rows of the ratings matrix most similar to ``user_1``.

    Similarity is the cosine of the angle between ``user_1`` and each row.

    Parameters
    ----------
    user_1 : pandas.Series
        Rating vector of the query user, one entry per column of the ratings
        matrix. Its ``name`` is used to recognise the user's own row.
    k : int
        Number of most-similar rows to return. If fewer rows are available,
        all of them are returned.
    ratings : pandas.DataFrame, optional
        Matrix with one row per user. Defaults to the notebook-level
        variable ``A`` so existing calls ``find_similar(user, k)`` still work.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``ratings`` in descending order of similarity,
        with an extra ``costheta`` column holding the similarity. The query
        user's own row (when present) is listed first with similarity 1.

    Notes
    -----
    * Every row other than the query user's own is compared, whichever
      position the query user occupies in the matrix.
    * Rows (or a query vector) with zero norm get similarity 0 instead of
      NaN, which keeps the ordering well defined.
    """
    if ratings is None:
        ratings = A  # notebook-level ratings matrix

    matrix = ratings.to_numpy(dtype=float)
    target = np.asarray(user_1, dtype=float)
    target_label = getattr(user_1, "name", None)

    # Cosine similarity of every row against the target in one shot.
    dots = matrix @ target
    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(target)
    similarities = np.divide(
        dots,
        denominators,
        out=np.zeros_like(dots),
        where=denominators > 0,
    )

    # The query user is trivially identical to itself: list it first with an
    # exact 1 (the stable sort keeps it ahead of ties) and skip its row below.
    cosinesimilarity = []
    if target_label in ratings.index:
        cosinesimilarity.append((target_label, 1))
    cosinesimilarity.extend(
        (label, float(score))
        for label, score in zip(ratings.index, similarities)
        if label != target_label
    )

    #sort the results
    cosinesimilarity.sort(key=lambda pair: pair[1], reverse=True)
    similar_users = cosinesimilarity[:k]

    #output results of sorted similar users
    for entry in similar_users:
        print(entry)

    #place similar users into a data frame
    top_users_df = ratings.loc[[label for label, _ in similar_users]].copy()
    top_users_df['costheta'] = [score for _, score in similar_users]

    return top_users_df

In [6]:
# Helper functions for selecting subsets of users that have more data, which reduces the sparsity
# of the ratings matrix when experimenting with the dataset
def get_most_frequent_users(count, source_frame):
    """Return the ids of the ``count`` users with the most rows.

    ``source_frame`` must have a ``User_id`` column. ``count`` has to be a
    positive integer strictly smaller than the number of distinct users;
    otherwise an ``AssertionError`` is raised.

    Returns a list of user ids ordered from most to least frequent.
    """
    assert isinstance(count, int), "Argument must be an integer"
    assert count > 0, "Argument must be a positive integer"

    # value_counts() is already sorted by descending frequency.
    ratings_per_user = source_frame['User_id'].value_counts()
    distinct_users = ratings_per_user.count()

    assert count < distinct_users, "Requested more data then we have...not tiny"

    busiest = ratings_per_user.iloc[:count]
    return busiest.index.tolist()

def get_users_with_minimal_ratings(rated, source_frame):
    """Return the ids of users that appear in more than ``rated`` rows.

    ``source_frame`` must have a ``User_id`` column and ``rated`` must be a
    positive integer (``AssertionError`` otherwise). The comparison is
    strict: a user with exactly ``rated`` rows is not included.

    Returns a list of user ids ordered from most to least frequent.
    """
    assert isinstance(rated, int), "Argument must be an integer"
    assert rated > 0, "Argument must be a positive integer"

    # Number of rows per user id, most frequent first.
    ratings_per_user = source_frame['User_id'].value_counts()
    above_threshold = ratings_per_user.gt(rated)
    return ratings_per_user.loc[above_threshold].index.tolist()

def get_training_data(ratio: float, source_frame, min_ratings: int = 20, random_state=None):
    """Split the ratings of well-covered users into training and test frames.

    Parameters
    ----------
    ratio : float
        Fraction of the retained ratings that goes into the training set;
        must satisfy ``0 < ratio < 1``.
    source_frame : pandas.DataFrame
        Ratings with a ``User_id`` column, in the three-column layout
        expected by ``pivot_rating_to_user_frame``.
    min_ratings : int, optional
        Only users with strictly more than this many ratings are kept.
        Defaults to 20.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the shuffle can be made
        reproducible. The default ``None`` gives a different shuffle on
        every call.

    Returns
    -------
    (training_data, test_data)
        Both are the result of ``pivot_rating_to_user_frame``. The split is
        made over individual ratings, so one user's ratings can land in both
        frames.

    Raises
    ------
    AssertionError
        If ``ratio`` is not a float strictly between 0 and 1.
    """
    assert isinstance(ratio, float), "Ratio must be a float"
    assert 0.0 < ratio < 1.0, "Ratio must be in (0, 1)"

    # Users with too few ratings are considered un-testable, so only users
    # with more than `min_ratings` ratings are kept.
    good_users = get_users_with_minimal_ratings(min_ratings, source_frame)

    # Randomize the order of ratings for users in our good users set.
    from_good_users = source_frame['User_id'].isin(good_users)
    good_users_ratings = source_frame[from_good_users].sample(frac=1, random_state=random_state)

    total_count = good_users_ratings.shape[0]
    training_data_count = int(total_count * ratio)

    # Positional split of the shuffled ratings.
    training_data = pivot_rating_to_user_frame(good_users_ratings.iloc[:training_data_count])
    test_data = pivot_rating_to_user_frame(good_users_ratings.iloc[training_data_count:])

    return training_data, test_data

def pivot_rating_to_user_frame(source_frame):
    """Pivot a long three-column ratings frame into a wide DataFrame.

    ``source_frame`` must have exactly three columns. They are read
    positionally as ``(user_id, book_id, score)``.
    NOTE(review): the unpacking is by position, not by column name -- confirm
    that callers pass the columns in this order.

    Returns a ``pandas.DataFrame`` built with ``DataFrame.from_dict`` from a
    ``{first column: {second column: third column}}`` mapping: one column per
    distinct first-column value, one row per distinct second-column value and
    NaN where no entry exists. If the same pair occurs more than once the
    last occurrence wins.
    """
    new_data = dict()
    # itertuples keeps each column's own dtype and is far faster than
    # iterrows, which builds a Series (with a common dtype) for every row.
    rows = source_frame.itertuples(index=False, name=None)
    for user_id, book_id, score in tqdm(rows, total=source_frame.shape[0]):
        new_data.setdefault(user_id, {})[book_id] = score

    return pd.DataFrame.from_dict(new_data)

In [7]:
#SVD algorithm on numpy matrix from DF
def do_svd(mat, k=0, option=False):
    """Singular value decomposition of ``mat`` truncated to ``k`` components.

    Parameters
    ----------
    mat : array-like, 2-D
        Matrix to decompose.
    k : int, optional
        Number of leading singular vectors to keep. The default ``0`` yields
        empty frames, so callers normally pass a positive value.
    option : bool, optional
        When true, only the singular values are returned.

    Returns
    -------
    numpy.ndarray
        The singular values in descending order, when ``option`` is true.
    (U, VT) : tuple of pandas.DataFrame
        Otherwise: the first ``k`` columns of U and the first ``k`` rows of
        VT.
    """
    if option:
        # Only the spectrum is wanted: skip computing the singular vectors.
        return svd(mat, compute_uv=False)

    # The reduced decomposition already holds the leading min(m, n) vectors,
    # so the full square factors (which can be enormous for a wide ratings
    # matrix) are only computed when the requested slice reaches beyond them.
    use_reduced = 0 <= k <= min(np.shape(mat))
    U, _, VT = svd(mat, full_matrices=not use_reduced)

    return pd.DataFrame(U[:, :k]), pd.DataFrame(VT[:k, :])

#plot SVD factorization
def plot_data(mat, data_type, camera=None):
    """Show a labelled 3-D scatter plot with one point per row of ``mat``.

    Parameters
    ----------
    mat : pandas.DataFrame
        Each row is one point; ``row[0]``, ``row[1]`` and ``row[2]`` are used
        as its x, y and z coordinates.
    data_type : str
        Prefix of the text label drawn next to every point; the label is
        ``"<data_type> <row index>"``.
    camera : sequence of two numbers, optional
        ``(elevation, azimuth)`` passed to ``Axes3D.view_init``. When omitted
        the default view is used.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Identity check: comparing with `!=` is elementwise for numpy arrays and
    # would make the `if` raise for an array-valued camera.
    if camera is not None:
        ax.view_init(elev=camera[0], azim=camera[1])
    for index, row in mat.iterrows():
        x, y, z = row[0], row[1], row[2]
        # One scatter call per point so every point gets its own colour.
        ax.scatter(x, y, z, alpha=0.8)
        ax.text(x, y, z, '{0} {1}'.format(data_type, index), size=4)
    plt.show()

In [9]:
#TIERLIST RECOMMENDATION FUNCTION
def recommend(liked_book, VT, output_num=5):
    """Rank the other columns of ``VT`` by dot product with ``liked_book``.

    ``VT`` is indexed by column through the integers ``0 .. len(VT.columns)-1``
    and ``liked_book`` is one of those integers. Every other column is scored
    with ``np.dot`` against the liked column.

    Returns the column numbers of the ``output_num`` highest-scoring columns,
    best first; equal scores keep ascending column order.
    """
    scored = [
        [candidate, np.dot(VT[candidate], VT[liked_book])]
        for candidate in range(len(VT.columns))
        if candidate != liked_book
    ]
    # Stable descending sort on the score.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    ranked_columns = [pair[0] for pair in scored]
    return ranked_columns[:output_num]

#RETURN TIERLIST
def get_tier_list(column_names, row_names, user, Vt, book_dict):
    """Print and return the recommendations for position ``user``.

    ``recommend(user, Vt)`` supplies column positions. Each position is
    translated to an id through ``column_names`` and each id is then looked
    up in ``book_dict``. ``row_names[user]`` is only used for the printed
    header line.

    Returns the list of ``book_dict`` entries in recommendation order.
    """
    picked_positions = recommend(user, Vt)

    # Translate positions into ids before anything is printed.
    picked_ids = [column_names[position] for position in picked_positions]

    print ("User", row_names[user], "recommendations:")

    tier_list = []
    for picked_id in picked_ids:
        entry = book_dict[picked_id]
        tier_list.append(entry)
        print(picked_id, entry)
    return tier_list

#GET INDIVIDUAL USER DATA FOR BOOK:RATING
def get_individual_user_data(column_names, row_names, A, user, book_dict):
    """Collect the non-zero entries of row ``user`` of ``A``.

    Parameters: ``column_names`` holds one id per column of ``A``; ``A`` is
    the ratings DataFrame; ``user`` is the positional row index; ``book_dict``
    maps ids to their stored entry. ``row_names`` is accepted but not used.

    Returns a 4-tuple:
      * the complete row as a numpy array,
      * the ``book_dict`` entries of the columns whose value is not 0.0,
      * the corresponding values,
      * the corresponding ids from ``column_names``.
    """
    user_individual_data = A.iloc[user].values

    # (id, value) for every column holding something other than 0.0.
    rated = [
        (column_names[position], user_individual_data[position])
        for position in range(len(column_names))
        if user_individual_data[position] != 0.0
    ]
    get_user_review_list = [rated_id for rated_id, _ in rated]
    get_user_scores = [value for _, value in rated]

    refined_user_list = [book_dict[rated_id] for rated_id in get_user_review_list]

    return user_individual_data, refined_user_list, get_user_scores, get_user_review_list

#output information for viewing
def print_user_data(user, refined_user_list, get_user_review_list, get_user_scores, row_names):
    """Print one user's entries, one line each.

    A header naming ``row_names[user]`` is printed first, followed by one
    line per element of ``get_user_review_list`` showing that element, the
    element of ``refined_user_list`` at the same position and the element of
    ``get_user_scores`` at the same position. Nothing is returned.
    """
    print("User", row_names[user], "book reviews with score:\n")

    for position, reviewed_id in enumerate(get_user_review_list):
        print(reviewed_id, refined_user_list[position], get_user_scores[position])