In [21]:
#import libraries
import numpy as np
import pprint
import scipy
import scipy.linalg
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
import matplotlib.pyplot as pl
import json
from tqdm import tqdm
from collections import defaultdict
from surprise import Reader, Dataset
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.prediction_algorithms import SVD, KNNBasic, KNNBaseline
np.set_printoptions(threshold=20)
pd.options.mode.copy_on_write = True

In [6]:
#FUNCTION TO CONVERT CSV TO DATAFRAME
#converts and reduces dataframe to usable data
def csv_to_df(string1, string2):
    """Load the ratings CSV and reduce it to the columns used for modelling.

    Parameters
    ----------
    string1 : str or path-like
        Path to the ratings CSV. It must contain ``Id``, ``Title``,
        ``User_id`` and every column named in ``unused_columns`` below.
    string2 : str or path-like
        Path to the book-metadata CSV. Kept in the signature for backward
        compatibility only: nothing derived from that file ever reached the
        return values, so it is not opened here.

    Returns
    -------
    books_ratings_cleaned : pandas.DataFrame
        The ratings without the columns in ``unused_columns`` and without the
        rows whose ``User_id`` is missing. The original row index is kept.
    book_dict : dict
        Maps each book ``Id`` to ``[title, n_rows]``, where ``title`` is the
        title on the first row seen for that id and ``n_rows`` counts every
        row for that id (including rows whose ``User_id`` is missing).

    Raises
    ------
    KeyError
        If one of the expected columns is absent from the ratings CSV.
    """
    ratings_raw = pd.read_csv(string1)

    # Book lookup: Id -> [title of first occurrence, number of rating rows].
    # Columns are addressed by name so the result does not depend on the
    # column order of the CSV.
    book_dict = {}
    for book_id, title in zip(ratings_raw["Id"], ratings_raw["Title"]):
        entry = book_dict.get(book_id)
        if entry is None:
            book_dict[book_id] = [title, 1]
        else:
            entry[1] += 1

    # Everything the recommender does not use is dropped in a single pass.
    unused_columns = ["Title", "Price", "profileName", "review/time",
                      "review/summary", "review/helpfulness", "review/text"]
    ratings = ratings_raw.drop(columns=unused_columns)

    # Ratings without a user id cannot be attributed to anyone.
    books_ratings_cleaned = ratings[ratings["User_id"].notna()]

    return books_ratings_cleaned, book_dict

In [7]:
#function to convert a dict of per-key rating dicts into a zero-filled DataFrame (one row per outer key)
def Convert_to_Matrix(matrix):
    """Turn a dict-like rating mapping into a dense, zero-filled DataFrame.

    ``matrix`` is handed to ``pd.DataFrame.from_dict``; the resulting frame is
    transposed and every missing entry is replaced by 0.

    Returns
    -------
    pandas.DataFrame
        The transposed frame with NaN replaced by 0.
    """
    # from_dict puts the outer keys on the columns; .T moves them to the rows.
    rows_by_outer_key = pd.DataFrame.from_dict(matrix).T

    # Missing combinations become 0.
    return rows_by_outer_key.fillna(0)

#function to perform LU factorization on numpy matrix
def LU_Factorization(matrix):
    """Compute the pivoted LU decomposition ``matrix = P @ L @ U``.

    The factors come from ``scipy.linalg.lu``. As a sanity check the product
    of the factors is compared with the input via ``np.allclose`` and the
    boolean result is printed.

    Returns
    -------
    tuple
        ``(P, L, U)`` exactly as returned by ``scipy.linalg.lu``.
    """
    perm, lower, upper = scipy.linalg.lu(matrix)

    # Rebuild the input from its factors and report whether it matches.
    rebuilt = perm @ (lower @ upper)
    print(np.allclose(matrix, rebuilt))

    return perm, lower, upper
    

In [8]:
#COLLABORATIVE FILTERING COSINE SIMILARITY FUNCTION
#finding similar users by using cosine similarity algorithm
def find_similar(user_1, k, ratings=None):
    """Return the ``k`` rows of the rating matrix most cosine-similar to ``user_1``.

    Parameters
    ----------
    user_1 : pandas.Series or array-like
        Query rating vector with one entry per column of the rating matrix.
        When it is a Series whose ``name`` is a row label of the matrix, that
        row is treated as the query itself: it gets similarity 1 and is
        listed first.
    k : int
        Maximum number of rows to return. Fewer are returned when the matrix
        has fewer than ``k`` rows; a value <= 0 gives an empty result.
    ratings : pandas.DataFrame, optional
        Matrix to search, one row per candidate. Defaults to the
        notebook-global ``A``.

    Returns
    -------
    pandas.DataFrame
        The selected rows of the matrix, most similar first, with an extra
        ``costheta`` column holding the cosine similarity to ``user_1``.

    Raises
    ------
    ValueError
        If the length of ``user_1`` differs from the number of columns.

    Notes
    -----
    Each selected ``(row label, similarity)`` pair is printed. A row (or
    query) whose norm is zero has similarity 0 rather than NaN.
    """
    if ratings is None:
        ratings = A  # notebook-global rating matrix

    matrix = ratings.to_numpy(dtype=float)
    query = np.asarray(user_1, dtype=float)

    # cos(theta) = (row . query) / (|row| * |query|), for all rows at once.
    dots = matrix @ query
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
    # Leave similarity at 0 wherever a norm is zero to avoid dividing by zero.
    costheta = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # The query's own row (if it is part of the matrix) is pinned to exactly 1.
    query_label = getattr(user_1, "name", None)
    if query_label is None:
        self_positions = np.array([], dtype=int)
    else:
        self_positions = np.flatnonzero(ratings.index == query_label)
    costheta[self_positions] = 1.0

    # Descending similarity; the stable sort keeps ties in matrix order, and
    # the query row is moved to the front so it always leads the result.
    order = np.argsort(-costheta, kind="stable")
    if self_positions.size:
        others = order[~np.isin(order, self_positions)]
        order = np.concatenate([self_positions, others])
    top = order[:max(k, 0)]

    # output results of sorted similar users
    for pos in top:
        print((ratings.index[pos], float(costheta[pos])))

    # Selected rows plus their similarity, built in one step.
    top_users = ratings.iloc[top].copy()
    top_users['costheta'] = costheta[top]

    return top_users

In [9]:
# Helper functions for selecting users who have more ratings, which reduces the
# sparsity of the rating matrix when experimenting with the dataset
def get_most_frequent_users(count, source_frame):
    """Return the ids of the ``count`` users with the most rows.

    Parameters
    ----------
    count : int
        Number of user ids wanted. Must be positive and strictly smaller than
        the number of distinct users in ``source_frame``.
    source_frame : pandas.DataFrame
        Frame with a ``User_id`` column, one row per rating.

    Returns
    -------
    list
        User ids ordered from most to fewest rows, as produced by
        ``Series.value_counts``.

    Raises
    ------
    AssertionError
        If ``count`` is not a positive int or is not smaller than the number
        of distinct users.
    """
    assert isinstance(count, int), "Argument must be an integer"
    assert count > 0, "Argument must be a positive integer"

    # value_counts() is already sorted by descending frequency.
    rows_per_user = source_frame['User_id'].value_counts()
    distinct_users = len(rows_per_user)

    assert count < distinct_users, "Requested more data then we have...not tiny"

    busiest = rows_per_user.iloc[:count]
    return busiest.index.to_list()

def get_users_with_minimal_ratings(rated, source_frame):
    """Return the ids of users that have strictly more than ``rated`` rows.

    Parameters
    ----------
    rated : int
        Positive threshold. A user with exactly ``rated`` rows is NOT
        included; the comparison is ``>``.
    source_frame : pandas.DataFrame
        Frame with a ``User_id`` column, one row per rating.

    Returns
    -------
    list
        Matching user ids in ``Series.value_counts`` order (most rows first).

    Raises
    ------
    AssertionError
        If ``rated`` is not a positive int.
    """
    assert isinstance(rated, int), "Argument must be an integer"
    assert rated > 0, "Argument must be a positive integer"

    # Number of rows per user id.
    rows_per_user = source_frame['User_id'].value_counts()

    above_threshold = rows_per_user.gt(rated).to_numpy()
    return rows_per_user.index[above_threshold].to_list()

def get_training_data(ratio: float, source_frame, min_ratings: int = 10, random_state=None):
    """Split the ratings of sufficiently active users into training and test frames.

    Parameters
    ----------
    ratio : float
        Fraction of the retained rating rows that goes to the training
        frame; must lie strictly between 0 and 1.
    source_frame : pandas.DataFrame
        Ratings with a ``User_id`` column; passed on to
        ``get_users_with_minimal_ratings`` and, after filtering, to
        ``pivot_rating_to_user_frame``.
    min_ratings : int, optional
        Threshold handed to ``get_users_with_minimal_ratings``; users at or
        below it are left out of both frames. Defaults to 10.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for the shuffle. Leave as ``None``
        for a different split on every call, or pass a seed to make the
        split repeatable.

    Returns
    -------
    tuple
        ``(training_data, test_data)``, each the result of
        ``pivot_rating_to_user_frame`` on its share of the shuffled rows.
        The split is made over rating rows, not over users, and the two
        frames are pivoted independently.

    Raises
    ------
    AssertionError
        If ``ratio`` is not a float strictly between 0 and 1.
    """
    assert isinstance(ratio, float), "Ratio must be a float"
    assert 0.0 < ratio < 1.0, "Ratio must be in (0, 1)"

    # Users with too few ratings are considered un-testable and are removed
    # before the split.
    good_users = get_users_with_minimal_ratings(min_ratings, source_frame)
    is_good_user = source_frame['User_id'].isin(good_users)

    # Randomize the order of the remaining ratings.
    good_users_ratings = source_frame[is_good_user].sample(frac=1, random_state=random_state)

    training_data_count = int(len(good_users_ratings) * ratio)

    # Positional slices: the first part trains, the remainder tests.
    training_data = pivot_rating_to_user_frame(good_users_ratings.iloc[:training_data_count])
    test_data = pivot_rating_to_user_frame(good_users_ratings.iloc[training_data_count:])

    return training_data, test_data

def pivot_rating_to_user_frame(source_frame, show_progress=True):
    """Pivot a three-column ratings frame into a wide frame.

    Parameters
    ----------
    source_frame : pandas.DataFrame
        Frame with exactly three columns, read BY POSITION: the values of the
        first column become the columns of the result, the values of the
        second column become its index, and the third column supplies the
        cell values.
        NOTE(review): column names are ignored, so check the column order of
        the frame passed in.
    show_progress : bool, optional
        Wrap the row iteration in the notebook's ``tqdm`` progress bar
        (default ``True``).

    Returns
    -------
    pandas.DataFrame
        Wide frame with NaN where no value exists for a combination. If a
        (first, second) pair occurs more than once, the last row wins.

    Raises
    ------
    ValueError
        If ``source_frame`` does not have exactly three columns (and has at
        least one row).
    """
    # itertuples keeps every column's own dtype (integer ids stay integers),
    # whereas iterrows would coerce each row to one common dtype.
    rows = source_frame.itertuples(index=False, name=None)
    if show_progress:
        rows = tqdm(rows, total=source_frame.shape[0])

    new_data = dict()
    for outer_key, inner_key, score in rows:
        new_data.setdefault(outer_key, {})[inner_key] = score

    return pd.DataFrame.from_dict(new_data)

In [10]:
# Load the ratings CSV (paths are relative to the working directory).
# Returns the trimmed ratings frame and the book lookup built by csv_to_df
# (book Id -> [title, number of rating rows]).
books_ratings_cleaned, book_dict = csv_to_df("Books_rating.csv", "books_data.csv")

In [50]:
# Sanity check: number of non-null values in each remaining column.
books_ratings_cleaned.count()

Id              2438213
User_id         2438213
review/score    2438213
dtype: int64

In [18]:
# Reorder the columns to (user, item, rating); this reordered frame is what
# is later passed to Dataset.load_from_df.
# NOTE(review): `Books_ratings_cleaned` differs from `books_ratings_cleaned`
# only by the capital B and both are used further down; easy to mix up.
columns_titles = ["User_id","Id", "review/score"]
Books_ratings_cleaned = books_ratings_cleaned.reindex(columns=columns_titles)

#Books_ratings_cleaned.head()

Unnamed: 0,User_id,Id,review/score
0,AVCGYZL8FQQTD,1882931173,4.0
1,A30TK6U7DNS82R,826414346,5.0
2,A3UH4UZ4RSVO82,826414346,5.0
3,A2MVUWT453QH61,826414346,4.0
4,A22X4XUPKF66MR,826414346,4.0


In [19]:
# REQUIRED CELL: run this before the cell that builds `A` from training_data.
# Draws a random 100,000-row sample of the cleaned ratings and asks
# get_training_data for a 75% training share. No seed is set, so every run
# gives a different sample and a different split.
training_data, test_data = get_training_data(0.75, books_ratings_cleaned.sample(100000))

100%|███████████████████████████████████████████████████████████████████████████| 3760/3760 [00:00<00:00, 11720.00it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1254/1254 [00:00<00:00, 13137.09it/s]


In [14]:
# Rating matrix with missing entries replaced by 0. find_similar reads this
# global `A` directly.
A = training_data.fillna(0)

#A.head()

#A = A.to_numpy()

#np.linalg.eig(A)

In [15]:
# read in values as Surprise dataset
#reader = Reader(rating_scale=(1, 5))
#data = Dataset.load_from_df(training_data, reader)

In [None]:
#A_similar = find_similar(A.iloc[0],10)

#A_similar

In [None]:
#USE THIS TO CHECK BOOK ID
#print(book_dict.get('1557424470'))
#A_similar.head()

In [51]:
# Subset for the Surprise experiments: the first 50,000 rows of the reordered
# ratings frame (taken in frame order, not randomly sampled).
USERBOOK = Books_ratings_cleaned.head(50000)

#USERBOOK.head()

In [52]:
# Wrap the (user, item, rating) frame as a Surprise dataset.
# NOTE(review): Reader() is created with its defaults; presumably the default
# rating scale is 1-5, matching the commented-out Reader(rating_scale=(1, 5))
# earlier in the notebook. Confirm against the Surprise docs.
reader = Reader()
data = Dataset.load_from_df(USERBOOK, reader)

In [53]:
# Baseline model: Surprise's SVD with its default hyper-parameters.
algo = SVD()

# Run 5-fold cross-validation, reporting RMSE and MAE for each fold
# (verbose=True prints the summary table).
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0855  1.0751  1.0851  1.0836  1.0891  1.0837  0.0047  
MAE (testset)     0.8225  0.8153  0.8203  0.8243  0.8268  0.8218  0.0039  
Fit time          0.75    0.63    0.73    0.74    0.79    0.73    0.05    
Test time         0.08    0.08    0.07    0.07    0.08    0.08    0.00    


{'test_rmse': array([1.08553103, 1.07509269, 1.08513763, 1.08358879, 1.08909244]),
 'test_mae': array([0.82247379, 0.81529567, 0.82034333, 0.82426678, 0.82681063]),
 'fit_time': (0.7490639686584473,
  0.6280405521392822,
  0.7321815490722656,
  0.7377645969390869,
  0.7922866344451904),
 'test_time': (0.07548046112060547,
  0.08153748512268066,
  0.07371354103088379,
  0.0736551284790039,
  0.07596135139465332)}

In [54]:
# Refit SVD on the full trainset (no hold-out) and predict one (user, book) pair.
# NOTE(review): n_factors=5000 is very large for a 50,000-rating subset;
# confirm the value is intended.
# NOTE(review): this user/book pair looks like one of the rows shown in the
# head() output earlier, so the estimate is probably for an already-seen
# rating rather than an unseen one. Verify.
dataset = data.build_full_trainset()
svd = SVD(n_factors=5000, reg_all=0.05)
svd.fit(dataset)
svd.predict('A30TK6U7DNS82R', '0826414346')

Prediction(uid='A30TK6U7DNS82R', iid='0826414346', r_ui=None, est=4.99653471821967, details={'was_impossible': False})