In [1]:
# Mount Google Drive (Colab only) so files under /content/drive can be read;
# a later cell loads the review CSV from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import itertools
from math import ceil, sqrt

In [3]:
class ContextAwareMatrixFactorization:
    """
    Matrix factorization recommender with user and context biases.

    A rating is predicted as

        dot(user factors, item factors) + average item rating + user bias
        + one bias per context column

    where each context bias is looked up by the (context value, item id)
    pair. All parameters are updated one sample at a time in ``train``.
    """

    def __init__(self, data, user_column_name, item_column_name,
                 rating_column_name, context_column_names, features,
                 average_item_ratings=None):
        """
        Constructor for the class

        Parameters
        ----------
            data (DataFrame): the data for the algorithm to use. Every
                user, item and context value that is later passed to the
                model must occur in this frame, otherwise the index
                lookups raise KeyError.
            user_column_name (string): Name of the user column
            item_column_name (string): Name of the item column
            rating_column_name (string): Name of the rating column
            context_column_names (list(string)): Names of the context
                columns.
            features (int): The number of features used in the factorization.
            average_item_ratings (dict(itemId,rating)): average rating of
                items. Computed from ``data`` when omitted.
        ---------
        """
        self.data = data
        self.user_column_name = user_column_name
        self.item_column_name = item_column_name
        self.rating_column_name = rating_column_name
        self.context_column_names = context_column_names
        self.features = features
        self.unique_users = self.data[self.user_column_name].unique()
        self.unique_items = self.data[self.item_column_name].unique()

        # One (context value, item) pair per bias parameter.
        # NOTE(review): this materialises
        # len(unique context values) * len(unique items) tuples per context
        # column (plus a dict and a bias array of the same length), which
        # can exhaust memory for high-cardinality columns.
        self.unique_contexts = {
            context: list(itertools.product(self.data[context].unique(),
                                            self.unique_items))
            for context in self.context_column_names}

        # Raw id -> row/column position in the parameter arrays.
        self.users_dict = {k: v for v, k in enumerate(self.unique_users)}
        self.items_dict = {k: v for v, k in enumerate(self.unique_items)}
        self.context_dicts = {
            key: {pair: position for position, pair in enumerate(pairs)}
            for key, pairs in self.unique_contexts.items()}

        self.number_of_users = len(self.unique_users)
        self.number_of_items = len(self.unique_items)
        self.number_of_contexts = {
            key: len(pairs) for key, pairs in self.unique_contexts.items()}

        # Set random seed (global NumPy state) so the initial parameters
        # are the same on every construction. The draw order below is
        # part of that reproducibility.
        np.random.seed(42)
        self.user_feature_matrix = np.random.uniform(
            0, 1, (self.number_of_users, self.features))
        self.feature_item_matrix = np.random.uniform(
            0, 1, (self.features, self.number_of_items))
        self.user_biases = np.random.uniform(size=self.number_of_users)
        self.context_biases = dict()
        for key, value in self.number_of_contexts.items():
            self.context_biases[key] = np.random.uniform(size=value)

        self.average_item_ratings = average_item_ratings
        if self.average_item_ratings is None:
            self.get_average_ratings()

    def get_average_ratings(self):
        """
        Gets the average rating for each item and stores in a dictionary.

        Uses a single grouped mean over ``self.data`` instead of filtering
        the whole frame once per row.

        Returns: dict mapping item id -> mean rating (also stored in
        ``self.average_item_ratings``).
        """
        item_means = self.data.groupby(
            self.item_column_name, sort=False)[self.rating_column_name].mean()
        self.average_item_ratings = item_means.to_dict()
        return self.average_item_ratings

    def _lookup_context_indexes(self, context_values, item_id):
        """
        Map each context column to the position of its bias parameter.

        Params:
            context_values: anything indexable by context column name
                (a DataFrame row or a plain dict).
            item_id: raw item id the context values are paired with.
        Returns a dict of key: context column name and value: position in
        ``self.context_biases[name]``.
        """
        return {name: index_by_pair[(context_values[name], item_id)]
                for name, index_by_pair in self.context_dicts.items()}

    def _predict_for_row(self, row):
        """Predict the rating for one DataFrame row (raw ids and contexts)."""
        item_id = row[self.item_column_name]
        return self.predict_rating(
            self.users_dict[row[self.user_column_name]],
            self.items_dict[item_id],
            item_id,
            self._lookup_context_indexes(row, item_id))

    def predict_ratings_for_entries(self, df):
        """
        Params:
            df (DataFrame): the dataframe containing rows of actual ratings
                for a specific user that we want predict ratings for
        Returns a dict of
            key: movieId and value: (actual_rating, predicted_rating)

        The result is keyed by item id only, so if ``df`` holds several
        rows for the same item the last row wins.
        """
        predictions = {}
        for _, row in df.iterrows():
            predictions[row[self.item_column_name]] = (
                row[self.rating_column_name], self._predict_for_row(row))
        return predictions

    def train(self, training_data, lrate, rterm, epochs=1000,
              batch_size=0.001):
        """
        Function that trains the model by learning the parameters
        through stochastic gradient descent

        Parameters:
            training_data (DataFrame): rows to learn from
            lrate (float): learning rate
            rterm (float): L2 regularisation strength
            epochs (int): number of sampled batches to process
            batch_size (float): fraction of ``training_data`` sampled per
                epoch (rounded up to a whole number of rows)

        Every 100th epoch (starting with epoch 0) the error on the full
        ``training_data`` is printed.
        """
        for epoch in range(epochs):
            batch = training_data.sample(
                ceil(len(training_data.index)*batch_size))

            for _, row in batch.iterrows():
                item_id = row[self.item_column_name]
                user_index = self.users_dict[row[self.user_column_name]]
                item_index = self.items_dict[item_id]
                context_indexes = self._lookup_context_indexes(row, item_id)

                # The error is computed once, before any parameter moves.
                error = row[self.rating_column_name] - self.predict_rating(
                    user_index, item_index, item_id, context_indexes)

                self.user_biases[user_index] = self.user_biases[
                    user_index] - lrate * (2 * error * (-1) +
                                           2 * rterm *
                                           self.user_biases[user_index])
                # Update the context variables
                for key, value in self.context_biases.items():
                    value[context_indexes[key]] = value[context_indexes[key]] \
                        - lrate * (2 * error * (-1) + 2 *
                                   rterm * value[context_indexes[key]])

                # Vectorised over all features. As in the per-feature loop
                # it replaces, the item factors are updated with the
                # already-updated user factors.
                user_factors = self.user_feature_matrix[user_index, :]
                item_factors = self.feature_item_matrix[:, item_index]
                updated_user_factors = user_factors - lrate * (
                    2 * error * (-1) * item_factors
                    + 2 * rterm * user_factors)
                updated_item_factors = item_factors - lrate * (
                    2 * error * (-1) * updated_user_factors
                    + 2 * rterm * item_factors)
                self.user_feature_matrix[user_index, :] = updated_user_factors
                self.feature_item_matrix[:, item_index] = updated_item_factors

            if epoch % 100 == 0:
                mse, mae, rmse = self.calculate_metrics(training_data)
                print('Epoch', epoch, "-- MSE:", mse,
                      "-- RMSE:", rmse, "-- MAE:", mae)

    def calculate_metrics(self, data):
        """
        Calculates the metrics MSE, MAE and RMSE
        Parameter:
            data (DataFrame): the data to calculate the error for
        Returns: mse, mae, rmse
        Raises: ZeroDivisionError when ``data`` has no rows.
        """
        squared_error_sum = 0
        absolute_error_sum = 0
        for _, row in data.iterrows():
            error = row[self.rating_column_name] - self._predict_for_row(row)
            squared_error_sum = squared_error_sum + error**2
            absolute_error_sum = absolute_error_sum + abs(error)

        row_count = len(data.index)
        mse = squared_error_sum / row_count
        mae = absolute_error_sum / row_count
        rmse = sqrt(mse)

        return mse, mae, rmse

    def calculate_precision_recall_mapk(self, test_data, k_val=10,
                                        relevance_threshold=3):
        """
        Parameters:
        ---------------------
        test_data (dataFrame): the test data
        k_val (int): the value of k used for the map at k calculation
        relevance_threshold (number): a test rating greater than or equal
            to this value marks the item as relevant for its user
        ---------------------
        Returns:
        precision, recall, mapk, ndcg

        NOTE(review): relies on ``calculate_map``, ``precision_recall_at_k``
        and ``calculate_ndcg`` being defined at module level; they are not
        part of this class.
        """
        # Relevant items and user_top_k_items are used
        # for mapk calculation
        relevant_items = {}
        users_top_k_items = {}
        # predictions used for precision and recall
        predictions = []

        # Same for every row, so computed once instead of per iteration.
        unique_test_items = test_data[self.item_column_name].unique()

        for _, row in test_data.iterrows():
            userId = row[self.user_column_name]
            itemId = row[self.item_column_name]
            rating = row[self.rating_column_name]

            # Get top k items (once per user)
            if userId not in users_top_k_items:
                users_top_k_items[userId] = self.get_top_k_pred(
                    userId, unique_test_items, k_val)

            prediction = self.predict_rating(
                self.users_dict[userId], self.items_dict[itemId], itemId,
                self._lookup_context_indexes(row, itemId))
            predictions.append((userId, itemId, rating, prediction, None))

            if rating >= relevance_threshold:
                relevant_items.setdefault(userId, []).append(itemId)

        mapk = calculate_map(users_top_k_items, relevant_items, k_val)
        precision, recall = precision_recall_at_k(predictions)
        ndcg = calculate_ndcg(users_top_k_items, relevant_items)

        return precision, recall, mapk, ndcg

    def get_top_k_pred(self, userId, unique_test_items, k_val):
        """
        Rank items for one user by their best predicted rating.

        Every item in ``unique_test_items`` is scored under every
        combination of the context values found in ``self.data``; items
        are then ordered by their highest score.

        Parameters:
            userId: raw user id
            unique_test_items (iterable): raw item ids to rank
            k_val (int): maximum number of items to return
        Returns: list of at most ``k_val`` distinct item ids, best first.
        """
        context_names = list(self.context_column_names)
        context_values_by_name = {
            name: self.data[name].unique() for name in context_names}

        predictions = {}
        for item_id in unique_test_items:
            user_index = self.users_dict[userId]
            item_index = self.items_dict[item_id]
            # Cartesian product of all context values, first column
            # varying slowest.
            for combination in itertools.product(
                    *(context_values_by_name[name]
                      for name in context_names)):
                context_values = dict(zip(context_names, combination))
                predictions[(item_id,) + tuple(context_values.items())] = \
                    self.predict_rating(
                        user_index, item_index, item_id,
                        self._lookup_context_indexes(context_values,
                                                     item_id))

        sorted_predictions = sorted(predictions,
                                    key=predictions.get, reverse=True)
        # Keep each item's first (best scoring) occurrence only.
        top_items = list(dict.fromkeys(key[0] for key in sorted_predictions))
        return top_items[:k_val]

    def predict_rating(self, user_index, item_index, item_id, context_indexes):
        """
        Predict a single rating from already-resolved parameter positions.

        Parameters:
            user_index (int): row of the user in ``user_feature_matrix``
            item_index (int): column of the item in ``feature_item_matrix``
            item_id: raw item id, key into ``average_item_ratings``
            context_indexes (dict): context column name -> position in
                ``context_biases[name]``
        Returns: the predicted rating
        """
        predicted_rating = (np.dot(
            self.user_feature_matrix[user_index, :],
            self.feature_item_matrix[:, item_index])
            + self.average_item_ratings[item_id]
            + self.user_biases[user_index]
            + sum(value[context_indexes[key]]
                  for key, value in self.context_biases.items()))
        return predicted_rating

In [4]:
# Accumulators for the evaluation loop further down: one entry is appended
# to each list per train/test split, and the means are printed at the end.
maes, mses, rmses = [], [], []
precisions, recalls, mapks, ndcgs = [], [], [], []

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import Reader

In [None]:
def built_in_100k():
    """Return Surprise's built-in 'ml-100k' dataset."""
    # Surprise fetches the MovieLens-100k files itself if they are missing.
    return Dataset.load_builtin('ml-100k')

In [None]:
# Built-in MovieLens-100k dataset; a later cell splits it 80/20 with
# Surprise's train_test_split.
data = built_in_100k()

In [5]:
import pandas as pd
import numpy as np
import networkx as nx


def load_data_ml100k(path='./data/ml-100k/u.data'):
    """Load the MovieLens-100k ratings file into a DataFrame.

    Parameters
    ----------
        path (string or path-like): location of the tab-separated,
            header-less ratings file. Defaults to the path relative to the
            working directory that this notebook originally used.

    Returns
    -------
        DataFrame with the columns userID, movieId, rating and timestamp.
    """
    ratings = pd.read_csv(path, sep='\t',
                          header=None, index_col=False,
                          names=['userID', 'movieId', 'rating', 'timestamp'])

    return ratings

In [None]:
def custom_pandas_100k():
    """Build a Surprise dataset from the MovieLens-100k ratings DataFrame."""
    # Keep only the three columns handed to Surprise, in the order
    # user, item, rating.
    user_item_rating = load_data_ml100k()[['userID', 'movieId', 'rating']]

    # For a DataFrame source the Reader is only given the rating scale,
    # which is 1-5 here.
    scale_reader = Reader(rating_scale=(1, 5))

    return Dataset.load_from_df(user_item_rating, scale_reader)

In [None]:
# NOTE(review): `df` is rebound to a pandas DataFrame by the later read_csv
# cell, so this Surprise dataset is not used by the cells that follow it.
df = custom_pandas_100k()

In [None]:
def svd_train_test_split():
    """Fit a default SVD on an 80/20 split of MovieLens-100k.

    The test-set RMSE is computed through ``accuracy.rmse``; the function
    returns the (train_set, test_set) pair it used.
    """
    # 80% of the ratings for training, 20% held out for testing.
    train_set, test_set = train_test_split(custom_pandas_100k(),
                                           test_size=.20)

    model = SVD()
    # Train on the training set, then score the held-out ratings.
    model.fit(train_set)
    accuracy.rmse(model.test(test_set))

    return train_set, test_set

In [None]:
# 80/20 split of the Surprise dataset (Surprise's train_test_split, not
# scikit-learn's, which is imported later under the name `tts`).
# NOTE(review): `trains` and `tests` are not referenced by any later cell.
trains, tests = train_test_split(data, test_size=.20)

In [7]:
import pandas as pd
# Load the beauty review dataset from the mounted Google Drive.
# NOTE(review): this rebinds `df` (previously the Surprise dataset). The
# displayed row contains an "Unnamed: 0" column, presumably an index that was
# saved with the CSV -- confirm and consider index_col=0.
df = pd.read_csv("/content/drive/MyDrive/TA_109/beauty_data_v5.csv")
# Show one random row as a quick look at the columns.
df.sample()

Unnamed: 0,reviewerID,productID,reviewerName,review,rating,reviewTime,description,title,imageUrl,salesRank,price,related,brand,pos_feedback,neg_feedback,revID-baru,productID-baru
1977508,A1WLTEUNHS92VU,B00KBB6DPE,JMR03,"After a weekend spent covered in sunscreen, bu...",5,"07 3, 2014","SANS [CEUTICALS]Pure, chemical-free and highly...",Sans Ceuticals - Volumizing Hair Care (Hair Hy...,http://ecx.images-amazon.com/images/I/31OWuUia...,{'Beauty': 324500},33.0,"{'also_viewed': ['B00KB5SXFI', 'B00KB4V2AC', '...",noBrand,0,0,285970,244038


In [8]:
# Number of missing values per column.
df.isnull().sum()

reviewerID        0
productID         0
reviewerName      0
review            0
rating            0
reviewTime        0
description       0
title             0
imageUrl          0
salesRank         0
price             0
related           0
brand             0
pos_feedback      0
neg_feedback      0
revID-baru        0
productID-baru    0
dtype: int64

In [None]:
# Copy of the data without rows that contain a missing value.
# NOTE(review): `df1` is not referenced by any later cell; the train/test
# split below is taken from `df`.
df1 = df.dropna()

In [None]:
# Number of missing values per column (same check as the earlier cell; `df`
# itself was not modified by dropna()).
df.isnull().sum()

Unnamed: 0        0
reviewerID        0
productID         0
reviewerName      0
review            0
rating            0
reviewTime        0
description       0
title             0
imageUrl          0
salesRank         0
price             0
related           0
brand             0
pos_feedback      0
neg_feedback      0
revID-baru        0
productID-baru    0
dtype: int64

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split as tts

In [10]:
# Hold out 33% of the reviews for testing. A fixed random_state makes the
# split, and every metric computed from it, reproducible across kernel
# restarts.
train, test = tts(df, test_size=0.33, random_state=42)

In [None]:
# Five different 67/33 splits, one seed each, so that the metrics averaged by
# the evaluation loop come from distinct runs. Appending the same
# (train, test) pair five times would only repeat one evaluation.
# NOTE(review): each split holds its own copy of the rows; reduce the number
# of splits or the columns of `df` if memory is tight.
train_test_sets = [
    tuple(tts(df, test_size=0.33, random_state=seed))
    for seed in range(5)
]

In [None]:
# Print the training frame of the first split only. This also rebinds the
# notebook-level names `train` and `test` to that split.
for train, test in train_test_sets[:1]:
    print(train)

         Unnamed: 0      reviewerID   productID               reviewerName  \
341082      1895244  A1E8FHDNY2OGZ8  B00ESYFFDQ                   JASMINKA   
116120       573011   AX8IU5A5G5I2H  B00198OG7U           V. Farias "WeR1"   
587327      1204217  A1GNYLUH49W9ZG  B004HGO540  L. Stockert "working mom"   
536364        62102  A184EFZ26HGG91  B00021AK4I                 D. Cormack   
492591      1953209  A23KU7BDXSLTI3  B00HPH8E8G             Inked Up Mommy   
...             ...             ...         ...                        ...   
116785       206587  A26DVPUEP7LWQG  B000EBFLG8     Carol Jean Jacobi "CJ"   
168859       282603  A1V4VVBQBFXRHC  B000JL7Z5S                      Big-D   
1363633     1428166  A25X8PTZ4BEB84  B005Z6K2T8                   reviewer   
972408      1921838  A21T5C3KL41REL  B00GBQSS56                  galbarran   
307126      1118811  A1DXJ1LR7NLLBS  B0042QTSK6                         JP   

                                                    review  rat

In [None]:
# Display the fourth (train, test) pair.
train_test_sets[3]

In [None]:
# Train and evaluate one model per (train, test) pair and collect the metrics.
for train, test in train_test_sets:
    # The model must know every user, item and context value that occurs in
    # either part, so it is constructed from their union. pd.concat replaces
    # DataFrame.append, which was removed in pandas 2.0.
    full_data = pd.concat([train, test])
    # NOTE(review): the constructor builds one bias per
    # (context value, item) pair for each context column; with columns as
    # varied as 'reviewTime' and 'price' this is the likely source of the
    # MemoryError recorded below this cell.
    algo = ContextAwareMatrixFactorization(full_data, 'revID-baru', 'productID-baru',
                                           'rating',
                                           ['reviewTime', 'price'], 10)
    algo.train(training_data=train, lrate=0.01, rterm=0.2,
               batch_size=0.01)
    mse, mae, rmse = algo.calculate_metrics(test)
    precision, recall, mapk, ndcg = algo.calculate_precision_recall_mapk(
        test)
    print("Errors for the test data", "MSE:", mse,
          "-- RMSE:", rmse, "-- MAE:", mae)
    print('Precision:', precision, 'Recall:', recall, "Mapk:", mapk,
          'NDCG', ndcg)
    maes.append(mae)
    mses.append(mse)
    rmses.append(rmse)
    precisions.append(precision)
    recalls.append(recall)
    mapks.append(mapk)
    ndcgs.append(ndcg)
# Averages over all evaluated splits.
print('MEANS --', 'MAE:', np.mean(maes), 'MSE:', np.mean(mses),
      'RMSE:', np.mean(rmses))
print('MEANS --', 'Precision:', np.mean(precisions),
      'Recall:', np.mean(recalls),
      'MapK:', np.mean(mapks), 'NDCG', np.mean(ndcgs))

MemoryError: 

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of (train, test) pairs available for evaluation.
len(train_test_sets)

In [None]:
# Recombine the first split into one frame (`coba` is Indonesian for "try").
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
for train, test in train_test_sets:
    coba = pd.concat([train, test])
    break

In [None]:
# Row count of the recombined frame.
len(coba)

Unnamed: 0.1,Unnamed: 0,reviewerID,productID,reviewerName,review,rating,reviewTime,description,title,imageUrl,salesRank,price,related,brand,pos_feedback,neg_feedback,revID-baru,productID-baru
341082,1895244,A1E8FHDNY2OGZ8,B00ESYFFDQ,JASMINKA,"They come pack of four,two nice colors.So easy...",5,"02 2, 2014",DescriptionFeatures:Brand New and High Quality...,4 PCS Ornament Magic Tape Handy And Practical ...,http://ecx.images-amazon.com/images/I/51FkR1Y4...,{'Beauty': 632840},0.00,nothing,noBrand,0,0,125253,225501
116120,573011,AX8IU5A5G5I2H,B00198OG7U,"V. Farias ""WeR1""","I have tried just about everything and I love,...",5,"10 21, 2012",Repair for distressed hair. For hair that is u...,Redken Extreme shampoo 33.8 oz &amp; Condition...,http://ecx.images-amazon.com/images/I/41wobVpW...,{'Beauty': 1002},34.99,"{'also_bought': ['B004Z1L3JC', 'B000141O6E', '...",Redken,4,1,1165888,49943
587327,1204217,A1GNYLUH49W9ZG,B004HGO540,"L. Stockert ""working mom""","When we got the dreaded ""lice in school"" notic...",5,"03 6, 2013","For eliminating and fending off lice, use clea...",LiceLogic Repel Conditioning Hair Spray - Rose...,http://ecx.images-amazon.com/images/I/51FpL%2B...,{'Beauty': 49274},15.00,"{'also_bought': ['B004HGTTEQ', 'B004HLY464', '...",noBrand,2,2,146421,116909
536364,62102,A184EFZ26HGG91,B00021AK4I,D. Cormack,There's always been something about CK One whi...,5,"08 5, 2011",EDT SPRAY 1.7 OZ Design House: Calvin Klein Ye...,Calvin Klein Ck One EDT Spray 1.7 oz for Unisex,http://ecx.images-amazon.com/images/I/317B8-ub...,{'Beauty': 82091},19.99,"{'also_bought': ['B0022V2PKI', 'B0049CZBP4', '...",noBrand,0,0,72086,3796
492591,1953209,A23KU7BDXSLTI3,B00HPH8E8G,Inked Up Mommy,So excited that Amazon is finally carrying my ...,5,"01 21, 2014",The Pretty Clean Soap (PCS) Poppyseed and Maca...,Poppy Seed and Macadamia Nut Oil Exfoliating S...,http://ecx.images-amazon.com/images/I/41NQ54dc...,{'Beauty': 51741},12.99,"{'also_viewed': ['B00IHEMETQ', 'B00IHEIA6C', '...",noBrand,1,0,346396,237814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637676,150168,A2C1SQ1YOL6LXU,B0009YDO32,"M. Peebles ""reader77""",I love this scent! It really lasts and it is b...,5,"01 28, 2013","Utilize, Aromatize, Harmonize...Spray the bad ...",Zum Mist Aromatherapy Room and Body Spray Fran...,http://ecx.images-amazon.com/images/I/41-JON39...,{'Beauty': 4862},9.99,"{'also_bought': ['B0009YDO4Q', 'B000BOZ8UM', '...",Zum,0,0,420266,9698
640649,1690199,A18HOYW9YY8O8K,B009DDGHFC,Emery Moore,I am not very happy with this product. The coa...,1,"02 3, 2014",Eyeliner color: BlackDimension:13 x 2cm/5.12&#...,New Leopard Shell Waterproof Liquid Eye Liner ...,http://ecx.images-amazon.com/images/I/41Xw82MT...,{'Beauty': 90},2.02,"{'also_bought': ['B00ESYFTVO', 'B00GQU5YY0', '...",Evermarket,0,0,75360,184852
763488,713193,A16MG8KOEPABW,B001MA0QY2,C.C,Bought in February and it doesn't work all the...,2,"12 9, 2013",The Proffesional HSI Flat Iron is great for tr...,HSI PROFESSIONAL 1 CERAMIC TOURMALINE IONIC FL...,http://ecx.images-amazon.com/images/I/41aQZ%2B...,{'Beauty': 1},53.59,"{'also_bought': ['B00LS5E1OC', 'B007QCI1II', '...",HSI PROFESSIONAL,0,1,59055,63779
702083,290609,A2NQBZST8V6CMM,B000KGXSWG,janeo,I concur with the comments of some of the othe...,2,"06 12, 2012","Launched in the year 1935, by the design house...","Revlon Jean Nate, 30-Ounce",http://ecx.images-amazon.com/images/I/41DaNl9r...,{'Beauty': 5910},8.56,"{'also_bought': ['B00152RYTM', 'B0088W7OT2', '...",Revlon,4,0,522457,21841


In [None]:
# Stand-alone model built from the training part only, with 10 latent
# features and the review date and price as context columns.
algo = ContextAwareMatrixFactorization(
    data=train,
    user_column_name='revID-baru',
    item_column_name='productID-baru',
    rating_column_name='rating',
    context_column_names=['reviewTime', 'price'],
    features=10,
)

In [None]:
# Default 1000 epochs; each epoch samples 1% of `train` (batch_size is a
# fraction of the rows, not a row count).
algo.train(training_data=train, lrate=0.01, rterm=0.2,batch_size=0.01)

In [None]:
# Recompute and display the item -> mean rating dictionary. The constructor
# already computed it, because no averages were passed in.
algo.get_average_ratings()

In [None]:
# Display the DataFrame the model was constructed with.
algo.data

In [None]:
# Error of the stand-alone model over the complete dataset.
# NOTE(review): `algo` was constructed from `train` only, so rows of `df`
# with a user, item or context value that is absent from `train` will raise
# KeyError in the index lookups -- confirm which frame is intended here.
mse, mae, rmse = algo.calculate_metrics(df)
print("mse :", mse, sep="")
print("mae :", mae, sep="")
print("rmse :", rmse, sep="")

In [None]:
# Returns (precision, recall, mapk, ndcg) for the held-out rows.
# NOTE(review): `algo` was constructed from `train` only; test rows with a
# user, item or context value that `train` lacks will raise KeyError.
algo.calculate_precision_recall_mapk(test)

In [None]:
# NOTE(review): the result is keyed by item id only, so for a frame with many
# users only the last row seen for each item is kept.
algo.predict_ratings_for_entries(df)

In [None]:
# NOTE(review): the second argument is iterated as a collection of item ids,
# so the string "676" is processed as the items '6', '7', '6'. Both ids are
# also looked up verbatim in users_dict / items_dict -- confirm they should be
# strings, and pass the item ids as a list.
algo.get_top_k_pred("100","676",10)

In [None]:
# NOTE(review): leftover debugging cell (a pdb breakpoint was commented out
# here). predict_rating indexes `context_indexes` by context column name, so
# passing the integer 3 raises TypeError; it expects a dict such as
# {'reviewTime': <position>, 'price': <position>}.
algo.predict_rating(1,1,"676",3)