In [24]:
from pyfm import pylibfm
from sklearn.feature_extraction import DictVectorizer
import pickle
from lenskit.algorithms.als import BiasedMF
from lenskit.crossfold import sample_rows
import pandas as pd

import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
from tqdm import tqdm

In [20]:
with open("item_embeddings.pkl", "rb") as f:
    trainset_embeddings = pickle.load(f)
with open('entropy_table.pkl', 'rb') as f:
    entropy_table = pickle.load(f)
w2v = Word2Vec.load("w2v_1")

In [6]:
def handle_duplicates(reviews):
    duplicated_pairs = reviews.duplicated(subset=['user', 'item'], keep=False)

    duplicates_df = reviews[duplicated_pairs]

    print(len(duplicates_df))

    combined_reviews = duplicates_df.groupby(['user', 'item']).agg({'rating': 'mean', 'text': ' '.join}).reset_index()

    merged_reviews = pd.merge(reviews, combined_reviews, on=['user', 'item'], how='left', suffixes=('', '_combined'))

    merged_reviews['rating'] = merged_reviews['rating_combined'].fillna(merged_reviews['rating'])
    merged_reviews['text'] = merged_reviews['text_combined'].fillna(merged_reviews['text'])
    merged_reviews = merged_reviews.drop(columns=['rating_combined', 'text_combined'])
    merged_reviews = merged_reviews.drop_duplicates(subset=['user', 'item'])
    
    return merged_reviews



In [15]:
reviews.columns

Index(['reviewerID', 'amazon-id', 'reviewText', 'overall'], dtype='object')

In [16]:
reviews = pd.read_json("mard/mard_reviews.json" , lines=True)
reviews=reviews.drop(['helpful', 'summary','reviewTime', 'reviewerName'],axis=1)
reviews=reviews[["reviewerID", 'amazon-id', "overall" , 'reviewText']]
new_column_names = {'reviewerID': 'user', 'amazon-id': 'item', 'overall': 'rating', "reviewText": "text"}
reviews.rename(columns=new_column_names, inplace=True)
reviews = handle_duplicates(reviews)

3412


In [35]:
df_split = reviews['embedding'].apply(pd.Series)
reviews = pd.concat([reviews, df_split], axis=1)
reviews = reviews.drop('embedding', axis=1)
reviews

Unnamed: 0,user,item,rating,text,0,1,2,3,4,5,...,290,291,292,293,294,295,296,297,298,299
0,A1OFY4ATO7D13W,0026197898,5.0,Buy this album. Now. Don't worry about the re...,0.182620,-0.073689,-0.057639,-0.704688,-0.407296,0.296694,...,0.505639,-0.159476,-0.718974,0.584941,0.066539,-0.033142,-0.743983,0.323740,-0.083542,0.074567
1,A2KH83L1F70QR8,0026197898,5.0,The Sudden Passion did a great job with this o...,0.328262,-0.124202,-0.222787,0.448518,-0.487578,0.085612,...,0.120392,-0.461723,-0.229318,0.411556,0.024881,0.190744,0.185607,-0.447765,-0.091268,0.139924
2,A1KGXC7IRLVJR3,0615205399,5.0,I received this CD as a gift a few weeks ago f...,0.179609,0.003010,0.077909,-0.586981,-0.140251,0.157447,...,-0.192433,-0.187081,-0.450810,0.356835,-0.104535,-0.523663,0.379791,0.184014,-0.013592,-0.289380
3,A1BT6LQ9NY6RO3,0615205399,5.0,I am a beginner and have tried a couple of med...,0.154137,0.266216,0.049565,-0.584482,-0.275513,-0.026594,...,-0.192041,-0.173476,-0.301226,0.481804,-0.093252,-0.067886,0.174888,0.081487,0.112477,-0.131485
4,A206OKO2FE2IPL,0615205399,5.0,This is coming from a person that didn't belie...,0.051958,0.287010,0.160240,-0.599972,0.038995,-0.058497,...,-0.266477,-0.034365,-0.126335,0.137666,-0.258989,-0.677647,0.493429,0.281236,0.290509,-0.140560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263520,A3RKUPYX1RC9WO,B00LG9GR3S,5.0,What a wonderful CD capturing the unique sound...,-0.178527,0.354311,-0.022475,0.830391,-0.133064,-0.027217,...,0.184552,-0.103708,-0.239194,0.696728,0.400533,0.194764,0.460398,0.029412,0.179413,-0.198964
263521,A122G17YDFX176,B00LG9GR3S,5.0,I love the Smokies! This cd is awesome to lis...,0.571472,0.309562,-0.308959,-0.585049,-0.346509,0.253695,...,0.069977,-0.170078,-0.608539,0.385450,0.114087,0.233377,0.107992,0.056025,0.323060,-0.130090
263522,A26QVK35BBBKU8,B00LG9GR3S,5.0,Beautiful sounding CD. I felt like I was there...,-0.053734,0.295176,0.079057,0.574030,-0.216796,0.247487,...,0.146535,-0.045120,-0.291241,0.307123,0.252571,0.211896,0.395926,-0.209798,0.247477,0.032251
263523,A1V76VMZ0N3H5W,B00LG9GR3S,5.0,This is a really neat collection of sounds fro...,-0.368464,0.083159,-0.244005,0.032956,-0.379628,0.657752,...,0.434356,-0.069080,-0.364371,-0.078799,0.204484,-0.165873,0.240977,0.029032,0.147648,0.226174


In [29]:
def clean_text(text):
    '''
    Cleaning pipeline:
    - lowercase
    - remove punctuation
    - remove unnecessary whitespace
    - urls
    - remove non characters
    - remove stop words
    - lemmatization
    '''
    text = text.lower()
    
    # Removing punctuation
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    
    # remove unnecessary whitespace
    text = re.sub("\s+"," ",text)
    
    # remove urls
    pattern = re.compile(r'https?://\S+|www\.\S+')
    text = pattern.sub('', text)
    
    # remove non-character
    text = re.sub("[^a-zA-Z]", " ", text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Removing stop words
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Joining tokens back into a sentence
    cleaned_text = " ".join(tokens)
    
    return cleaned_text

def get_review_embedding(review_text, model, embed_len=300):
    '''
    get the embedding for an individual review
    returns the mean word 2 vec score applied o each word
    '''
    
    text = clean_text(review_text)
    words = [word for word in word_tokenize(text) if word in model.wv]
    
    if len(words) == 0: 
        return np.zeros(embed_len) # array of size 0
    
    word_embeddings = [model.wv[word] for word in words]
    mean_embedding = np.mean(word_embeddings, axis=0)
   
    if len(mean_embedding) != embed_len:
        return np.zeros(embed_len)
    
    return mean_embedding

def generate_embedding(text):
    
    return get_review_embedding(text, w2v)
    
tqdm.pandas()
reviews['embedding'] = reviews['text'].progress_apply(generate_embedding)

100%|██████████████████████████████████| 261740/261740 [06:20<00:00, 687.81it/s]


In [48]:
trainset, testset = sample_rows(reviews, None, 1500)

In [44]:
old_column_names = [i for i in range(300)]
new_column_names = ['embedding_' + str(i) for i in range(300)]

column_mapping = dict(zip(old_column_names, new_column_names))

reviews.rename(columns=column_mapping, inplace=True)


Unnamed: 0,user,item,rating,text,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,...,embedding_290,embedding_291,embedding_292,embedding_293,embedding_294,embedding_295,embedding_296,embedding_297,embedding_298,embedding_299
0,A1OFY4ATO7D13W,0026197898,5.0,Buy this album. Now. Don't worry about the re...,0.182620,-0.073689,-0.057639,-0.704688,-0.407296,0.296694,...,0.505639,-0.159476,-0.718974,0.584941,0.066539,-0.033142,-0.743983,0.323740,-0.083542,0.074567
1,A2KH83L1F70QR8,0026197898,5.0,The Sudden Passion did a great job with this o...,0.328262,-0.124202,-0.222787,0.448518,-0.487578,0.085612,...,0.120392,-0.461723,-0.229318,0.411556,0.024881,0.190744,0.185607,-0.447765,-0.091268,0.139924
2,A1KGXC7IRLVJR3,0615205399,5.0,I received this CD as a gift a few weeks ago f...,0.179609,0.003010,0.077909,-0.586981,-0.140251,0.157447,...,-0.192433,-0.187081,-0.450810,0.356835,-0.104535,-0.523663,0.379791,0.184014,-0.013592,-0.289380
3,A1BT6LQ9NY6RO3,0615205399,5.0,I am a beginner and have tried a couple of med...,0.154137,0.266216,0.049565,-0.584482,-0.275513,-0.026594,...,-0.192041,-0.173476,-0.301226,0.481804,-0.093252,-0.067886,0.174888,0.081487,0.112477,-0.131485
4,A206OKO2FE2IPL,0615205399,5.0,This is coming from a person that didn't belie...,0.051958,0.287010,0.160240,-0.599972,0.038995,-0.058497,...,-0.266477,-0.034365,-0.126335,0.137666,-0.258989,-0.677647,0.493429,0.281236,0.290509,-0.140560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263520,A3RKUPYX1RC9WO,B00LG9GR3S,5.0,What a wonderful CD capturing the unique sound...,-0.178527,0.354311,-0.022475,0.830391,-0.133064,-0.027217,...,0.184552,-0.103708,-0.239194,0.696728,0.400533,0.194764,0.460398,0.029412,0.179413,-0.198964
263521,A122G17YDFX176,B00LG9GR3S,5.0,I love the Smokies! This cd is awesome to lis...,0.571472,0.309562,-0.308959,-0.585049,-0.346509,0.253695,...,0.069977,-0.170078,-0.608539,0.385450,0.114087,0.233377,0.107992,0.056025,0.323060,-0.130090
263522,A26QVK35BBBKU8,B00LG9GR3S,5.0,Beautiful sounding CD. I felt like I was there...,-0.053734,0.295176,0.079057,0.574030,-0.216796,0.247487,...,0.146535,-0.045120,-0.291241,0.307123,0.252571,0.211896,0.395926,-0.209798,0.247477,0.032251
263523,A1V76VMZ0N3H5W,B00LG9GR3S,5.0,This is a really neat collection of sounds fro...,-0.368464,0.083159,-0.244005,0.032956,-0.379628,0.657752,...,0.434356,-0.069080,-0.364371,-0.078799,0.204484,-0.165873,0.240977,0.029032,0.147648,0.226174


In [49]:
y_train = trainset['rating'].to_numpy()
X_train = trainset.drop(['rating', 'text'], axis=1).to_dict(orient="records")

  X_train = trainset.drop(['rating', 'text'], axis=1).to_dict(orient="records")


In [50]:
v = DictVectorizer()
X_train = v.fit_transform(X_train)

In [52]:
fm = pylibfm.FM(num_factors=50, num_iter=10, verbose=True, task="regression", initial_learning_rate=0.001, learning_rate_schedule="optimal")
fm.fit(X_train,y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.41194
-- Epoch 2
Training MSE: 0.34877
-- Epoch 3
Training MSE: 0.34222
-- Epoch 4
Training MSE: 0.33754
-- Epoch 5
Training MSE: 0.33263
-- Epoch 6
Training MSE: 0.32741
-- Epoch 7
Training MSE: 0.32332
-- Epoch 8
Training MSE: 0.31902
-- Epoch 9
Training MSE: 0.31374
-- Epoch 10
Training MSE: 0.31000


In [56]:
y_test = testset['rating'].to_numpy()
X_test = testset.drop(['rating', 'text'], axis=1).to_dict(orient="records")

  X_test = testset.drop(['rating', 'text'], axis=1).to_dict(orient="records")


In [57]:
v = DictVectorizer()
X_test = v.fit_transform(X_test)

In [59]:
from sklearn.metrics import mean_squared_error

y_pred = fm.predict(X_test)
mean_squared_error(y_test, y_pred)

0.684591947419034

0.684591947419034