In [32]:
import math
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer


In [2]:
f1 = open('user_5k_avg', 'r').read()
user_avg = eval(f1)

f2 = open('business_5k_avg', 'r').read()
business_avg = eval(f2)

f3 = open('review_5k_user', 'r').read()
review5k_user = eval(f3)

f4 = open('review_5k_business', 'r').read()
review5k_business = eval(f4)

f5 = open('review_5k_rating', 'r').read()
review5k_rating = eval(f5)

f6 = open('review_5k_text', 'r').read()
review5k_text = eval(f6)

f7 = open('relation_5k', 'r').read()
relation = eval(f7)

<h1>Parameters</h1>

In [3]:
train_user = review5k_user[0:45000]
train_business = review5k_business[0:45000]
train_rating = review5k_rating[0:45000]
train_text = review5k_text[0:45000]

test_user = review5k_user[45000:]
test_business = review5k_business[45000:]
test_rating = review5k_rating[45000:]

In [58]:
K_topic = 10 
Times = 5000
DocWord = 300
DocTopic = 30
SVD_model = 0 # [0, 1, 2]

In [66]:
# U, S, VT = randomized_svd(BasicIn, n_components=K_topic, n_iter=Times, random_state=None)
# U, S, VT = svds(BasicIn, k=K_topic, which='LM', maxiter = Times, return_singular_vectors=True)
'''rc4s = []
for n in xrange(K_topic):
    rc4s.append(n)
rc4s = np.array(rc4s)
S = csr_matrix((S,(rc4s,rc4s)), shape=(K_topic, K_topic)).toarray()
BasicOut = np.dot(np.dot(U,S),VT)'''


def SVD(In):
    svd = TruncatedSVD(n_components=K_topic, n_iter=Times, random_state=None)
    U = svd.fit_transform(In)
    VT = svd.components_
    BasicOut = np.dot(U,VT)
    return Out

<h1>No Model</h1>

In [5]:
num_user = len(user_avg)
num_business = len(business_avg)
num_train = len(train_rating)
num_test = len(test_rating)

mu = np.mean(train_rating)

ubb = []
for i in xrange(len(user_avg)):
    ubb.append([])
    for j in xrange(len(business_avg)):
        ubb[i].append(user_avg[i] + business_avg[j] - mu) 


In [7]:
UbbPd = []
for r in xrange(num_test):
    UbbPd.append(ubb[test_user[r]][test_business[r]]) 

Ubb_rmse = mean_squared_error(test_rating, UbbPd)  

print Ubb_rmse

0.854368016052


<h1>Basic Model</h1>

In [25]:
delta_basic = []
for r in xrange(len(train_rating)):
    delta_basic.append(train_rating[r] - ubb[train_user[r]][train_business[r]]) 

row = np.array(train_user)
col = np.array(train_business)
val = np.array(delta_basic)

BasicIn = csr_matrix((val,(row,col)), shape=(num_user, num_business)).toarray()

In [75]:
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
U = svd.fit_transform(BasicIn)
VT = svd.components_
BasicOut = np.dot(U,VT)

In [76]:
BasicOut

array([[  4.90997788e-04,  -1.86532125e-18,  -9.71325391e-04, ...,
         -1.23634225e-02,  -1.08976530e-02,  -2.09044077e-02],
       [ -2.84528887e-03,   7.50965024e-17,   6.14505204e-03, ...,
         -8.84735035e-02,  -1.06266987e-01,  -7.64334881e-03],
       [ -2.68832488e-03,   1.99525243e-17,   2.27058253e-03, ...,
         -1.13645805e-02,  -3.63315975e-02,  -1.73916929e-02],
       ..., 
       [  4.04948153e-04,   2.75654208e-20,   2.90080501e-04, ...,
         -6.05942588e-03,  -3.73982935e-04,  -5.13101855e-03],
       [ -3.49385933e-02,   4.81971278e-16,   2.99101389e-02, ...,
         -3.72007405e-01,  -6.18409051e-01,   1.99592743e-02],
       [  1.65008363e-03,   1.12323637e-19,   1.18202067e-03, ...,
         -2.46909619e-02,  -1.52390649e-03,  -2.09078857e-02]])

In [73]:
BasicOut = SVD(BasicIn)


In [74]:
len(BasicOut)

12

In [56]:
BasicPd = []
for r in xrange(num_test):
    BasicPd.append(ubb[test_user[r]][test_business[r]] + BasicOut[test_user[r]][test_business[r]]) 

Basic_rmse = mean_squared_error(test_rating, BasicPd)  

print Basic_rmse

0.854402441609


In [None]:
print mean_squared_error(BasicIn, BasicOut) 

<h1>Topic Model</h1>

In [None]:
tokenizer = RegexpTokenizer(r'\w+')
en_stop = get_stop_words('en')
p_stemmer = PorterStemmer()

def prep(doc):
    raw = doc.lower().replace("\n", "").replace("\t", "")
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if not i in en_stop]
    texts = [p_stemmer.stem(i) for i in stopped_tokens]
    return (" ").join(texts)

Breview = [""]*num_business
for r in xrange(num_train):
    Breview[train_business[r]] += prep(train_text[r])


In [None]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=DocWord, stop_words='english')
tf = tf_vectorizer.fit_transform(Breview)

lda = LatentDirichletAllocation(n_topics=DocTopic, max_iter=5, learning_method='online',learning_offset=50.,random_state=0)
DocTopicDist = lda.fit_transform(tf)

DocCosine_norm = []
for n in xrange(num_business):
    DocCosine_norm.append(math.sqrt(np.dot(DocTopicDist[n], DocTopicDist[n])))

for n in xrange(num_business):
    DocTopicDist[n] = 1.0*DocTopicDist[n]/DocCosine_norm[n]
    
DocSim = np.dot(DocTopicDist, DocTopicDist.T)

In [None]:
delta_temp1 = {}
for n in xrange(num_business):
    delta_temp1[n] = []
for n in xrange(num_train):
    delta_temp1[train_business[n]].append(delta_basic[n])

delta_temp2 = []
for n in xrange(num_business):
    if n in train_business:
        delta_temp2.append(np.mean(delta_temp1[n]))
    else:
        delta_temp2.append(0)

In [None]:
TopicModify = []
for n in xrange(num_business):
    TopicModify.append(np.dot(DocSim[n],delta_temp2)/sum(DocSim[n]))

In [None]:
delta_topic = []
for r in xrange(len(train_rating)):
    delta_topic.append(train_rating[r] - ubb[train_user[r]][train_business[r]] - TopicModify[train_business[r]]) 

row = np.array(train_user)
col = np.array(train_business)
val = np.array(delta_topic)
TopicIn = csr_matrix((val,(row,col)), shape=(num_user, num_business)).toarray()


In [None]:
U, S, VT = randomized_svd(TopicIn, n_components=K_topic, n_iter=Times, random_state=None)
rc4s = []
for n in xrange(K_topic):
    rc4s.append(n)
rc4s = np.array(rc4s)
S = csr_matrix((S,(rc4s,rc4s)), shape=(K_topic, K_topic)).toarray()

TopicOut = np.dot(np.dot(U,S),VT)

In [None]:
BasicPd = []
for r in xrange(num_test):
    BasicPd.append(ubb[test_user[r]][test_business[r]] + TopicModify[test_business[r]] + BasicOut[test_user[r]][test_business[r]]) 

Basic_rmse = mean_squared_error(test_rating, BasicPd)  

print Basic_rmse

<h1>Topic Model</h1>

In [None]:
import networkx as nx

node_list = []
for u in xrange(num_user):
    node_list.append(u)

G=nx.Graph()
G.add_nodes_from(node_list)
for u in relation:
    for f in u[1]:
        G.add_edge(u[0], f)

PR = nx.pagerank(G)


In [None]:
from operator import itemgetter
sorted_PR = sorted(PR.items(), key=itemgetter(1), reverse=True)

rank_dict = {}
for u in xrange(num_user):
    rank_dict[sorted_PR[u][0]] = u
    
rank_list = []
for u in xrange(num_user):
    rank_list.append(rank_dict[u]+1)

rank_weight = []
for ri in rank_list:
    rank_weight.append(1.0/(1.0+ math.log(ri)))  


In [None]:
delta_temp3 = {}
for n in xrange(num_user):
    delta_temp3[n] = []
for n in xrange(num_train):
    delta_temp3[train_user[n]].append(delta_basic[n])

delta_temp4 = []
for n in xrange(num_user):
    if n in train_user:
        delta_temp4.append(np.mean(delta_temp3[n]))
    else:
        delta_temp4.append(0)

In [None]:
SocialModify = []
for n in xrange(num_user):
    SocialModify.append(np.dot(DocSim[n],delta_temp4)/sum(DocSim[n]))