# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [18]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")


user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]

#print(merged_df)


merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())
print(merged_df)
train = merged_df.dropna()

test = merged_df[merged_df.isnull().any(axis=1)]

print (test.to_latex())



   Sex   Over60  key  user_id  Critic0   Critic1  item_id  \
0  1.0      0.0    0        0      0.3       0.9        0   
1  1.0      0.0    0        0      0.9       0.3        1   
2  1.0      0.0    0        0      0.6       0.4        2   
3  1.0      0.0    0        0      0.2       0.1        3   
4  1.0      0.0    0        0      0.7       0.8        4   
0  0.0      1.0    0        1      0.3       0.9        0   
1  0.0      1.0    0        1      0.9       0.3        1   
2  0.0      1.0    0        1      0.6       0.4        2   
3  0.0      1.0    0        1      0.2       0.1        3   
4  0.0      1.0    0        1      0.7       0.8        4   
0  0.0      0.0    0        2      0.3       0.9        0   
1  0.0      0.0    0        2      0.9       0.3        1   
2  0.0      0.0    0        2      0.6       0.4        2   
3  0.0      0.0    0        2      0.2       0.1        3   
4  0.0      0.0    0        2      0.7       0.8        4   
0  1.0      0.0    0    

In [32]:
n_latent_features = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

user_features = user_features_df.values
item_features = item_features_df.values

print (item_features_df.to_latex())


user_features = np.concatenate([np.ones(shape = (user_features.shape[0],1)), user_features], axis = 1)
item_features = np.concatenate([np.ones(shape = (item_features.shape[0],1)), item_features], axis = 1)



user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1],item_features.shape[1] ))



print (user_features)

\begin{tabular}{lrrrr}
\toprule
{} &  Critic0 &   Critic1 &  key &  item\_id \\
\midrule
0 &      0.3 &       0.9 &    0 &        0 \\
1 &      0.9 &       0.3 &    0 &        1 \\
2 &      0.6 &       0.4 &    0 &        2 \\
3 &      0.2 &       0.1 &    0 &        3 \\
4 &      0.7 &       0.8 &    0 &        4 \\
\bottomrule
\end{tabular}

[[ 1.  1.  0.  0.  0.]
 [ 1.  0.  1.  0.  1.]
 [ 1.  0.  0.  0.  2.]
 [ 1.  1.  0.  0.  3.]
 [ 1.  0.  1.  0.  4.]
 [ 1.  0.  0.  0.  5.]
 [ 1.  0.  0.  0.  6.]
 [ 1.  1.  0.  0.  7.]
 [ 1.  0.  1.  0.  8.]
 [ 1.  1.  0.  0.  9.]]


In [35]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return err
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print (mse)
                    
                    
    
                    
                    
    


In [36]:
for _ in range(0,10): 
    sgd()

0.2797756832
0.279035020935
0.278536034099
0.278181076358
0.27791980304
0.277723613096
0.277575079996
0.277463023784
0.277379995762
0.277320900812


In [37]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print (user_features_weights)
print (item_features_weights)
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

  

[[-0.943 -1.421  0.926  0.77   0.8  ]
 [ 2.024  0.568  2.394  0.187  1.833]
 [ 0.917  0.056  0.102  0.878  0.523]
 [ 0.462  0.802  0.024  0.988  0.651]
 [ 0.594  0.043  0.271  0.365 -0.271]
 [ 0.348  0.991  0.467  0.395  1.36 ]
 [ 0.588  0.237  0.445  0.887 -0.303]
 [-0.03   0.716  0.703  0.432  0.096]
 [ 0.383  0.64   0.707  0.714  0.626]
 [ 0.298  0.644  0.805  0.014 -0.308]]
[[  1.308e+00   1.087e+00   1.117e+00   1.221e-01   2.076e+00]
 [  2.711e-04   6.156e-05   9.362e-04   4.220e-04   4.737e-04]
 [  3.275e+00   2.616e+00   1.146e+00   9.049e-01   1.198e+00]
 [  1.060e+00   4.767e-01   6.682e-01   3.724e-01   3.573e-01]
 [  2.986e+00   2.449e-01   8.423e-01   4.871e+00   1.546e+00]]


In [40]:
data = pd.read_csv("user_ratings.csv")
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = data.columns
comparison_data.applymap(lambda xy: "(%2.3f|%2.3f)"%xy)

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,(8.000|7.987),(2.000|2.047),(nan|-25.528),(5.000|4.974),(4.000|3.994)
1,(3.000|2.897),(2.000|2.550),(nan|75.622),(7.000|6.627),(7.000|6.928)
2,(9.000|9.037),(nan|4.391),(7.000|7.002),(8.000|7.959),(5.000|5.000)
3,(nan|8.968),(nan|4.936),(7.000|7.000),(8.000|8.000),(9.000|8.999)
4,(nan|3.882),(1.000|0.572),(8.000|7.990),(3.000|3.369),(7.000|7.052)
5,(2.000|2.001),(3.000|2.996),(5.000|5.000),(nan|2.172),(nan|34.115)
6,(4.000|4.319),(2.000|0.431),(nan|-4.528),(2.000|3.002),(7.000|7.198)
7,(7.000|6.750),(1.000|2.905),(2.000|2.082),(7.000|5.667),(9.000|8.769)
8,(3.000|3.103),(3.000|2.521),(nan|84.979),(7.000|7.289),(3.000|3.059)
9,(4.000|3.903),(nan|-0.144),(5.000|4.987),(3.000|3.098),(3.000|2.997)


In [41]:
comparison_data

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 7.98675823097)","(2.0, 2.04743717058)","(nan, -25.5276898089)","(5.0, 4.97363905813)","(4.0, 3.99406295366)"
1,"(3.0, 2.89725495427)","(2.0, 2.5501991579)","(nan, 75.6215311901)","(7.0, 6.62658043591)","(7.0, 6.9284063685)"
2,"(9.0, 9.03712287003)","(nan, 4.39092033642)","(7.0, 7.00223918848)","(8.0, 7.95890513661)","(5.0, 5.00032828523)"
3,"(nan, 8.96791850076)","(nan, 4.93584674954)","(7.0, 7.0000006414)","(8.0, 7.99957124096)","(9.0, 8.99930895475)"
4,"(nan, 3.88192830242)","(1.0, 0.572338108271)","(8.0, 7.98968606008)","(3.0, 3.36882581388)","(7.0, 7.05163686256)"
5,"(2.0, 2.00128247535)","(3.0, 2.99558902041)","(5.0, 5.00000600523)","(nan, 2.17236635258)","(nan, 34.1149236006)"
6,"(4.0, 4.31924289614)","(2.0, 0.431069241011)","(nan, -4.52828396077)","(2.0, 3.00181258827)","(7.0, 7.19830376194)"
7,"(7.0, 6.74959992432)","(1.0, 2.90477097743)","(2.0, 2.0823817796)","(7.0, 5.66690790463)","(9.0, 8.76857021641)"
8,"(3.0, 3.10312397028)","(3.0, 2.52141078085)","(nan, 84.9785709291)","(7.0, 7.28931486962)","(3.0, 3.05856997919)"
9,"(4.0, 3.90262870684)","(nan, -0.144183585255)","(5.0, 4.98655624175)","(3.0, 3.09762932916)","(3.0, 2.99745369609)"


In [42]:
d = comparison_data.to_latex()
text_file = open("comparison.txt", "w")
text_file.write(d)
text_file.close()