# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [8]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")


user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]




merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())

train = merged_df.dropna()

test = merged_df[merged_df.isnull().any(axis=1)]

print("LATEX")
print()
print (test.to_latex())
print()
print()
print("NORMAL")
print()
print(test)



LATEX

\begin{tabular}{lrrrrrrrl}
\toprule
Empty DataFrame
Columns: Index(['Sex', ' Over60', 'key', 'user\_id', 'Critic0', ' Critic1', 'item\_id',
       'rating'],
      dtype='object')
Index: Int64Index([], dtype='int64') \\
\bottomrule
\end{tabular}



NORMAL

Empty DataFrame
Columns: [Sex,  Over60, key, user_id, Critic0,  Critic1, item_id, rating]
Index: []


In [13]:
n_latent_features = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

user_features = user_features_df.values
item_features = item_features_df.values

print (item_features_df.to_latex())


user_features = np.concatenate([np.ones(shape = (user_features.shape[0],1)), user_features], axis = 1)
item_features = np.concatenate([np.ones(shape = (item_features.shape[0],1)), item_features], axis = 1)



user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1],item_features.shape[1] ))



# print (user_features)

\begin{tabular}{lrrrr}
\toprule
{} &  Critic0 &   Critic1 &  key &  item\_id \\
\midrule
0 &      0.3 &       0.9 &    0 &        0 \\
1 &      0.9 &       0.3 &    0 &        1 \\
2 &      0.6 &       0.4 &    0 &        2 \\
3 &      0.2 &       0.1 &    0 &        3 \\
4 &      0.7 &       0.8 &    0 &        4 \\
\bottomrule
\end{tabular}



In [15]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return err
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print (mse)
                    
                    
    
                    
                    
    


In [16]:
for _ in range(0,10): 
    sgd()

0.37457957704349976
0.36808171334980255
0.36566905054225746
0.3643768496325834
0.36356060791281014
0.3629992911392709
0.3625953138530224
0.36229848027148026
0.36208026037201274
0.3619233201993418


In [18]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print (user_features_weights)
print (item_features_weights)
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

  

[[ 0.701  1.324  0.985  0.94   0.146]
 [ 0.509  0.216  0.751  0.723  0.228]
 [ 1.062  0.264  0.959  0.397  1.248]
 [ 0.891  1.126  0.019  0.971  0.922]
 [ 1.133  0.63   1.196  0.206  2.467]
 [ 0.806  0.871  0.597  0.526  0.172]
 [ 0.195  0.933  0.319  0.893 -0.397]
 [ 0.067  0.523  0.572  0.039  0.005]
 [ 0.257  0.511  0.768  0.075  0.351]
 [-0.04   0.517  0.64   0.721 -0.017]]
[[1.476 4.141 1.007 0.572 0.747]
 [0.013 0.006 0.014 0.008 0.01 ]
 [1.325 0.928 1.492 1.771 0.71 ]
 [1.353 0.305 0.836 1.442 0.787]
 [0.423 0.923 0.386 0.71  0.356]]


In [21]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = user_ratings_df.columns
#comparison_data.applymap(lambda (x,y): "(%2.3f|%2.3f)"%(x,y))

In [22]:
comparison_data

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 0.0)","(2.0, 0.0)","(nan, 0.0)","(5.0, 0.0)","(4.0, 0.0)"
1,"(3.0, 0.0)","(2.0, 0.0)","(nan, 0.0)","(7.0, 0.0)","(7.0, 0.0)"
2,"(9.0, 0.0)","(nan, 0.0)","(7.0, 0.0)","(8.0, 0.0)","(5.0, 0.0)"
3,"(nan, 0.0)","(nan, 0.0)","(7.0, 0.0)","(8.0, 0.0)","(9.0, 0.0)"
4,"(nan, 0.0)","(1.0, 0.0)","(8.0, 0.0)","(3.0, 0.0)","(7.0, 0.0)"
5,"(2.0, 0.0)","(3.0, 0.0)","(5.0, 0.0)","(nan, 0.0)","(nan, 0.0)"
6,"(4.0, 0.0)","(2.0, 0.0)","(nan, 0.0)","(2.0, 0.0)","(7.0, 0.0)"
7,"(7.0, 0.0)","(1.0, 0.0)","(2.0, 0.0)","(7.0, 0.0)","(9.0, 0.0)"
8,"(3.0, 0.0)","(3.0, 0.0)","(nan, 0.0)","(7.0, 0.0)","(3.0, 0.0)"
9,"(4.0, 0.0)","(nan, 0.0)","(5.0, 0.0)","(3.0, 0.0)","(3.0, 0.0)"


In [25]:
d = comparison_data.to_latex()
text_file = open("comparison_features.txt", "w")
text_file.write(d)
text_file.close()