# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [4]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")


user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]




merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())

train = merged_df.dropna()

test = merged_df[merged_df.isnull().any(axis=1)]

print(test.to_latex())



\begin{tabular}{lrrrrrrrl}
\toprule
Empty DataFrame
Columns: Index(['Sex', ' Over60', 'key', 'user\_id', 'Critic0', ' Critic1', 'item\_id',
       'rating'],
      dtype='object')
Index: Int64Index([], dtype='int64') \\
\bottomrule
\end{tabular}



In [14]:
n_latent_features = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

user_features = user_features_df.values
item_features = item_features_df.values

print(item_features_df.to_latex())


user_features = np.concatenate([np.ones(shape = (user_features.shape[0],1)), user_features], axis = 1)
item_features = np.concatenate([np.ones(shape = (item_features.shape[0],1)), item_features], axis = 1)



user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1],item_features.shape[1] ))



# print user_features

\begin{tabular}{lrrrr}
\toprule
{} &  Critic0 &   Critic1 &  key &  item\_id \\
\midrule
0 &      0.3 &       0.9 &    0 &        0 \\
1 &      0.9 &       0.3 &    0 &        1 \\
2 &      0.6 &       0.4 &    0 &        2 \\
3 &      0.2 &       0.1 &    0 &        3 \\
4 &      0.7 &       0.8 &    0 &        4 \\
\bottomrule
\end{tabular}



In [15]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return err
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print(mse)
                    
                    
    
                    
                    
    


In [16]:
for _ in range(0,10): 
    sgd()

0.2854073152963483
0.27798355679603987
0.27606874534315085
0.2751643235985715
0.2746347227274145
0.2742893808195622
0.27405041015741666
0.27387982645584397
0.2737568572900707
0.27366920650454996


In [17]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print(user_features_weights)
print(item_features_weights)
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

  

[[ 1.785e+00  1.476e+00  8.050e-01  3.615e-01  2.081e-01]
 [ 7.344e-02  4.157e-01  4.564e-02  4.212e-01 -2.357e-02]
 [ 9.220e-01  4.247e-01  5.688e-01  5.222e-01  1.764e+00]
 [ 7.202e-01  1.000e+00  8.287e-01  7.460e-02  1.124e+00]
 [ 2.910e-01  2.196e-01  1.964e-01  9.229e-01  2.903e-01]
 [ 9.548e-05  6.016e-02  7.235e-01  3.309e-01 -1.976e+00]
 [ 1.981e-01  4.585e-01  1.019e-01  6.126e-01  1.188e-01]
 [ 7.978e-01  6.068e-01  7.558e-01  5.018e-02  2.235e-01]
 [ 7.583e-01  5.842e-01  1.955e-01  8.237e-02 -1.422e-01]
 [ 3.200e-01  8.684e-01  7.958e-01  2.460e-01 -1.136e-01]]
[[1.958 3.292 1.476 0.734 0.982]
 [0.148 0.347 0.126 0.144 0.4  ]
 [0.194 0.191 0.045 0.183 0.112]
 [1.007 2.399 2.391 2.589 0.832]
 [0.009 0.005 0.087 0.161 0.038]]


In [18]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = comparison_data.columns
comparison_data.applymap(lambda x, y: "(%2.3f|%2.3f)"%(x,y))

TypeError: ("<lambda>() missing 1 required positional argument: 'y'", 'occurred at index 0')

In [19]:
comparison_data

Unnamed: 0,0,1,2,3,4
0,"(8.0, 7.873882182792691)","(2.0, 2.399879132661209)","(nan, 17.337397437059426)","(5.0, 4.729255431714947)","(4.0, 3.997072640657815)"
1,"(3.0, 2.913149738257946)","(2.0, 2.2776695378232334)","(nan, -22.195130221867046)","(7.0, 6.811176148093585)","(7.0, 6.998100801083666)"
2,"(9.0, 8.745429655055325)","(nan, 5.025150842889373)","(7.0, 7.041576867764631)","(8.0, 8.182687333258327)","(5.0, 5.022126258991089)"
3,"(nan, 8.974248357982852)","(nan, 5.00093852081645)","(7.0, 7.0000672087929345)","(8.0, 7.998582296320544)","(9.0, 9.000037881801182)"
4,"(nan, 5.489725121588419)","(1.0, 0.6874920336813072)","(8.0, 8.011805661727271)","(3.0, 3.276898241343994)","(7.0, 7.0106314644884495)"
5,"(2.0, 2.0103051273428507)","(3.0, 2.9908787560147756)","(5.0, 4.998481001605705)","(nan, 9.111838648087307)","(nan, -70.70249838939769)"
6,"(4.0, 4.543392640782522)","(2.0, 0.2878645002327149)","(nan, 3.1355163781606263)","(2.0, 3.125806845452677)","(7.0, 7.011376211696027)"
7,"(7.0, 6.519238967612656)","(1.0, 2.7485338689988947)","(2.0, 2.0478271450898564)","(7.0, 5.815933301563712)","(9.0, 8.98211516627115)"
8,"(3.0, 3.125181147463697)","(3.0, 2.6117289045582104)","(nan, -20.65543721414572)","(7.0, 7.248056134748872)","(3.0, 3.0023506285272905)"
9,"(4.0, 4.2619819502852305)","(nan, -0.03339375691603608)","(5.0, 4.977580168408994)","(3.0, 2.8258810252288042)","(3.0, 2.9789806787836572)"


In [20]:
d = comparison_data.to_latex()
text_file = open("comparison.txt", "w")
text_file.write(d)
text_file.close()