# Context-Aware Recommendation Algorithm by PAPERDAA-2

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import preprocessing
from itertools import product as cartesian_product

# Training

## Program Arguments

In [2]:
LMAX = 10    # Maximum transitive path length
K = 50       # Nearest neighbors

## Preprocessing

In [3]:
df = pd.read_csv('training_dataset.csv')

df.tail(5)

Unnamed: 0,userid,itemid,rating,Time,Location,Companion
3280,1082,tt0413267,1,Weekend,Home,Partner
3281,1082,tt1637706,2,Weekend,Home,Partner
3282,1082,tt0343660,1,Weekend,Home,Partner
3283,1082,tt1133985,1,Weekend,Home,Partner
3284,1082,tt1099212,1,Weekend,Home,Partner


In [4]:
encoder = preprocessing.LabelEncoder()

user_item_context_encodings = []
user_item_context_reverse_encodings = []
maximum_rating = df[df.columns[2]].max()

encoded_df = df.copy()
# Encode userid, itemid, and contextual informations for item splitting
for column_index in range(len(df.columns)):
    
    # Column attribute is not rating
    if column_index != 2:
        
        # Fit encoder
        encoder.fit(df[df.columns[column_index]])
        encoded_df[df.columns[column_index]] = encoder.transform(
                df[df.columns[column_index]]
            )
    
    # Column is nor user or rating
    if column_index != 2:
            user_item_context_encodings.append(
                dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
            )
            user_item_context_reverse_encodings.append(
                dict(zip(encoder.transform(encoder.classes_), encoder.classes_))
            )

In [5]:
display(encoded_df)

Unnamed: 0,userid,itemid,rating,Time,Location,Companion
0,2,38,1,0,0,0
1,2,8,1,0,0,0
2,2,74,1,0,0,0
3,2,5,1,0,0,0
4,2,2,1,0,0,0
...,...,...,...,...,...,...
3280,65,35,1,1,1,2
3281,65,62,2,1,1,2
3282,65,25,1,1,1,2
3283,65,50,1,1,1,2


## Item Splitting

In [6]:
# Cartesian product all items and contexts

users = user_item_context_encodings[0].values()
items = user_item_context_encodings[1].values()
contexts = [
    context_trans.values() for context_trans in user_item_context_encodings[2:]
]

context_T = list(cartesian_product(items, *contexts))

In [7]:
# Generate new user-item matrix for new items
rating_matrix = np.zeros((len(users), len(context_T)), dtype=object)

In [8]:
for row in encoded_df.iterrows():
    data = tuple(row[1])
    user = data[0]
    item = data[1]
    rating = data[2]
    context_item = (item, *data[3:])
    
    index = context_T.index(context_item)
    
    rating_matrix[user][index] = int(rating)/int(maximum_rating)

In [9]:
display(rating_matrix)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0.2, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1.0, 0.8, 1.0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

## Graph Similarity Calculation (Item-Based)

In [10]:
rating_matrix_transposed = np.transpose(rating_matrix)

In [11]:
L = 2

WTW = np.matmul(rating_matrix_transposed, rating_matrix)
M = np.matmul(rating_matrix_transposed, rating_matrix)

while L != LMAX:
    M = np.matmul(WTW, M)
    L = L + 2

In [12]:
display(M)

array([[107249305.86998227, 129220059.68791185, 169283056.89488375, ...,
        11344696.704404594, 33275742.06576794, 66366151.081740186],
       [129220059.68791181, 216353066.99464956, 204667544.85426384, ...,
        17637142.467116836, 47381977.57878666, 116372165.82439996],
       [169283056.89488363, 204667544.85426393, 300185605.69188917, ...,
        18684110.6215468, 55265550.599811696, 99062848.5072651],
       ...,
       [11344696.704404578, 17637142.467116848, 18684110.6215468, ...,
        1659889.5675546625, 4105716.4298609686, 10268219.308705483],
       [33275742.06576792, 47381977.578786604, 55265550.59981168, ...,
        4105716.4298609695, 11850865.005202642, 24729966.160559],
       [66366151.08174018, 116372165.8244, 99062848.50726505, ...,
        10268219.30870549, 24729966.160559013, 80208593.95856671]],
      dtype=object)

# Prediction

In [13]:
USER = 1058   # User to check
N = 10

## Ratings Prediction

### Get list of rated items

In [14]:
wanted_user = user_item_context_encodings[0][USER]

inferred_ratings = []
user_rated_items = rating_matrix[wanted_user]

In [15]:
def find_item_neighbors(user, item, rating_matrix, similarity_matrix, K):
    corr_items_similarity = similarity_matrix[item]
    
    user_rated_items = rating_matrix[user]
    rated_item_indexes = []
    
    for item_index in range(len(user_rated_items)):
        
        # User has rated the item
        if user_rated_items[item_index] != 0:
            rated_item_indexes.append(item_index)
    
    filtered_similarity_scores = []
    
    # Get neighbors similarity score
    for item_index in rated_item_indexes:
        if item != item_index:
            filtered_similarity_scores.append(
                (item_index, corr_items_similarity[item_index])
            )
    
    # Sort based on simlarity scores 
    # (tuple is in (item_index, sim_score) format)
    filtered_similarity_scores = sorted(filtered_similarity_scores, 
                                       key=lambda x: x[1], 
                                       reverse=True
                                    )
    
    item_neighbors = []
    
    # Filter top K similar items
    for i in range(K):
        similar_item = filtered_similarity_scores[i]
        item_neighbors.append(similar_item[0])
    
    return item_neighbors

### Item-KNN

In [16]:
for item in range(len(user_rated_items)):
    rating = user_rated_items[item]
    
    # Item has not been rated
    if rating == 0:
        rating_sum = 0
        neighbor_count = 0
        
        # Find k nearest item neighbors
        nearest_neighbors = find_item_neighbors(
                                wanted_user,
                                item,
                                rating_matrix,
                                M,
                                K
                            )
        
        # Nearest neighbors are represented as their indexes
        # in the rating matrix
        for item_neighbor in nearest_neighbors:
            neighbor_rating = rating_matrix[wanted_user][item_neighbor]
            
            rating_sum += neighbor_rating
            neighbor_count += 1
            
        inferred_rating = rating_sum / neighbor_count if neighbor_count != 0 else 0
        
        if inferred_rating != 0:
            inferred_ratings.append((item, inferred_rating))

In [17]:
mapped_ratings = []
for i in inferred_ratings:
    
    # Get actual item encoding (from the cartesian product result)
    item_context = context_T[i[0]]
    
    item = user_item_context_reverse_encodings[1][item_context[0]]
    
    new_entry = [item]
    
    for j in range(1, len(item_context)):
        new_entry.append(item_context[j])
    
    new_entry.append(i[1])
    
    mapped_ratings.append(new_entry)

In [18]:
predicted_df = pd.DataFrame(
    mapped_ratings, 
    columns=[
        'Item', 
        *encoded_df.columns[3:], 
        'predicted_rating']
).sort_values(by='predicted_rating', ascending=False)
display(predicted_df)

Unnamed: 0,Item,Time,Location,Companion,predicted_rating
231,tt0293662,1,1,2,0.724
443,tt0800369,0,1,0,0.724
753,tt2574698,0,1,0,0.724
448,tt0800369,1,1,2,0.724
695,tt1707386,0,1,0,0.724
...,...,...,...,...,...
476,tt0945513,1,1,0,0.608
318,tt0376541,1,1,2,0.608
739,tt2251217,1,1,2,0.608
740,tt2557490,0,0,0,0.608


## Recommendation Generation

In [19]:
# Select N items to be recommended
chosen = []

for data in predicted_df.iterrows():
    data_tup = tuple(data[1])
      
    if len(chosen) <= N:
        chosen.append(data_tup)

for i in chosen:
    print(i)

('tt0293662', 1, 1, 2, 0.7240000000000001)
('tt0800369', 0, 1, 0, 0.7240000000000001)
('tt2574698', 0, 1, 0, 0.7240000000000001)
('tt0800369', 1, 1, 2, 0.7240000000000001)
('tt1707386', 0, 1, 0, 0.7240000000000001)
('tt3637328', 1, 0, 0, 0.7240000000000001)
('tt0462538', 0, 0, 2, 0.7240000000000001)
('tt0268380', 0, 1, 0, 0.7240000000000001)
('tt0289879', 1, 1, 2, 0.7240000000000001)
('tt0378194', 1, 1, 2, 0.7240000000000001)
('tt0147800', 1, 1, 2, 0.7240000000000001)


## Rating Prediction Translated Result

In [20]:
mapped_ratings = []
for i in inferred_ratings:
    
    # Get actual item encoding (from the cartesian product result)
    item_context = context_T[i[0]]
    
    item = user_item_context_reverse_encodings[1][item_context[0]]
    
    new_entry = [item]
    
    # Translate remaining context
    for j in range(1, len(item_context)):
        # 1 --> item
        # >= 2 --> context
        translator = user_item_context_reverse_encodings[j+1]
        new_entry.append(translator[item_context[j]])
    
    # Append the rating
    new_entry.append(i[1]*maximum_rating)
    
    mapped_ratings.append(new_entry)

In [21]:
res = pd.DataFrame(
    mapped_ratings, 
    columns=[
        'Item', 
        *encoded_df.columns[3:], 
        'predicted_rating']
).sort_values(by='predicted_rating', ascending=False)
display(res)

Unnamed: 0,Item,Time,Location,Companion,predicted_rating
231,tt0293662,Weekend,Home,Partner,3.62
443,tt0800369,Weekday,Home,Alone,3.62
753,tt2574698,Weekday,Home,Alone,3.62
448,tt0800369,Weekend,Home,Partner,3.62
695,tt1707386,Weekday,Home,Alone,3.62
...,...,...,...,...,...
476,tt0945513,Weekend,Home,Alone,3.04
318,tt0376541,Weekend,Home,Partner,3.04
739,tt2251217,Weekend,Home,Partner,3.04
740,tt2557490,Weekday,Cinema,Alone,3.04


In [22]:
res[res['Item'] == 'tt0266543']

Unnamed: 0,Item,Time,Location,Companion,predicted_rating
193,tt0266543,Weekday,Home,Alone,3.62
198,tt0266543,Weekend,Home,Partner,3.62
196,tt0266543,Weekend,Home,Alone,3.6
191,tt0266543,Weekday,Cinema,Family,3.6
195,tt0266543,Weekend,Cinema,Family,3.6
194,tt0266543,Weekday,Home,Partner,3.58
190,tt0266543,Weekday,Cinema,Alone,3.56
197,tt0266543,Weekend,Home,Family,3.56
192,tt0266543,Weekday,Cinema,Partner,3.54


## Dataset Merge

In [23]:
witheld = pd.read_csv('witheld_ratings.csv')

In [24]:
display(witheld)

Unnamed: 0,userid,itemid,rating,Time,Location,Companion
0,1067,tt0378194,2,Weekday,Cinema,Partner
1,1067,tt0111161,1,Weekday,Cinema,Partner
2,1067,tt1632708,1,Weekday,Cinema,Partner
3,1067,tt0356910,4,Weekend,Cinema,Partner
4,1067,tt0378194,1,Weekend,Cinema,Partner
...,...,...,...,...,...,...
147,1058,tt0319262,3,Weekday,Home,Family
148,1058,tt1657301,3,Weekday,Home,Family
149,1058,ttnanana1,2,Weekday,Cinema,Partner
150,1058,tt3793764,3,Weekday,Home,Family


In [25]:
actual_ratings = []

In [26]:
for row in res.iterrows():
    row_data = row[1]
    actual = witheld[
        (witheld['userid'] == USER) &\
        (witheld['itemid'] == row_data['Item']) &\
        (witheld['Time'] == row_data['Time']) &\
        (witheld['Location'] == row_data['Location']) &
        (witheld['Companion'] == row_data['Companion'])
    ]['rating']
    
    if(not actual.empty):
        actual_ratings.append(tuple(actual)[0])
        
    else:
        actual_ratings.append(np.nan)   

In [27]:
merged_result = res.copy()

In [28]:
merged_result['actual_rating'] = actual_ratings

In [29]:
merged_result['actual_rating']

231   NaN
443   NaN
753   NaN
448   NaN
695   NaN
       ..
476   NaN
318   NaN
739   NaN
740   NaN
773   NaN
Name: actual_rating, Length: 857, dtype: float64

In [30]:
merged_result[
    (merged_result['actual_rating'] != np.nan)
]

Unnamed: 0,Item,Time,Location,Companion,predicted_rating,actual_rating
231,tt0293662,Weekend,Home,Partner,3.62,
443,tt0800369,Weekday,Home,Alone,3.62,
753,tt2574698,Weekday,Home,Alone,3.62,
448,tt0800369,Weekend,Home,Partner,3.62,
695,tt1707386,Weekday,Home,Alone,3.62,
...,...,...,...,...,...,...
476,tt0945513,Weekend,Home,Alone,3.04,
318,tt0376541,Weekend,Home,Partner,3.04,
739,tt2251217,Weekend,Home,Partner,3.04,
740,tt2557490,Weekday,Cinema,Alone,3.04,


In [31]:
compared_df = merged_result.dropna()
display(compared_df)

Unnamed: 0,Item,Time,Location,Companion,predicted_rating,actual_rating
149,tt0181689,Weekday,Home,Family,3.62,5.0
630,tt1499658,Weekend,Cinema,Partner,3.62,3.0
26,tt0110357,Weekend,Cinema,Partner,3.62,3.0
279,tt0356910,Weekday,Home,Family,3.58,4.0
808,tt3793764,Weekday,Home,Family,3.58,3.0
686,tt1657301,Weekday,Home,Family,3.58,3.0
323,tt0378194,Weekday,Home,Family,3.58,3.0
620,tt1478338,Weekend,Cinema,Partner,3.56,3.0
248,tt0319262,Weekday,Home,Family,3.56,3.0
333,tt0382625,Weekday,Home,Family,3.56,4.0


In [32]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [33]:
print("MAE", mean_absolute_error(
    compared_df['actual_rating'],
    compared_df['predicted_rating']
))

print("RMSE", mean_squared_error(
    compared_df['actual_rating'],
    compared_df['predicted_rating']
))

MAE 0.9756521739130434
RMSE 1.3931478260869568
