# Context-Aware Recommendation Algorithm by PAPERDAA-2

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from itertools import product as cartesian_product

# Training

## Program Arguments

In [2]:
LMAX = 12    # Maximum transitive path length
K = 70

## Preprocessing

In [3]:
df = pd.read_csv('training_dataset.csv')

df.tail(5)

Unnamed: 0,userid,itemid,rating,Time,Location,Companion
3432,1082,tt0413267,1,Weekend,Home,Partner
3433,1082,tt1637706,2,Weekend,Home,Partner
3434,1082,tt0343660,1,Weekend,Home,Partner
3435,1082,tt1133985,1,Weekend,Home,Partner
3436,1082,tt1099212,1,Weekend,Home,Partner


In [4]:
encoder = preprocessing.LabelEncoder()

user_item_context_encodings = []
user_item_context_reverse_encodings = []
maximum_rating = df[df.columns[2]].max()

encoded_df = df.copy()
# Encode userid, itemid, and contextual informations for item splitting
for column_index in range(len(df.columns)):
    
    # Column attribute is not rating
    if column_index != 2:
        
        # Fit encoder
        encoder.fit(df[df.columns[column_index]])
        encoded_df[df.columns[column_index]] = encoder.transform(
                df[df.columns[column_index]]
            )
    
    # Column is nor user or rating
    if column_index != 2:
            user_item_context_encodings.append(
                dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
            )
            user_item_context_reverse_encodings.append(
                dict(zip(encoder.transform(encoder.classes_), encoder.classes_))
            )

In [5]:
display(encoded_df)

Unnamed: 0,userid,itemid,rating,Time,Location,Companion
0,2,38,1,0,0,0
1,2,8,1,0,0,0
2,2,74,1,0,0,0
3,2,5,1,0,0,0
4,2,2,1,0,0,0
...,...,...,...,...,...,...
3432,65,35,1,1,1,2
3433,65,62,2,1,1,2
3434,65,25,1,1,1,2
3435,65,50,1,1,1,2


## Item Splitting

In [6]:
# Cartesian product all items and contexts

users = user_item_context_encodings[0].values()
items = user_item_context_encodings[1].values()
contexts = [
    context_trans.values() for context_trans in user_item_context_encodings[2:]
]

context_T = list(cartesian_product(items, *contexts))

In [7]:
# Generate new user-item matrix for new items
rating_matrix = np.zeros((len(users), len(context_T)), dtype=object)

In [8]:
for row in encoded_df.iterrows():
    data = tuple(row[1])
    user = data[0]
    item = data[1]
    rating = data[2]
    context_item = (item, *data[3:])
    
    index = context_T.index(context_item)
    
    rating_matrix[user][index] = int(rating)/int(maximum_rating)

In [9]:
display(rating_matrix)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0.2, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1.0, 0.8, 1.0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

## Graph Similarity Calculation (User-Based)

In [10]:
rating_matrix_transposed = np.transpose(rating_matrix)

In [11]:
L = 2

WWT = np.matmul(rating_matrix, rating_matrix_transposed)
M = np.matmul(rating_matrix, rating_matrix_transposed)

while L != LMAX:
    M = np.matmul(WWT, M)
    L = L + 2

In [12]:
display(M)

array([[25150502243.81404, 21801087833.938908, 7452841819.672217, ...,
        50985618106.926506, 65454811970.57991, 20504493154.880524],
       [21801087833.93891, 20946620175.835846, 6786655950.897172, ...,
        44645749110.49383, 56254540558.10141, 18401203467.36349],
       [7452841819.672219, 6786655950.897174, 2332382609.4234204, ...,
        14717610152.052992, 19995512975.987724, 6185489934.954971],
       ...,
       [50985618106.92652, 44645749110.49382, 14717610152.052994, ...,
        108715303573.73485, 129157606785.62277, 41648475222.002914],
       [65454811970.5799, 56254540558.10141, 19995512975.987732, ...,
        129157606785.6228, 181115433689.2951, 53884229779.728546],
       [20504493154.880527, 18401203467.363495, 6185489934.954971, ...,
        41648475222.00293, 53884229779.72855, 17160088498.740337]],
      dtype=object)

# Prediction

In [13]:
USER = 1052   # User to check
N = 10

In [14]:
# # Context translation
# translated_context = []

# for cnt_index in range(len(CONTEXT)):
#     # 0 --> User
#     # 1 --> Item
#     # >= 2 --> context
#     map_index = cnt_index + 2
#     translation_table = user_item_context_encodings[map_index]
    
#     translated_context.append(translation_table[CONTEXT[cnt_index]])

# translated_context = tuple(translated_context)
# display(translated_context)

## Ratings Prediction

In [15]:
# Get K most similar users
wanted_user = user_item_context_encodings[0][USER]

scores = M[wanted_user]
K_similar_users = np.argpartition(scores, -(K+1))[-(K+1):]

if wanted_user in K_similar_users:
    K_similar_users = K_similar_users[K_similar_users != wanted_user]
else:
    K_similar_users = K_similar_users[:-1]

### Get list of rated items

In [16]:
inferred_ratings = []
user_rated_items = rating_matrix[wanted_user]

### KNN

In [17]:
for item in range(len(user_rated_items)):
    rating = user_rated_items[item]
    
    # Item has not been rated
    if rating == 0:
        rating_sum = 0
        neighbor_count = 0
        
        for neighbor in K_similar_users:
            neighbor_rating = rating_matrix[neighbor][item]
            
            # If neighbor has rated the item
            if neighbor_rating != 0:
                rating_sum += neighbor_rating
                neighbor_count += 1
            
        inferred_rating = rating_sum / neighbor_count if neighbor_count != 0 else 0
        
        if inferred_rating != 0:
            inferred_ratings.append((item, inferred_rating))

In [18]:
mapped_ratings = []
for i in inferred_ratings:
    
    # Get actual item encoding (from the cartesian product result)
    item_context = context_T[i[0]]
    
    item = user_item_context_reverse_encodings[1][item_context[0]]
    
    new_entry = [item]
    
    for j in range(1, len(item_context)):
        new_entry.append(item_context[j])
    
    new_entry.append(i[1])
    
    mapped_ratings.append(new_entry)

In [19]:
predicted_df = pd.DataFrame(
    mapped_ratings, 
    columns=[
        'Item', 
        *encoded_df.columns[3:], 
        'predicted_rating']
).sort_values(by='predicted_rating', ascending=False)
display(predicted_df)

Unnamed: 0,Item,Time,Location,Companion,predicted_rating
169,tt0211915,0,1,2,1.0
37,tt0110475,0,1,1,1.0
663,tt1707386,0,1,2,1.0
139,tt0147800,1,1,0,1.0
677,tt2096673,0,0,2,1.0
...,...,...,...,...,...
53,tt0114148,0,0,0,0.2
747,tt3637328,1,1,2,0.2
713,tt3203616,1,1,0,0.2
711,tt3203616,1,0,0,0.2


## Recommendation Generation

In [20]:
# Select N items to be recommended
chosen = []

for data in predicted_df.iterrows():
    data_tup = tuple(data[1])
      
    if len(chosen) <= N:
        chosen.append(data_tup)

for i in chosen:
    print(i)

('tt0211915', 0, 1, 2, 1.0)
('tt0110475', 0, 1, 1, 1.0)
('tt1707386', 0, 1, 2, 1.0)
('tt0147800', 1, 1, 0, 1.0)
('tt2096673', 0, 0, 2, 1.0)
('tt2096673', 0, 1, 0, 1.0)
('tt0407304', 1, 1, 2, 1.0)
('tt0138097', 1, 1, 0, 1.0)
('tt2096673', 1, 0, 1, 1.0)
('tt2096673', 1, 1, 1, 1.0)
('tt2251217', 0, 1, 1, 1.0)


## Rating Prediction Translated Result

In [21]:
mapped_ratings = []
for i in inferred_ratings:
    
    # Get actual item encoding (from the cartesian product result)
    item_context = context_T[i[0]]
    
    item = user_item_context_reverse_encodings[1][item_context[0]]
    
    new_entry = [item]
    
    # Translate remaining context
    for j in range(1, len(item_context)):
        # 1 --> item
        # >= 2 --> context
        translator = user_item_context_reverse_encodings[j+1]
        new_entry.append(translator[item_context[j]])
    
    # Append the rating
    new_entry.append(i[1]*maximum_rating)
    
    mapped_ratings.append(new_entry)

In [22]:
res = pd.DataFrame(
    mapped_ratings, 
    columns=[
        'Item', 
        *encoded_df.columns[3:], 
        'predicted_rating']
).sort_values(by='predicted_rating', ascending=False)
display(res)

Unnamed: 0,Item,Time,Location,Companion,predicted_rating
169,tt0211915,Weekday,Home,Partner,5.0
37,tt0110475,Weekday,Home,Family,5.0
663,tt1707386,Weekday,Home,Partner,5.0
139,tt0147800,Weekend,Home,Alone,5.0
677,tt2096673,Weekday,Cinema,Partner,5.0
...,...,...,...,...,...
53,tt0114148,Weekday,Cinema,Alone,1.0
747,tt3637328,Weekend,Home,Partner,1.0
713,tt3203616,Weekend,Home,Alone,1.0
711,tt3203616,Weekend,Cinema,Alone,1.0


In [23]:
res[res['Item'] == 'tt0266543']

Unnamed: 0,Item,Time,Location,Companion,predicted_rating
204,tt0266543,Weekend,Cinema,Family,4.7
202,tt0266543,Weekday,Home,Partner,4.5
200,tt0266543,Weekday,Cinema,Partner,4.333333
199,tt0266543,Weekday,Cinema,Family,4.2
205,tt0266543,Weekend,Cinema,Partner,3.9
207,tt0266543,Weekend,Home,Partner,3.833333
206,tt0266543,Weekend,Home,Alone,3.75
203,tt0266543,Weekend,Cinema,Alone,3.75
201,tt0266543,Weekday,Home,Alone,3.714286
198,tt0266543,Weekday,Cinema,Alone,3.0


## Dataset Merge

In [24]:
witheld = pd.read_csv('witheld_ratings.csv')

In [25]:
display(witheld)

Unnamed: 0,userid,itemid,rating,Time,Location,Companion
0,1066,tt4411490,1,Weekend,Cinema,Family
1,1066,tt1707386,2,Weekday,Home,Partner
2,1066,tt4411490,1,Weekday,Cinema,Family
3,1066,tt0232500,1,Weekday,Home,Partner
4,1066,tt1707386,1,Weekend,Cinema,Family
...,...,...,...,...,...,...
147,1098,tt0405422,3,Weekend,Cinema,Alone
148,1098,tt0211915,4,Weekend,Home,Family
149,1098,tt0169547,4,Weekend,Cinema,Alone
150,1098,tt0289879,1,Weekday,Home,Alone


In [26]:
actual_ratings = []

In [27]:
for row in res.iterrows():
    row_data = row[1]
    actual = witheld[
        (witheld['userid'] == USER) &\
        (witheld['itemid'] == row_data['Item']) &\
        (witheld['Time'] == row_data['Time']) &\
        (witheld['Location'] == row_data['Location']) &
        (witheld['Companion'] == row_data['Companion'])
    ]['rating']
    
    if(not actual.empty):
        actual_ratings.append(tuple(actual)[0])
        
    else:
        actual_ratings.append(np.nan)   

In [28]:
merged_result = res.copy()

In [29]:
merged_result['actual_rating'] = actual_ratings

In [30]:
merged_result['actual_rating']

169   NaN
37    NaN
663   NaN
139   NaN
677   NaN
       ..
53    NaN
747   NaN
713   NaN
711   NaN
293   NaN
Name: actual_rating, Length: 804, dtype: float64

In [31]:
merged_result[
    (merged_result['actual_rating'] != np.nan)
]

Unnamed: 0,Item,Time,Location,Companion,predicted_rating,actual_rating
169,tt0211915,Weekday,Home,Partner,5.0,
37,tt0110475,Weekday,Home,Family,5.0,
663,tt1707386,Weekday,Home,Partner,5.0,
139,tt0147800,Weekend,Home,Alone,5.0,
677,tt2096673,Weekday,Cinema,Partner,5.0,
...,...,...,...,...,...,...
53,tt0114148,Weekday,Cinema,Alone,1.0,
747,tt3637328,Weekend,Home,Partner,1.0,
713,tt3203616,Weekend,Home,Alone,1.0,
711,tt3203616,Weekend,Cinema,Alone,1.0,


In [32]:
merged_result.dropna()

Unnamed: 0,Item,Time,Location,Companion,predicted_rating,actual_rating
400,tt0454876,Weekday,Cinema,Alone,4.5,1.0
3,tt0088763,Weekday,Home,Alone,4.5,5.0
9,tt0088763,Weekend,Home,Family,4.125,4.0
407,tt0454876,Weekend,Cinema,Family,3.857143,1.0
216,tt0268380,Weekend,Home,Family,3.8,5.0
201,tt0266543,Weekday,Home,Alone,3.714286,5.0
26,tt0110357,Weekday,Home,Family,3.666667,5.0
500,tt1055369,Weekday,Home,Family,3.5,5.0
656,tt1657301,Weekend,Home,Family,3.25,3.0
278,tt0356910,Weekday,Cinema,Alone,2.333333,1.0
