# Context-Aware Recommendation Algorithm Item KNN

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, mean_absolute_error
from itertools import product as cartesian_product

# Training

## Program Arguments

In [2]:
k = 15      # specifies the number of item-to-item similarities that will be stored for each item

## Preprocessing

In [3]:
df = pd.read_csv('training_dataset.csv')

df.tail(5)

Unnamed: 0,userid,itemid,rating,Time,Location,Companion
3432,1082,tt0413267,1,Weekend,Home,Partner
3433,1082,tt1637706,2,Weekend,Home,Partner
3434,1082,tt0343660,1,Weekend,Home,Partner
3435,1082,tt1133985,1,Weekend,Home,Partner
3436,1082,tt1099212,1,Weekend,Home,Partner


In [4]:
encoder = preprocessing.LabelEncoder()

user_item_context_encodings = []
user_item_context_reverse_encodings = []
maximum_rating = df[df.columns[2]].max()

encoded_df = df.copy()
# Encode userid, itemid, and contextual informations for item splitting
for column_index in range(len(df.columns)):
    
    # Column attribute is not rating
    if column_index != 2:
        
        # Fit encoder
        encoder.fit(df[df.columns[column_index]])
        encoded_df[df.columns[column_index]] = encoder.transform(
                df[df.columns[column_index]]
            )
    
    # Column is nor user or rating
    if column_index != 2:
            user_item_context_encodings.append(
                dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
            )
            user_item_context_reverse_encodings.append(
                dict(zip(encoder.transform(encoder.classes_), encoder.classes_))
            )

In [5]:
display(encoded_df)

Unnamed: 0,userid,itemid,rating,Time,Location,Companion
0,2,38,1,0,0,0
1,2,8,1,0,0,0
2,2,74,1,0,0,0
3,2,5,1,0,0,0
4,2,2,1,0,0,0
...,...,...,...,...,...,...
3432,65,35,1,1,1,2
3433,65,62,2,1,1,2
3434,65,25,1,1,1,2
3435,65,50,1,1,1,2


## Item Splitting

In [6]:
# Cartesian product all items and contexts

users = user_item_context_encodings[0].values()
items = user_item_context_encodings[1].values()
contexts = [
    context_trans.values() for context_trans in user_item_context_encodings[2:]
]

context_T = list(cartesian_product(items, *contexts))

[(0, 0, 0, 0),
 (0, 0, 0, 1),
 (0, 0, 0, 2),
 (0, 0, 1, 0),
 (0, 0, 1, 1),
 (0, 0, 1, 2),
 (0, 1, 0, 0),
 (0, 1, 0, 1),
 (0, 1, 0, 2),
 (0, 1, 1, 0),
 (0, 1, 1, 1),
 (0, 1, 1, 2),
 (1, 0, 0, 0),
 (1, 0, 0, 1),
 (1, 0, 0, 2),
 (1, 0, 1, 0),
 (1, 0, 1, 1),
 (1, 0, 1, 2),
 (1, 1, 0, 0),
 (1, 1, 0, 1),
 (1, 1, 0, 2),
 (1, 1, 1, 0),
 (1, 1, 1, 1),
 (1, 1, 1, 2),
 (2, 0, 0, 0),
 (2, 0, 0, 1),
 (2, 0, 0, 2),
 (2, 0, 1, 0),
 (2, 0, 1, 1),
 (2, 0, 1, 2),
 (2, 1, 0, 0),
 (2, 1, 0, 1),
 (2, 1, 0, 2),
 (2, 1, 1, 0),
 (2, 1, 1, 1),
 (2, 1, 1, 2),
 (3, 0, 0, 0),
 (3, 0, 0, 1),
 (3, 0, 0, 2),
 (3, 0, 1, 0),
 (3, 0, 1, 1),
 (3, 0, 1, 2),
 (3, 1, 0, 0),
 (3, 1, 0, 1),
 (3, 1, 0, 2),
 (3, 1, 1, 0),
 (3, 1, 1, 1),
 (3, 1, 1, 2),
 (4, 0, 0, 0),
 (4, 0, 0, 1),
 (4, 0, 0, 2),
 (4, 0, 1, 0),
 (4, 0, 1, 1),
 (4, 0, 1, 2),
 (4, 1, 0, 0),
 (4, 1, 0, 1),
 (4, 1, 0, 2),
 (4, 1, 1, 0),
 (4, 1, 1, 1),
 (4, 1, 1, 2),
 (5, 0, 0, 0),
 (5, 0, 0, 1),
 (5, 0, 0, 2),
 (5, 0, 1, 0),
 (5, 0, 1, 1),
 (5, 0, 1, 2),
 (5, 1, 0,

In [28]:
# Generate new user-item matrix for new items
rating_matrix = np.zeros((len(users), len(context_T)), dtype=object)

In [29]:
for row in encoded_df.iterrows():
    data = tuple(row[1])
    user = data[0]
    item = data[1]
    rating = data[2]
    context_item = (item, *data[3:])
    
    index = context_T.index(context_item)
    
    rating_matrix[user][index] = int(rating)/int(maximum_rating)

In [30]:
display(rating_matrix)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0.2, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1.0, 0.8, 1.0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

## Item Similarity Approach (Cosine Similarity)

In [31]:
def get_column_vector(matrix, index):
    column_vector = []

    for row in matrix:
        column_vector.append(row[index])

    return column_vector


def dot_product(c1, c2):
    result = 0
    if (len(c1) == len(c2)):
        for i in range(len(c1)):
            result += c1[i] * c2[i]

    return result


def vector_length(vector):
    result = 0

    for integer in vector:
        result += integer * integer
    
    result = result**0.5
    return result if result != 0 else False


def cosine_similarity(c1, c2):
    l1 = vector_length(c1) 
    l2 = vector_length(c2)
    
    if l1 > 0 and l2 > 0:
        return dot_product(c1, c2) / (l1 * l2)
    else:
        return False

def get_k_largest(vector: list, k):
    k_largest = []
    while k > 0 and max(vector) != -999999:
        largest = max(vector)
        k_largest.append(largest)
        vector[vector.index(largest)] = -999999

        k -= 1
    
    return k_largest




In [1]:
M = [[0 for j in range(len(rating_matrix[0]))] for i in range(len(rating_matrix))]

for j in range(len(rating_matrix)):
    for i in range(len(rating_matrix)):
        if i != j:
            columnI = get_column_vector(rating_matrix, i)
            columnJ = get_column_vector(rating_matrix, j)
            sim = cosine_similarity(columnI, columnJ)
            if (sim):
                M[i][j] = sim
            else:
                M[i][j] = 0
    # for i in range(len(rating_matrix)):
    #     columnI = get_column_vector(rating_matrix, i)
    #     k_largest = get_k_largest(columnI, k)
    #     if M[i][j] not in k_largest:
    #         M[i][j] = 0

display(M)


NameError: name 'rating_matrix' is not defined

In [49]:

# apply model
from random import randint

temp = randint(0, len(rating_matrix))
print(temp)
U = rating_matrix[temp]
display(U)
x = np.matmul(M, U)
display(x)

77


array([0.6, 0, 0.6, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 1.0, 0, 0.8, 0, 0, 0, 0,
       0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1.0, 0, 1.0, 0, 0, 0, 0, 0, 0.8, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0.6, 0, 0.8, 0, 0, 0, 0, 0, 0.8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1.0, 0, 1.0, 0, 0, 0, 0, 0, 0.6, 0, 0, 0, 0.6, 0,
       1.0, 0, 0, 0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.4, 0, 0.4, 0, 0, 0, 0, 0,
       0.4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0.4, 0,
       0, 0, 0, 0, 0.6, 0, 0, 0, 1.0, 0, 0.8, 0, 0, 0, 0, 0, 1.0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0.8, 0, 0, 0, 0, 0,
       0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0.6, 0, 0.8, 0

array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 0.0], dtype=object)

# Prediction

In [50]:
# USER = 1052   # User to check
N = 10

## Ratings Prediction

In [51]:

def rating_prediction(USER):
    # Get K most similar users
    wanted_user = user_item_context_encodings[0][USER]
    scores = M[wanted_user]
    K_similar_users = np.argpartition(scores, -(K+1))[-(K+1):]

    if wanted_user in K_similar_users:
        K_similar_users = K_similar_users[K_similar_users != wanted_user]
    else:
        K_similar_users = K_similar_users[:-1]

    return K_similar_users, wanted_user

### Get list of rated items

In [52]:
def get_rated_items(wanted_user):
    return rating_matrix[wanted_user]

### KNN

In [53]:

def KNN(user_rated_items, K_similar_users):
    inferred_ratings = []

    for item in range(len(user_rated_items)):
        rating = user_rated_items[item]
        
        # Item has not been rated
        if rating == 0:
            rating_sum = 0
            neighbor_count = 0
            
            for neighbor in K_similar_users:
                neighbor_rating = rating_matrix[neighbor][item]
                
                # If neighbor has rated the item
                if neighbor_rating != 0:
                    rating_sum += neighbor_rating
                    neighbor_count += 1
                
            inferred_rating = rating_sum / neighbor_count if neighbor_count != 0 else 0
            
            if inferred_rating != 0:
                inferred_ratings.append((item, inferred_rating))

    return inferred_ratings

In [54]:
def get_mapped_ratings_from_context(inferred_ratings, user_item_context_reverse_encodings, context_T):
    mapped_ratings = []
    for i in inferred_ratings:
        
        # Get actual item encoding (from the cartesian product result)
        item_context = context_T[i[0]]
        
        item = user_item_context_reverse_encodings[1][item_context[0]]
        
        new_entry = [item]
        
        for j in range(1, len(item_context)):
            new_entry.append(item_context[j])
        
        new_entry.append(i[1])
        
        mapped_ratings.append(new_entry)
    
    return mapped_ratings

In [55]:

def get_predictions(mapped_ratings, encoded_df):
    predicted_df = pd.DataFrame(
        mapped_ratings, 
        columns=[
            'Item', 
            *encoded_df.columns[3:], 
            'predicted_rating']
    ).sort_values(by='predicted_rating', ascending=False)
    return predicted_df

## Recommendation Generation

In [56]:

def recommendation_generation(N, predicted_df):
    # Select N items to be recommended
    chosen = []

    for data in predicted_df.iterrows():
        data_tup = tuple(data[1])
        
        if len(chosen) <= N:
            chosen.append(data_tup)

    return chosen

## Rating Prediction Translated Result

In [57]:
def get_mapped_ratings_translated(inferred_ratings, context_T, user_item_context_reverse_encodings):
    mapped_ratings = []
    for i in inferred_ratings:
        
        # Get actual item encoding (from the cartesian product result)
        item_context = context_T[i[0]]
        
        item = user_item_context_reverse_encodings[1][item_context[0]]
        
        new_entry = [item]
        
        # Translate remaining context
        for j in range(1, len(item_context)):
            # 1 --> item
            # >= 2 --> context
            translator = user_item_context_reverse_encodings[j+1]
            new_entry.append(translator[item_context[j]])
        
        # Append the rating
        new_entry.append(i[1]*maximum_rating)
        
        mapped_ratings.append(new_entry)
    
    return mapped_ratings

In [58]:
def get_result(mapped_ratings, encoded_df):   
    res = pd.DataFrame(
        mapped_ratings, 
        columns=[
            'Item', 
            *encoded_df.columns[3:], 
            'predicted_rating']
    ).sort_values(by='predicted_rating', ascending=False)
    return res

## Dataset Merge

In [59]:
witheld = pd.read_csv('witheld_ratings.csv')

In [60]:
display(witheld)

Unnamed: 0,userid,itemid,rating,Time,Location,Companion
0,1066,tt4411490,1,Weekend,Cinema,Family
1,1066,tt1707386,2,Weekday,Home,Partner
2,1066,tt4411490,1,Weekday,Cinema,Family
3,1066,tt0232500,1,Weekday,Home,Partner
4,1066,tt1707386,1,Weekend,Cinema,Family
...,...,...,...,...,...,...
147,1098,tt0405422,3,Weekend,Cinema,Alone
148,1098,tt0211915,4,Weekend,Home,Family
149,1098,tt0169547,4,Weekend,Cinema,Alone
150,1098,tt0289879,1,Weekday,Home,Alone


In [61]:
def merge_df(res, witheld, USER):
    actual_ratings = []
    for row in res.iterrows():
        row_data = row[1]
        actual = witheld[
            (witheld['userid'] == USER) &\
            (witheld['itemid'] == row_data['Item']) &\
            (witheld['Time'] == row_data['Time']) &\
            (witheld['Location'] == row_data['Location']) &
            (witheld['Companion'] == row_data['Companion'])
        ]['rating']
        
        if(not actual.empty):
            actual_ratings.append(tuple(actual)[0])
            
        else:
            actual_ratings.append(np.nan)   
    

    return actual_ratings

In [62]:
print(user_item_context_encodings[0])

{1001: 0, 1002: 1, 1003: 2, 1004: 3, 1005: 4, 1006: 5, 1007: 6, 1008: 7, 1009: 8, 1011: 9, 1014: 10, 1015: 11, 1016: 12, 1018: 13, 1026: 14, 1027: 15, 1028: 16, 1029: 17, 1030: 18, 1031: 19, 1032: 20, 1033: 21, 1034: 22, 1035: 23, 1037: 24, 1038: 25, 1039: 26, 1040: 27, 1041: 28, 1042: 29, 1043: 30, 1044: 31, 1045: 32, 1046: 33, 1047: 34, 1048: 35, 1049: 36, 1050: 37, 1051: 38, 1052: 39, 1053: 40, 1054: 41, 1055: 42, 1056: 43, 1057: 44, 1058: 45, 1059: 46, 1060: 47, 1061: 48, 1062: 49, 1063: 50, 1064: 51, 1065: 52, 1066: 53, 1067: 54, 1068: 55, 1069: 56, 1070: 57, 1071: 58, 1074: 59, 1075: 60, 1076: 61, 1077: 62, 1078: 63, 1079: 64, 1082: 65, 1084: 66, 1087: 67, 1097: 68, 1098: 69, 1105: 70, 1107: 71, 1109: 72, 1112: 73, 1113: 74, 1114: 75, 1115: 76, 1116: 77, 1119: 78, 1120: 79, 1122: 80}


# Automated User (Sum of RMSE)

In [63]:

sum_rms = 0
actual_users = 0

for USER in user_item_context_encodings[0]:
    N = 10
    K_similar_users, wanted_user = rating_prediction(USER)
    user_rated_items = get_rated_items(wanted_user)

    inferred_ratings = KNN(user_rated_items, K_similar_users)
    mapped_ratings = get_mapped_ratings_from_context(inferred_ratings, user_item_context_reverse_encodings, context_T)
    predicted_df = get_predictions(mapped_ratings, encoded_df)

    chosen = recommendation_generation(N, predicted_df)
    
    mapped_ratings = get_mapped_ratings_translated(inferred_ratings, context_T, user_item_context_reverse_encodings)
    
    res = get_result(mapped_ratings, encoded_df)
    
    actual_ratings = merge_df(res, witheld, USER)
    
    merged_result = res.copy()
    merged_result['actual_rating'] = actual_ratings
    merged_result[(merged_result['actual_rating'] != np.nan)]  
    merged_result.dropna(inplace=True)

    if len(merged_result) != 0:
        actual_users += 1
        rms = mean_squared_error(merged_result['actual_rating'], merged_result['predicted_rating'], squared=False)
        sum_rms += rms


print("actual users: " + str(actual_users))
print(sum_rms / actual_users)
    


NameError: name 'K' is not defined

# Automated User (Concatenate User DF)

In [None]:

sum_rms = 0
actual_users = 0

combined_users = []

for USER in user_item_context_encodings[0]:
    N = 10
    K_similar_users, wanted_user = rating_prediction(USER)
    user_rated_items = get_rated_items(wanted_user)

    inferred_ratings = KNN(user_rated_items, K_similar_users)
    mapped_ratings = get_mapped_ratings_from_context(inferred_ratings, user_item_context_reverse_encodings, context_T)
    predicted_df = get_predictions(mapped_ratings, encoded_df)

    chosen = recommendation_generation(N, predicted_df)
    
    mapped_ratings = get_mapped_ratings_translated(inferred_ratings, context_T, user_item_context_reverse_encodings)
    
    res = get_result(mapped_ratings, encoded_df)
    
    actual_ratings = merge_df(res, witheld, USER)
    
    merged_result = res.copy()
    merged_result['actual_rating'] = actual_ratings
    merged_result[(merged_result['actual_rating'] != np.nan)]  
    merged_result.dropna(inplace=True)

    if len(merged_result) != 0:
        combined_users.append(merged_result)


df_final = pd.concat(combined_users)
print(mean_squared_error(df_final['actual_rating'], df_final['predicted_rating'], squared=False))



1.5776319110780084
