In [1]:
### Imports
import gzip
from collections import defaultdict
import math
import numpy as np
import string
import random
import string
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from matplotlib import cm, colors
import seaborn as sns
import pandas as pd
import os
import time
import itertools
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

In [2]:
### EVALUATION / METRICS
######################################
def convert_to_np_array(A):
    """
    If A is not already an array, convert it to an array
    """
    if not isinstance(A, np.ndarray): return np.array(A)
    else: return A

def get_MSE(A, B):
    """
    Given list A and list B:
    Return the mean squared error between A and B
    """
    return np.mean((convert_to_np_array(A) - convert_to_np_array(B))**2)

def inner(A, B):
    """
    Return the dot product between list A and list B
    """
    return np.dot(convert_to_np_array(A), convert_to_np_array(B))

def get_SSE(A, B):
    """
    Given list A and list B:
    Return the sum of squared errors between A and B
    """
    return np.sum((convert_to_np_array(A) - convert_to_np_array(B))**2)

def get_SE(A,B):
    """
    Given list A and list B:
    Return the squared error between each element
    """
    return (convert_to_np_array(A) - convert_to_np_array(B))**2

def get_accuracy(A,B):
    """
    Given list A and list B:
    Return the accuracy
    """
    return np.sum(convert_to_np_array(A) == convert_to_np_array(B)) / len(A)

def get_BER(y_actual, y_predicted):
    """
    "Return the balanced error rate between positive (1) and negative(0) instances
    """

    TP, FP, TN, FN = 0, 0, 0, 0
    n_pos, n_neg = 0, 0
    for actual, pred in zip(y_actual, y_predicted):
        if actual==1:
            n_pos += 1
            if actual==pred:
                TP += 1
            else:
                FN += 1
        else:
            n_neg += 1
            if actual==pred:
                TN += 1
            else:
                FP += 1
    FPR = FP / (FP + TN)
    FNR = FN / (FN + TP)
        
    return (1/2) * (FPR + FNR)

def get_errorMetrics_binary(y_actual, y_predicted, beta=1):
    """
    Return a set of error metrics between positive (1) and negative (0) instances
    This is valid for a binary class case
    Return a dictionary containing all calculated values
    """

    output = {}
    TP, FP, TN, FN = 0, 0, 0, 0
    n_pos, n_neg = 0, 0
    for actual, pred in zip(y_actual, y_predicted):
        if actual==1:
            n_pos += 1
            if actual==pred:
                TP += 1
            else:
                FN += 1
        else:
            n_neg += 1
            if actual==pred:
                TN += 1
            else:
                FP += 1
    ###
    TPR, FNR = TP / n_pos, FN / n_pos
    FPR, TNR = FP / n_neg, TN / n_neg
    prec = TP / (TP + FP)
    recall = TP / (TP + FN)
    output["TP"], output["FP"], output["TN"], output["FN"] = TP, FP, TN, FN
    output["TPR"], output["FPR"], output["FNR"], output["TNR"] = TPR, FPR, FNR, TNR
    output["precision"], output["recall"] = prec, recall
    output["BER"] = (1/2) * (FPR + FNR)
    output[f"F{beta}_Score"] = (1 + beta**2) * (prec * recall) / ((beta**2)*prec + recall)
    output["F_Score"] = 2 * (prec * recall) / (prec + recall)

    return output

In [3]:
### SIMILARITY FUNCTIONS
######################################
def jaccard_sim(A,B):
    """
    Return the Jaccard similarity between list A and list B
    """
    if not isinstance(A, set): A = set(A)
    if not isinstance(B, set): B = set(B)
    n_intersect = len(A.intersection(B))
    if n_intersect == 0: return 0
    n_union = len(A.union(B))
    if n_union == 0: return 0

    return n_intersect / n_union

def cosine_sim_binary(A,B, denom_over_all=True):
    """
    Return the cosine similarity between set A and set B (Binary interactions)
    """
    if not isinstance(A, set): A = set(A)
    if not isinstance(B, set): B = set(B)
    n_intersect = len(A.intersection(B))
    # if n_intersect == 0: return 0

    if denom_over_all:
        total_interactions = np.sqrt(len(A) * len(B))
    else:
        total_interactions = n_intersect
    if total_interactions == 0:
        return 0
    return n_intersect / total_interactions

############# Design structures to record shared items
def cosine_sim(x_tuple, y_tuple, denom_over_all):
    """
    Calculate the cosine similarity between lists x and y
    Input are lists of tuples: [(id1, rating), (id2, rating), ...]
    """
    # Get shared items
    x_ids, y_ids = set(), set()
    x_ratings, y_ratings = [], []
    shared_ratings_x, shared_ratings_y = [], []
    shared_tuples_x, shared_tuples_y = [], []
    for tuple in x_tuple:
        x_ids.add(tuple[0])
        x_ratings.append(tuple[1])
    for tuple in y_tuple:
        y_ids.add(tuple[0])
        y_ratings.append(tuple[1])
    shared_ids = x_ids.intersection(y_ids)
    # if len(shared_ids) == 0: return 0

    shared_tuples_x = [tuple for tuple in x_tuple if tuple[0] in shared_ids]
    shared_tuples_x.sort()
    shared_tuples_y = [tuple for tuple in y_tuple if tuple[0] in shared_ids]
    shared_tuples_y.sort()
    shared_ratings_x = [tuple[1] for tuple in shared_tuples_x]
    shared_ratings_y = [tuple[1] for tuple in shared_tuples_y]

    if denom_over_all:
        # Use all items in the denominator
        x_norm = np.sum([xi**2 for xi in x_ratings])
        y_norm = np.sum([yi**2 for yi in y_ratings])
    else:
        # Only use shared items in the denominator
        x_norm = np.sum([xi**2 for xi in shared_ratings_x])
        y_norm = np.sum([yi**2 for yi in shared_ratings_y])
    denom = np.sqrt(x_norm) * np.sqrt(y_norm)

    if denom == 0: return 0
    numer = sum([xi*yi for xi,yi in zip(shared_ratings_x, shared_ratings_y)])

    return numer / denom

def pearson_sim(x_tuple, y_tuple):
    """
    Calculate the pearson similarity between lists x and y
    Input are lists of tuples: [(id1, rating), (id2, rating), ...]
    Unlike Cosine sim, ONLY shared items can be considered
    If id1 or id2 is not in the relevant training data structure, use meanValue as its respective mean
    """
    # Unpack averages
    x_avgs = {tuple[0][0]:tuple[1] for tuple in x_tuple}
    y_avgs = {tuple[0][0]:tuple[1] for tuple in y_tuple}
    # Get shared items
    shared_ratings_x, shared_ratings_y = [], []
    shared_tuples_x, shared_tuples_y = [], []
    x_ids = {tuple[0][0] for tuple in x_tuple}
    y_ids = {tuple[0][0] for tuple in y_tuple}
    shared_ids = x_ids.intersection(y_ids)
    # if len(shared_ids) == 0: return 0

    shared_tuples_x = [tuple[0] for tuple in x_tuple if tuple[0][0] in shared_ids]
    shared_tuples_x.sort()
    shared_tuples_y = [tuple[0] for tuple in y_tuple if tuple[0][0] in shared_ids]
    shared_tuples_y.sort()
    shared_ratings_x = [tuple[1] - x_avgs[tuple[0]] for tuple in shared_tuples_x] ### Pearson --> Subtract the mean from each value
    shared_ratings_y = [tuple[1] - y_avgs[tuple[0]] for tuple in shared_tuples_y]

    # Only use shared items in the denominator
    x_norm = np.sum([xi**2 for xi in shared_ratings_x])
    y_norm = np.sum([yi**2 for yi in shared_ratings_y])
    denom = np.sqrt(x_norm * y_norm)

    if denom == 0: return 0
    numer = sum([xi*yi for xi,yi in zip(shared_ratings_x, shared_ratings_y)])

    return numer / denom


In [4]:
### COLLABORATIVE FILTERING
######################################


def predictValue_bySim_devFromMean(user_id, item_id, sim_func, type, meanValue, value_bounds=None, denom_over_all=None, **kwargs):
    """
    Predict some value (e.g., rating) that the user (user_id) will give an item (item_id) based on
    the input simularity function (sim_func). However, instead of predicting the rating directly,
    predict the deviation from the global mean rating

    # Necessary Data Structures --> These are built using training data ONLY
    itemsPerUser: A dictionary containing the list of items each user interacted with and corresponding values
       ex: itemsPerUser[user1] = [(item1, 2), (item3, 1), (item5, 5), ...]
    usersPerItem: A dictionary containing the list of users that interacted with each item and corresponding values
       ex: usersPerItem[item1] = [(user1, 2), (user4, 2), (user27, 2), ...]
    itemAverages: A dictionary containing the mean value for each item
    userAverages: A dictionary containing the mean value for each user
    #--- value_bounds: The min/max values that can be outputted as typically there's a scale (e.g., 1-5 stars)
    #--- meanValue: The mean of all values in valueDict.values()

    # Gathering similarity weights:
    If type==0:
       # Predict the rating as a weighted sum of ratings that user_id has given to other items #
       For each item (item_id2) that user_id has interacted with (except for item_id):
          Calculate item_id2's similarity to item_id based on shared user interactions
          Track these values as similarity weights
    else:
       # Predict the rating as a weighted sum of ratings that other users have given to item_id #
       For each user (user_id2) that have interacted with item_id (except for user_id):
          Calculate user_id2's similarity to user_id based on shared item interactions
          Track these values as similarity weights
    """
    # Edge case 1: Return global mean value if user_id or item_id are unseen
    if (user_id not in itemsPerUser) or (item_id not in usersPerItem): return meanValue
    # Initialize variables
    if denom_over_all is None: denom_over_all = True
    if value_bounds is None: value_bounds = (-np.inf, np.inf)
    values, similarities = [], []

    if type == "item":
        # Predict the rating as a weighted combination of how other items rated by user_id were
        # rated by similar users
        # if user_id not in userAverages: return meanValue # Skip keys that are not in the dict
        avg_value = userAverages[user_id]
        for user_id2,value in usersPerItem[item_id]:
            if user_id2 == user_id: continue
            # if user_id2 not in userAverages: continue  # Skip keys that are not in the dict
            if sim_func == jaccard_sim:
                simset_user_id = {tuple[0] for tuple in itemsPerUser[user_id] if tuple[0] != item_id}
                simset_user_id2 = {tuple[0] for tuple in itemsPerUser[user_id2] if tuple[0] != item_id}
                similarities.append(sim_func(simset_user_id, simset_user_id2))
            elif sim_func == cosine_sim_binary:
                if denom_over_all is None: denom_over_all = True
                simset_user_id = {tuple[0] for tuple in itemsPerUser[user_id] if tuple[0] != item_id}
                simset_user_id2 = {tuple[0] for tuple in itemsPerUser[user_id2] if tuple[0] != item_id}
                similarities.append(sim_func(simset_user_id, simset_user_id2, denom_over_all))
            elif sim_func == cosine_sim:
                if denom_over_all is None: denom_over_all = True
                simset_user_id = {tuple for tuple in itemsPerUser[user_id] if tuple[0] != item_id}
                simset_user_id2 = {tuple for tuple in itemsPerUser[user_id2] if tuple[0] != item_id}
                similarities.append(sim_func(simset_user_id, simset_user_id2, denom_over_all))
            elif sim_func == pearson_sim:
                simset_user_id = {(tuple, itemAverages[tuple[0]]) for tuple in itemsPerUser[user_id] if tuple[0] != item_id}
                simset_user_id2 = {(tuple, itemAverages[tuple[0]]) for tuple in itemsPerUser[user_id2] if tuple[0] != item_id}
                similarities.append(sim_func(simset_user_id, simset_user_id2))
            else:
                # Sim function not programmed
                print("Invalid sim_func")
                return None
            values.append(value - userAverages[user_id2])
    else:
        # Predict user_id's rating of item_id based on a weighted combination of how other users who
        # rated item_id rated other items
        # if item_id not in itemAverages: return meanValue # Skip keys that are not in the dict
        avg_value = itemAverages[item_id]
        for item_id2,value in itemsPerUser[user_id]:
            if item_id2 == item_id: continue
            # if item_id2 not in itemAverages: continue # Skip keys that are not in the dict
            if sim_func == jaccard_sim:
                simset_item_id = {tuple[0] for tuple in usersPerItem[item_id] if tuple[0] != user_id}
                simset_item_id2 = {tuple[0] for tuple in usersPerItem[item_id2] if tuple[0] != user_id}
                similarities.append(sim_func(simset_item_id, simset_item_id2))
            elif sim_func == cosine_sim_binary:
                if denom_over_all is None: denom_over_all = True
                simset_item_id = {tuple[0] for tuple in usersPerItem[item_id] if tuple[0] != user_id}
                simset_item_id2 = {tuple[0] for tuple in usersPerItem[item_id2] if tuple[0] != user_id}
                similarities.append(sim_func(simset_item_id, simset_item_id2, denom_over_all))
            elif sim_func == cosine_sim:
                if denom_over_all is None: denom_over_all = True
                simset_item_id = {tuple for tuple in usersPerItem[item_id] if tuple[0] != user_id}
                simset_item_id2 = {tuple for tuple in usersPerItem[item_id2] if tuple[0] != user_id}
                similarities.append(sim_func(simset_item_id, simset_item_id2, denom_over_all))
            elif sim_func == pearson_sim:
                simset_item_id = {(tuple, userAverages[tuple[0]]) for tuple in usersPerItem[item_id] if tuple[0] != user_id}
                simset_item_id2 = {(tuple, userAverages[tuple[0]]) for tuple in usersPerItem[item_id2] if tuple[0] != user_id}
                similarities.append(sim_func(simset_item_id, simset_item_id2))
            else:
                # Sim function not programmed
                print("Invalid sim_func")
                return None
            values.append(value - itemAverages[item_id2])
    # Edge case 2: Return global mean value if there are no similar items
    if np.sum(similarities) == 0: return meanValue

    numerator = np.sum([value*sim for value,sim in zip(values, similarities)])
    denominator = np.sum(similarities)
    output = avg_value + (numerator / denominator)
    if output < value_bounds[0]: return value_bounds[0]
    if output > value_bounds[1]: return value_bounds[1]

    return output

In [5]:
### OTHER HELPFUL FUNCTIONS
######################################
def get_rec_structs(train_data):
    """
    Extract stats used for creating the classifier features
    Input is (user_id, item_id, value), ...
    Typically value is rating, but can be other things (e.g., hours played)

    itemsPerUser: Records each item in the training set that each user interacted with (along with the corresponding value)
    usersPerItem: Records each user in the training set that each item interacted with (along with the corresponding value)
    valueDict: Records the value for each (user, item) tuple
    userAverages: Gives the average value for each user
    itemAverages: Gives the average value for each item

    """
    ### Record which items each user interacted with and which users interacted with which item
    itemsPerUser = defaultdict(list)
    usersPerItem = defaultdict(list)
    valueDict = {}
    for u,b,v in train_data:
        itemsPerUser[u].append((b,v))
        usersPerItem[b].append((u,v))
        valueDict[(u,b)] = v

    ### Calculate user and item average ratings
    userAverages = {}
    itemAverages = {}
    for u,tuples in itemsPerUser.items():
        values = [value for item,value in tuples]
        # values = [value for item,value in tuples if value != 0]
        # if len(values) == 0: continue
        userAverages[u] = sum(values) / len(values)
    for i,tuples in usersPerItem.items():
        values = [value for user,value in tuples]
        # values = [value for user,value in tuples if value != 0]
        # if len(values) == 0: continue
        itemAverages[i] = sum(values) / len(values)

    rec_structs = {"itemsPerUser":itemsPerUser,
                  "usersPerItem":usersPerItem,
                  "valueDict":valueDict,
                  "userAverages":userAverages,
                  "itemAverages":itemAverages,}

    return rec_structs


def unpack_rec_structs(rec_structs):
    """
    Take the input recommender_structs and return the itemized contents
    Not used in this assignment
    """
    itemsPerUser = rec_structs["itemsPerUser"]
    usersPerItem = rec_structs["usersPerItem"]
    valueDict = rec_structs["valueDict"]
    userAverages = rec_structs["userAverages"]
    itemAverages = rec_structs["itemAverages"]

    return (itemsPerUser, usersPerItem, valueDict, userAverages, itemAverages)

def pd_get_rec_structs(df, user_col, item_col, val_col, user_limit=None, item_limit=None, seed=None):
    """
    Like get_rec_structs(), but is customized for the pandas library
    Extract stats used for creating the classifier features
    Input is df which has at least a user_col, item_col, val_col, ...
    Typically value is rating, but can be other things (e.g., hours played)

    itemsPerUser: Records each item in the training set that each user interacted with (along with the corresponding value)
    usersPerItem: Records each user in the training set that each item interacted with (along with the corresponding value)
    valueDict: Records the value for each (user, item) tuple
    userAverages: Gives the average value for each user
    itemAverages: Gives the average value for each item

    user/item limit allows for estimating with less data to save time
    """
    def random_shuffle(x, seed):
        x = list(x)
        return random.Random(seed).shuffle(x)
    #####
    df2 = df.copy()
    df2["itemsPerUser"] = list(zip(df2[item_col], df2[val_col]))
    df2["usersPerItem"] = list(zip(df2[user_col], df2[val_col]))
    df2["user_item"] = list(zip(df2[user_col], df2[item_col]))
    #
    if user_limit is None: user_limit = len(df)
    if item_limit is None: item_limit = len(df)
    if seed is None: seed = 100
    # print(df2)
    itemsPerUser = df2.groupby(user_col)["itemsPerUser"].apply(lambda x: list(x)[:item_limit]).to_dict()
    usersPerItem = df2.groupby(item_col)["usersPerItem"].apply(lambda x: list(x)[:user_limit]).to_dict()

    valueDict = df2[["user_item", val_col]].set_index("user_item", drop=True).to_dict()[val_col]
    userAverages = df2.groupby(user_col)[val_col].mean().to_dict()
    itemAverages = df2.groupby(item_col)[val_col].mean().to_dict()
    meanValue = np.mean([val for val in valueDict.values()])

    return itemsPerUser, usersPerItem, valueDict, userAverages, itemAverages, meanValue

### Functions to read files (From homework stubs)
def readGz(path):
    output = []
    for l in gzip.open(path, mode = 'rt', encoding = "utf-8"):
        output.append(eval(l))
    return output

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        u,b,r = l.strip().split(',')
        r = int(r)
        yield u,b,r

In [6]:
%%time
### Load review data
seed = 100

minmax_scalers = pd.read_csv("data/minmax_scalers.csv")
# test_df_results = pd.read_csv("data/test_df_results.csv")

# Train/valid/train2/test
train_filepath = "data/train_df.csv"
valid_filepath = "data/valid_df.csv"
train2_filepath = "data/train_df2.csv"
test_filepath = "data/test_df.csv"
train_df = pd.read_csv(train_filepath)
valid_df = pd.read_csv(valid_filepath)
train_df2 = pd.read_csv(train2_filepath)
test_df = pd.read_csv(test_filepath)

# Further split the data into X/y pairs for convenience
class_col = "playtime_log" # This what we want to predict
X_train, y_train = train_df[["user_id", "item_id"]], train_df[class_col]
X_valid, y_valid = valid_df[["user_id", "item_id"]], valid_df[class_col]
X_test, y_test = test_df[["user_id", "item_id"]], test_df[class_col]
X_train2, y_train2 = train_df2[["user_id", "item_id"]], train_df2[class_col]

X_test

CPU times: total: 453 ms
Wall time: 454 ms


Unnamed: 0,user_id,item_id
0,u5084,31130
1,u11861,233270
2,u4949,220200
3,u1808,4000
4,u5401,1280
...,...,...
169222,u11442,238960
169223,u12710,4000
169224,u10496,8190
169225,u6112,48700


### Collaborative Filtering

In [7]:
%%time
### Get recommendation structures
t0 = time.time()
user_limitPerItem, item_limitPerUser = 115, 151  ### Q3 for # players per game, Q# for # games per player
user_col, item_col, val_col = "user_id", "item_id", "playtime_log"
itemsPerUser, usersPerItem, valueDict, userAverages, itemAverages, meanValue = pd_get_rec_structs(train_df2, user_col, item_col, val_col, user_limit=user_limitPerItem, item_limit=item_limitPerUser, seed=seed)
# itemsPerUser, usersPerItem, valueDict, userAverages, itemAverages, meanValue = pd_get_rec_structs(train_df2, user_col, item_col, val_col)

print(len(itemsPerUser))
print(len(usersPerItem))
print(len(valueDict))
print(len(userAverages))
print(len(itemAverages))
print(meanValue)
print()
t_elapsed = time.time() - t0
print(f"Time elapsed = {t_elapsed // 60} min and {t_elapsed % 60}s ({t_elapsed}s)")

14551
9214
568502
14551
9214
3.309611240257363

Time elapsed = 0.0 min and 1.0676863193511963s (1.0676863193511963s)
CPU times: total: 1.08 s
Wall time: 1.07 s


In [8]:
%%time
### Getting all Collaborative filtering parameters
model_names = ["jaccard_sim", "cosine_sim_binary", "cosine_sim", "pearson_sim"]
similarity_funcs = [jaccard_sim, cosine_sim_binary, cosine_sim, pearson_sim]
types = ["user", "item"]
value_bounds = (0, np.inf)

model_cols = []
model_params = []
for model_name,sim_func in zip(model_names, similarity_funcs):
    for type in types:
        model_cols.append(f"{model_name}_{type[0]}")
        model_params.append({"sim_func":sim_func, "type":type, "value_bounds":value_bounds})

t0 = time.time()
# Get similarity scores for the validation set
for iter,(title,params) in enumerate(zip(model_cols, model_params)):
    t_start = time.time()
    print(f"Iteration {iter}: {params}")
    X_test[title + "_pred"] = [predictValue_bySim_devFromMean(row[1]["user_id"], row[1]["item_id"], meanValue=meanValue, **params) for row in X_test.iterrows()]
    print(f"{time.time() - t_start} seconds elapsed")
print(f"-----\nTotal time: {time.time() - t0}s")
print()

Iteration 0: {'sim_func': <function jaccard_sim at 0x0000022F059E1620>, 'type': 'user', 'value_bounds': (0, inf)}
217.26468467712402 seconds elapsed
Iteration 1: {'sim_func': <function jaccard_sim at 0x0000022F059E1620>, 'type': 'item', 'value_bounds': (0, inf)}
338.10482263565063 seconds elapsed
Iteration 2: {'sim_func': <function cosine_sim_binary at 0x0000022F059E18A0>, 'type': 'user', 'value_bounds': (0, inf)}
192.60677409172058 seconds elapsed
Iteration 3: {'sim_func': <function cosine_sim_binary at 0x0000022F059E18A0>, 'type': 'item', 'value_bounds': (0, inf)}
319.8633773326874 seconds elapsed
Iteration 4: {'sim_func': <function cosine_sim at 0x0000022F059E16C0>, 'type': 'user', 'value_bounds': (0, inf)}
834.2532353401184 seconds elapsed
Iteration 5: {'sim_func': <function cosine_sim at 0x0000022F059E16C0>, 'type': 'item', 'value_bounds': (0, inf)}
962.6482048034668 seconds elapsed
Iteration 6: {'sim_func': <function pearson_sim at 0x0000022F059E19E0>, 'type': 'user', 'value_boun

In [9]:
t_elapsed = time.time() - t0
print(f"Time elapsed to train model = {t_elapsed // 60} min and {t_elapsed % 60}s ({t_elapsed}s)")

Time elapsed to train model = 82.0 min and 5.793335676193237s (4925.793335676193s)


In [10]:
X_test

Unnamed: 0,user_id,item_id,jaccard_sim_u_pred,jaccard_sim_i_pred,cosine_sim_binary_u_pred,cosine_sim_binary_i_pred,cosine_sim_u_pred,cosine_sim_i_pred,pearson_sim_u_pred,pearson_sim_i_pred
0,u5084,31130,0.000000,1.149872,0.000000,1.160911,0.000000,1.139369,0.093175,0.928410
1,u11861,233270,4.002157,4.111752,4.009689,4.036022,4.058618,4.067302,9.543464,3.355340
2,u4949,220200,6.592512,6.130139,6.582512,6.142174,6.660515,6.185463,7.440528,5.237917
3,u1808,4000,7.785576,6.875094,7.784912,6.932624,8.012718,8.474173,0.000000,7.264237
4,u5401,1280,1.225296,1.380342,1.227562,1.390992,1.397568,1.344395,118.158063,0.000000
...,...,...,...,...,...,...,...,...,...,...
169222,u11442,238960,3.678588,4.617372,3.682926,4.553322,3.996505,4.679888,3.169614,5.594044
169223,u12710,4000,7.757600,8.045949,7.756390,8.008331,7.500237,7.860650,8.513014,8.575232
169224,u10496,8190,5.293708,4.876471,5.274781,4.892238,5.683755,4.893628,3.172978,5.643275
169225,u6112,48700,6.885990,6.299791,6.867490,6.305038,6.809989,6.402964,4.794068,6.579474


In [11]:
### Save model
model_cols2 = [title + "_pred" for title in model_cols]
# Save the one with the best results
test_results_CF = X_test[["user_id", "item_id"] + model_cols2].copy()
test_results_CF[test_results_CF[model_cols2] < 0] = 0
test_results_CF["playtime"] = test_df["playtime"]
test_results_CF["playtime_log"] = test_df["playtime_log"]
# test_results_CF["baseline_preds"] = test_df["playtimeLog_pred"]

test_results_CF

Unnamed: 0,user_id,item_id,jaccard_sim_u_pred,jaccard_sim_i_pred,cosine_sim_binary_u_pred,cosine_sim_binary_i_pred,cosine_sim_u_pred,cosine_sim_i_pred,pearson_sim_u_pred,pearson_sim_i_pred,playtime,playtime_log
0,u5084,31130,0.000000,1.149872,0.000000,1.160911,0.000000,1.139369,0.093175,0.928410,0,0.000000
1,u11861,233270,4.002157,4.111752,4.009689,4.036022,4.058618,4.067302,9.543464,3.355340,715,6.573680
2,u4949,220200,6.592512,6.130139,6.582512,6.142174,6.660515,6.185463,7.440528,5.237917,232,5.451038
3,u1808,4000,7.785576,6.875094,7.784912,6.932624,8.012718,8.474173,0.000000,7.264237,231,5.446737
4,u5401,1280,1.225296,1.380342,1.227562,1.390992,1.397568,1.344395,118.158063,0.000000,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
169222,u11442,238960,3.678588,4.617372,3.682926,4.553322,3.996505,4.679888,3.169614,5.594044,20,3.044522
169223,u12710,4000,7.757600,8.045949,7.756390,8.008331,7.500237,7.860650,8.513014,8.575232,12936,9.467847
169224,u10496,8190,5.293708,4.876471,5.274781,4.892238,5.683755,4.893628,3.172978,5.643275,207,5.337538
169225,u6112,48700,6.885990,6.299791,6.867490,6.305038,6.809989,6.402964,4.794068,6.579474,0,0.000000


In [31]:
### Save results
test_results_CF.to_csv("data/test_results_CF.csv", index=False)

In [36]:
### Save an alternative where the predictions are limited to the max of the training playtime_log value
max_playtime_log = y_train2.max()
print(max_playtime_log)
test_results_CF2 = test_results_CF.copy()

# test_results_CF2[test_results_CF2[model_cols2] > max_playtime_log]
for col in model_cols2:
    test_results_CF2[col + "_upperBounded"] = np.where(test_results_CF2[col] > max_playtime_log, max_playtime_log, test_results_CF2[col])
test_results_CF2 = test_results_CF2[["user_id", "item_id", "playtime", "playtime_log", "pearson_sim_u_pred_upperBounded", "pearson_sim_i_pred_upperBounded"]]

test_results_CF2

13.326792093561902


Unnamed: 0,user_id,item_id,playtime,playtime_log,pearson_sim_u_pred_upperBounded,pearson_sim_i_pred_upperBounded
0,u5084,31130,0,0.000000,0.093175,0.928410
1,u11861,233270,715,6.573680,9.543464,3.355340
2,u4949,220200,232,5.451038,7.440528,5.237917
3,u1808,4000,231,5.446737,0.000000,7.264237
4,u5401,1280,0,0.000000,13.326792,0.000000
...,...,...,...,...,...,...
169222,u11442,238960,20,3.044522,3.169614,5.594044
169223,u12710,4000,12936,9.467847,8.513014,8.575232
169224,u10496,8190,207,5.337538,3.172978,5.643275
169225,u6112,48700,0,0.000000,4.794068,6.579474


In [37]:
### Save results
test_results_CF2.to_csv("data/test_results_CF_upperBoundedPearson.csv", index=False)