# Missing data prediction for collaborative filtering

In [None]:
import pandas as pd
import numpy as np

# Load MovieLens 100K dataset into a dataframe of pandas.
# The file 'u.data' is parsed as tab-separated text and its four columns are
# labelled explicitly through `names`.
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep='\t', names=names)
# Preview the first rows (shown as the cell output in a notebook).
df.head()

In [None]:
# Restrict the dataset to the 500 users and the 500 items that have the most ratings
n_most_active_users = 500
n_most_active_items = 500


def _most_rated_ids(ratings, key, limit):
    """Return the `limit` values of column `key` that own the most rows in `ratings`.

    Rows are counted per group and ranked by the non-null count of the
    'rating' column, largest first.
    """
    per_group_counts = ratings.groupby(key).count()
    ranked = per_group_counts.sort_values(by='rating', ascending=False)
    return ranked.head(limit).index


user_ids = _most_rated_ids(df, 'user_id', n_most_active_users)
item_ids = _most_rated_ids(df, 'item_id', n_most_active_items)
# Keep a rating only when both its user and its item were selected above
df = df[df['user_id'].isin(user_ids) & df['item_id'].isin(item_ids)]

In [None]:
# Map new internal ID for items: raw item ids become contiguous integers
# 0..n_items-1, numbered in order of first appearance in `df`.
i_ids = df['item_id'].unique().tolist()
item_dict = {raw_id: new_id for new_id, raw_id in enumerate(i_ids)}
# `df` is the result of a boolean filter at this point, so writing a column
# into it with `df[...] = ...` triggers pandas' SettingWithCopyWarning.
# `assign` builds a new frame with the remapped column instead.
df = df.assign(item_id=df['item_id'].map(item_dict))

# Split Dataset

In [None]:
# The number of training users and active users
n_training_users = 300
n_active_users = n_most_active_users - n_training_users

# The number of GIVEN ratings for active users
GIVEN = 20

# Randomly select users from the most active users as training set.
# NOTE(review): this draw is not seeded, so the train/active split changes
# between runs even though the GIVEN sampling below uses a fixed random_state.
random_uids = np.random.choice(df.user_id.unique(), n_training_users, replace=False)
# .copy() makes the selection an independent frame, so the column assignment
# below does not trigger SettingWithCopyWarning.
train_df = df[df['user_id'].isin(random_uids)].copy()
# Map new internal ID (0..n_training_users-1) for all users in the training set
u_ids = train_df['user_id'].unique().tolist()
user_dict = {raw_id: new_id for new_id, raw_id in enumerate(u_ids)}
train_df['user_id'] = train_df['user_id'].map(user_dict)

# The rest of users are active users for testing
remain_df = df[~df['user_id'].isin(random_uids)].copy()
# Map new internal ID for all active users.
# `u_ids` and `user_dict` are rebound here and describe the active users from now on.
u_ids = remain_df['user_id'].unique().tolist()
user_dict = {raw_id: new_id for new_id, raw_id in enumerate(u_ids)}
remain_df['user_id'] = remain_df['user_id'].map(user_dict)

# Randomly select GIVEN ratings for active users
active_df = remain_df.groupby('user_id').sample(n=GIVEN, random_state=1024)

# Every remaining rating of an active user (row index not sampled above) is held out for testing
test_df = remain_df[~remain_df.index.isin(active_df.index)]

In [None]:
# NOTE(review): `train_ds` is first assigned in the following cell; evaluated in
# file order this expression raises NameError. Run the matrix-conversion cell first.
train_ds

In [None]:
# Convert the format of datasets to matrices (rows: users, columns: items, 0 = no rating)


def _zero_grid(n_users, n_items):
    """Build a long-format frame holding one zero rating for every (user, item) pair."""
    return pd.DataFrame({
        'user_id': np.tile(np.arange(0, n_users), n_items),
        'item_id': np.repeat(np.arange(0, n_items), n_users),
        'rating': 0,
    })


def _to_matrix(grid, ratings):
    """Left-join `ratings` onto the full `grid` and pivot to a user x item table.

    Pairs absent from `ratings` receive 0 through fillna; the pivot reads the
    'rating_y' column, i.e. the rating coming from `ratings`.
    """
    merged = grid.merge(ratings, how='left', on=['user_id', 'item_id'])
    filled = merged.fillna(0.)
    return filled.pivot_table(values='rating_y', index='user_id', columns='item_id')


# Training users
df_zeros = _zero_grid(n_training_users, n_most_active_items)
train_ds = _to_matrix(df_zeros, train_df)

# Active users: the GIVEN ratings and the held-out test ratings share one grid
df_zeros = _zero_grid(n_active_users, n_most_active_items)
active_ds = _to_matrix(df_zeros, active_df)
test_ds = _to_matrix(df_zeros, test_df)

train_ds, active_ds, test_ds

In [None]:
# Predicting All Missing Data in training set
# Start from a copy of the training matrix's underlying array, so writes to
# imputed_train_ds do not modify train_ds.
imputed_train_ds = train_ds.values.copy()

# Predicting the missing values

In [None]:
## Put all your implementation for your solution in this cell only to predict the missing values;
## NOTE 1: DO NOT change anything in the rest of the cells in this framework,
## otherwise the changes might cause errors and make your implementation invalid.

## Note 2:
## The user-item rating matrix is imputed_train_ds,
## and the missing values are those 0s in imputed_train_ds.
## You are required to predict them by using the solution in the given report.

## The following parameters are required in the given report,
## which is named "Effective Missing Data Prediction for Collaborative Filtering",
## and you will need to use them. But, please do not change their values.
LAMBDA = 0.7    # λ
GAMMA = 10      # γ
DELTA = 10      # δ
ITA = 0.7       # η
THETA = 0.7     # θ
EPSILON = 1e-9

# Bare expression: displays the matrix when it is the last statement of a notebook cell.
imputed_train_ds
# `mat` is a second name for the same object (no copy is made); the helper
# functions below read the rating matrix through it.
mat = imputed_train_ds

def usermean(u):
    """Return the mean of the non-zero entries in row `u` of `mat`.

    Zeros mark missing ratings and are left out of the average.
    """
    row = mat[u]
    rated = row[row != 0]
    return rated.mean()
    
def itemmean(i):
    """Return the mean of the non-zero entries in column `i` of `mat`.

    Zeros mark missing ratings and are left out of the average.
    """
    column = mat[:, i]
    rated = column[column != 0]
    return rated.mean()

def sim_user(a, u):
    """Pearson correlation between users `a` and `u` over their co-rated items.

    Deviations are taken from each user's mean over all of their rated items
    (`usermean`), and the sums run only over items both users rated (non-zero
    in both rows of `mat`).

    Args:
        a: Row index of the first user in `mat`.
        u: Row index of the second user in `mat`.

    Returns:
        A tuple `(similarity, n_corated)`. `similarity` is 0.0 when the users
        share no rated item or when either user's deviations are all zero,
        so the correlation is undefined. Previously these cases raised
        ZeroDivisionError or produced NaN.
    """
    ratings_a = mat[a]
    ratings_u = mat[u]
    corated = (ratings_a != 0) & (ratings_u != 0)
    n_corated = int(np.count_nonzero(corated))
    if n_corated == 0:
        return (0.0, 0)

    dev_a = ratings_a[corated] - usermean(a)
    dev_u = ratings_u[corated] - usermean(u)
    denom = np.sqrt(np.sum(dev_a ** 2)) * np.sqrt(np.sum(dev_u ** 2))
    if denom == 0:
        # At least one user has zero deviation on every co-rated item.
        return (0.0, n_corated)

    return (float(np.sum(dev_a * dev_u) / denom), n_corated)

def sim_item(i, j):
    """Pearson correlation between items `i` and `j` over users who rated both.

    Deviations are taken from each item's mean over all of its ratings
    (`itemmean`), and the sums run only over users that rated both items
    (non-zero in both columns of `mat`).

    Args:
        i: Column index of the first item in `mat`.
        j: Column index of the second item in `mat`.

    Returns:
        A tuple `(similarity, n_corating_users)`. `similarity` is 0.0 when no
        user rated both items or when either item's deviations are all zero,
        so the correlation is undefined.
    """
    ratings_i = mat[:, i]
    ratings_j = mat[:, j]
    rated_both = (ratings_i != 0) & (ratings_j != 0)
    n_rated_both = int(np.count_nonzero(rated_both))
    if n_rated_both == 0:
        return (0.0, 0)

    # The original called the misspelled `iteammean` here, which raised NameError.
    dev_i = ratings_i[rated_both] - itemmean(i)
    dev_j = ratings_j[rated_both] - itemmean(j)
    denom = np.sqrt(np.sum(dev_i ** 2)) * np.sqrt(np.sum(dev_j ** 2))
    if denom == 0:
        # At least one item has zero deviation for every co-rating user.
        return (0.0, n_rated_both)

    return (float(np.sum(dev_i * dev_j) / denom), n_rated_both)

def sim_user_m(a, u):
    """Significance-weighted user similarity.

    Scales the value returned by `sim_user` by min(n, GAMMA) / GAMMA, where n
    is the number of co-rated items, so pairs with fewer than GAMMA co-rated
    items are devalued.
    """
    correlation, n_corated = sim_user(a, u)
    weight = min(n_corated, GAMMA) / GAMMA
    return weight * correlation

def sim_item_m(i, j):
    """Significance-weighted item similarity.

    Scales the value returned by `sim_item` by min(n, DELTA) / DELTA, where n
    is the number of users who rated both items, so pairs with fewer than
    DELTA common raters are devalued.
    """
    correlation, n_common = sim_item(i, j)
    weight = min(n_common, DELTA) / DELTA
    return weight * correlation


def get_similar_users(u):
    """Return the row indices of the users similar to user `u`.

    A user `x` is similar when its significance-weighted similarity to `u`
    (`sim_user_m`) exceeds the user threshold ITA (η). `u` itself is never
    included: it is not its own neighbour.

    The original compared against LAMBDA, which is the weight that blends the
    user-based and item-based predictions, and left ITA unused. Both are 0.7
    here, so only the intent changes, not the numbers.

    Args:
        u: Row index of the user in `mat`.

    Returns:
        List of row indices, in ascending order.
    """
    return [
        x for x in range(len(mat))
        if x != u and sim_user_m(u, x) > ITA
    ]

def get_similar_items(i):
    """Return the column indices of the items similar to item `i`.

    An item `x` is similar when its significance-weighted similarity to `i`
    (`sim_item_m`) exceeds the item threshold THETA (θ). `i` itself is never
    included: it is not its own neighbour.

    Args:
        i: Column index of the item in `mat`.

    Returns:
        List of column indices, in ascending order.
    """
    return [
        x for x in range(len(mat[0]))
        if x != i and sim_item_m(i, x) > THETA
    ]

def prediction(u, i):
    """Predict the rating of user `u` for item `i` from similar users and items.

    The user-based estimate is the mean of `u` plus the similarity-weighted
    average deviation of similar users on item `i`. The item-based estimate
    is the mean of `i` plus the similarity-weighted average deviation of `u`
    on similar items. Only neighbours that actually hold a rating (non-zero
    in `mat`) contribute, because 0 marks a missing value.

    Fixes relative to the previous version:
      * the item-based loop iterated over `similar_users` instead of
        `similar_items`;
      * unrated cells were treated as ratings of 0;
      * an empty neighbourhood caused a division by zero.

    NOTE(review): no call site for this function is visible in this part of
    the file, so `imputed_train_ds` appears to be left un-imputed. Confirm
    whether a fill loop exists elsewhere.

    Args:
        u: Row index of the user in `mat`.
        i: Column index of the item in `mat`.

    Returns:
        LAMBDA * user_estimate + (1 - LAMBDA) * item_estimate when both
        estimates exist; the single available estimate when only one
        neighbourhood is usable; 0.0 (still missing) when neither is.
    """
    # User-based part: similar users that rated item i.
    user_numer = 0.0
    user_denom = 0.0
    for ua in get_similar_users(u):
        if ua == u or mat[ua, i] == 0:
            continue
        weight = sim_user_m(ua, u)
        user_numer += weight * (mat[ua, i] - usermean(ua))
        user_denom += weight

    # Item-based part: similar items that user u rated.
    item_numer = 0.0
    item_denom = 0.0
    for ik in get_similar_items(i):
        if ik == i or mat[u, ik] == 0:
            continue
        weight = sim_item_m(ik, i)
        item_numer += weight * (mat[u, ik] - itemmean(ik))
        item_denom += weight

    has_user_part = user_denom != 0
    has_item_part = item_denom != 0
    if not has_user_part and not has_item_part:
        # Nothing to base a prediction on: keep the cell marked as missing.
        return 0.0

    if has_user_part:
        user_estimate = usermean(u) + user_numer / user_denom
        if not has_item_part:
            return user_estimate
    if has_item_part:
        item_estimate = itemmean(i) + item_numer / item_denom
        if not has_user_part:
            return item_estimate

    return LAMBDA * user_estimate + (1 - LAMBDA) * item_estimate

# Evaluation

### Compute Pearson Correlation Coefficient of All Pairs of Users between active set and imputed training set

In [None]:
# Wrap the imputed array in a DataFrame; the evaluation cells below read it
# back through `.values`.
imputed_train_ds = pd.DataFrame(imputed_train_ds)
imputed_train_ds

In [None]:
# Similarity matrix: one row per active user, one column per training user.
# Entries stay 0 for pairs without any co-rated item.
active_user_pearson_corr = np.zeros((active_ds.shape[0], train_ds.shape[0]))

# Compute Pearson Correlation Coefficient of All Pairs of Users between active set and imputed training set
for i, user_i_vec in enumerate(active_ds.values):
    for j, user_j_vec in enumerate(imputed_train_ds.values):
        
        # ratings co-rated by the current pair of users (a value > 0 counts as rated)
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # co-rated item index, skip if there are no co-rated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec:
        # sum of the ratings divided by the sum of the values clipped to [0, 1];
        # EPSILON keeps the division defined when that sum is 0
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr over the co-rated items only
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        # EPSILON guards against a zero denominator
        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting: scale down pairs with fewer than GAMMA co-rated items
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim

        active_user_pearson_corr[i][j] = weighted_sim

active_user_pearson_corr

## Predict Ratings of Testing Set

In [None]:
# Number of most similar training users used for each prediction
K = 10

# Predictions are written only at positions that hold a test rating; all other cells stay 0
test_ds_pred = np.zeros_like(test_ds.values)

for (i, j), rating in np.ndenumerate(test_ds.values):

    if rating > 0:

        # indices of the K training users with the largest similarity to active user i,
        # in descending order of similarity
        sim_user_ids = np.argsort(active_user_pearson_corr[i])[-1:-(K + 1):-1]

        #==================user-based==================#
        # the coefficient values of similar users
        sim_val = active_user_pearson_corr[i][sim_user_ids]

        # the average value of the current user's ratings
        # (sum of ratings over the sum of values clipped to [0, 1]; EPSILON avoids division by zero)
        sim_users = imputed_train_ds.values[sim_user_ids]
        user_mean = np.sum(active_ds.values[i]) / (np.sum(np.clip(active_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # select the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        # sim(u, v) * (r_vj - mean_v)
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        # mean-centred weighted average, then limited to the range [0, 5]
        user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        user_based_pred = np.clip(user_based_pred, 0, 5)

        test_ds_pred[i][j] = user_based_pred
        
test_ds_pred


## Compute MAE and RMSE

In [None]:
# Shared pieces of both metrics: the per-cell error and the normalising count
# (test values clipped into [0, 1] and summed).
_residual = test_ds_pred - test_ds.values
_n_test_ratings = np.sum(np.clip(test_ds.values, 0, 1))

# MAE: summed absolute error divided by the normalising count
MAE = np.sum(np.abs(_residual)) / _n_test_ratings

# RMSE: square root of the summed squared error divided by the normalising count
RMSE = np.sqrt(np.sum(np.square(_residual)) / _n_test_ratings)

print("MAE: {}, RMSE: {}".format(MAE, RMSE))