In [248]:
import numpy as np
import pandas as pd

In [249]:
"""
Loading the data
"""
# training data
train = pd.read_csv('./data/train.txt', sep='\t', header=None, dtype=int)
train.index += 1
train.columns += 1
train = train.replace(0, np.nan) # replace 0's with NaN (missing)
# testing data
def parse_test(path):
    u = [] # list of users to make predictions for
    p = {} # keys are users, values are list of movies to predict for that user
    d = pd.DataFrame([]) # data containing ratings for users
    with open(path) as f:
        for line in f:
            if (len(line) == 0):
                continue
            split = line.split(' ')
            uid = int(split[0])
            mid = int(split[1])
            r = int(split[2])
            indices = d.index.values
            if uid not in indices:
                user = pd.Series(np.zeros(1000))
                user.name = uid
                d = d.append(user)
                p[uid] = []
                u.append(uid)
            if r != 0:
                d.set_value(uid, mid, r) # (r,c,new_value)
            else:
                l = p.get(uid)
                l.append(mid)
                p[uid] = l
    d.columns += 1
    d = d.replace(0, np.nan)
    return u, p , d
u5, p5, d5 = parse_test('./data/test5.txt')
u10, p10, d10 = parse_test('./data/test10.txt')
u20, p20, d20 = parse_test('./data/test20.txt')

In [250]:
"""
Helper fuctions
"""

def vector_length(v):
    """
    Returns the length of the vector v
    """
    return np.sqrt(v.dot(v))

def users_who_rated_movie(df, mid):
    """
    Returns a dataframe containing users who rated a movie with movie id mid
    """
    return df[df.loc[:, mid].notnull()]
    
def similarity(u, v):
    """
    Computes the similarity score between two vectors u and v.
    """
    cp = pd.DataFrame([u, v])
    cp = cp.loc[:, (cp.notnull()).all(axis=0)]
    u = cp.iloc[0]
    v = cp.iloc[1]
    if len(u) == 0:
        return 0
    if len(u) == 1:
        return 1 - 0.25 * abs(u.iloc[0] - v.iloc[0]) # scale of 0 to 1
    num = u.dot(v)
    den = vector_length(u) * vector_length(v)
    if den != 0:
        return num/den
    return 0


In [264]:
"""
User-Based Collaborative Filtering: Cosine Similarity
"""
def user_cosine_similarity_predict(train, test, users, movies, outpath, k=5):
    with open(outpath, 'w') as output:
        for active_uid in users[0:2]:
            active_ratings = test.loc[active_uid]
            active_ratings_mean = active_ratings.dropna().mean()
            for mid in movies[active_uid]:
                subset = users_who_rated_movie(train, mid)
                weights = subset.apply(similarity, args=(active_ratings,), axis=1)
                weights.sort_values(inplace=True, ascending=False)
                ratings = subset.loc[:, mid].reindex(weights.index)
                weights = weights[:k]
                ratings = ratings[:k]
                if sum(weights) == 0:
                    predicted = active_ratings_mean
                else:
                    predicted = weights.dot(ratings) / sum(weights)
                if predicted < 0.5:
                    predicted = 1
                s = str(active_uid) + ' ' + str(mid) + ' ' + str(int(round(predicted))) + '\n'
                print(s)
                output.write(s)
                
user_cosine_similarity_predict(train,  d5, u5, p5, './out.txt')

201 1 4

201 111 3

201 268 4

201 283 4

201 291 4

201 305 4

201 331 3

201 740 2

202 259 3

202 292 4

202 682 3

202 872 2

202 877 2

202 880 4

202 887 4

202 895 2

202 948 2



In [252]:
"""
User-Based Collaborative Filtering: Pearson Correlation
"""
def user_pearson_correlation_predict(train, test, users, movies, outpath, k=5, case_amp=1, iuf=False):
    with open(outpath, 'w') as output:
        train_means = train.mean(axis=1)
        test_means = test.mean(axis=1)
        train = train.sub(train_means, axis=0)
        test = test.sub(test_means, axis=0)
        for active_uid in users:
            active_ratings = test.loc[active_uid]
            for mid in movies[active_uid]:
                subset = users_who_rated_movie(train, mid)
                weights = subset.apply(similarity, axis=1, args=(active_ratings,))
                weights.sort_values(inplace=True, ascending=False)
                ratings = subset.loc[:, mid].reindex(weights.index)
                weights = weights[:k]
                ratings = ratings[:k]
                den = sum(abs(weights))
                if den == 0:
                    predicted = test_means.loc[active_uid]
                else:
                    predicted = test_means.loc[active_uid] + weights.dot(ratings) / den
                if predicted < 0.5:
                    predicted = 1
                if predicted > 5:
                    predicted = 5
                s = str(active_uid) + ' ' + str(mid) + ' ' + str(int(round(predicted))) + '\n'
                print(s)
                output.write(s)

In [253]:
def iuf_transform(u):
    """
    Transforms the movie's ratings vector u by multiplying it by its Inverse User Frequency
    (IUF). IUF = log(m/m_j), where m = total number of users, m_j = number of users that rated
    item j.
    """
    m = len(u)
    m_j = len(u.dropna())
    if m_j == 0:
        return u
    return np.log(m/m_j) * u

def iuf_transform_df(df):
    """
    Applies the IUF transformation to the entire data set.
    """
    iuf = df.apply(iuf_transform, axis=0)
    return iuf
    
def case_amplification(weights, p=2.5):
    """
    Amplifies weights, effectively exaggerating high and low weights.
    """
    return weights * weights.abs().pow(p-1)

In [254]:
"""
Item-Based Collaborative Filtering: Find items that that are similar to each other

Adjusted Cosine Similarity: determining the similarity between items i and j
1. Subtract the user's average rating from each of the user's ratings
2. Compute the cosine similarity between the ratings given to i and j
    Only consider users who have rated both items i and j
"""

def item_adjusted_cosine_predict(train, test, users, movies, outpath, k=5):
    with open (outpath, 'w') as output:
        # Average ratings by users
        user_train_means = train.mean(axis=1)
        user_test_means = test.mean(axis=1)
        # Average ratings by movie
        movie_train_means = train.mean(axis=0)
        # Apply adjustment
        train_adj = train.sub(train_means, axis=0)
        test_adj = test.sub(test_means, axis=0)
        for active_uid in users:
            active_user_ratings = test.loc[active_uid].dropna()
            for mid in movies[active_uid]:
                # Get the ratings for the movie we want to predict
                active_movie_ratings = train_adj.loc[:, mid]
                # Each row is a movie, each column is a user
                subset = train_adj.loc[:, active_user_ratings.index].T
                weights = subset.apply(similarity, axis=1, args=(active_movie_ratings,))
                weights.sort_values(inplace=True, ascending=False)
                ratings = active_user_ratings.reindex(weights.index)
                weights = weights[:k]
                ratings = ratings[:k]
                den = sum(abs(weights))
                if den == 0:
                    predicted = movie_train_means.loc[mid]
                else:
                    predicted = weights.dot(ratings) / den
                if predicted < 0.5:
                    predicted = 1
                if predicted > 5:
                    predicted = 5
                s = str(active_uid) + ' ' + str(mid) + ' ' + str(int(round(predicted))) + '\n'
                print(s)
                output.write(s)