In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Input files, all located under BASE.
BASE  = './bytecup2016data'
IINFO = BASE + '/invited_info_train.txt'
QINFO = BASE + '/question_info.txt'
UINFO = BASE + '/user_info.txt'
VAL   = BASE + '/validate_nolabel.txt'

# The three training files are header-less and whitespace-delimited.
# `sep=r'\s+'` is the equivalent of the deprecated `delim_whitespace=True`
# and works on old and new pandas alike.
invdata = pd.read_csv(IINFO, sep=r'\s+', header=None,
                      names=["qid", "uid", "answered"])
qdata   = pd.read_csv(QINFO, sep=r'\s+', header=None,
                      names=["qid", "qtag", "wseq", "cseq", "nvotes", "nans", "ntqans"])
udata   = pd.read_csv(UINFO, sep=r'\s+', header=None,
                      names=["uid", "exptag", "wseq", "cseq"])
# The validation file is read with pandas defaults (comma-separated, header row).
valdata = pd.read_csv(VAL)

# Attach invitations to questions, then keep one row per known user
# (right join on `udata`), so users without invitations are still present.
merged_data = (
    qdata
    .merge(invdata, on="qid", how="left")
    .merge(udata, on="uid", how="right")
)

# uid x qid matrix of the `answered` flag; pairs that never occur stay NaN.
ratings_mtx_df = merged_data.pivot_table(values='answered',
                                         index='uid',
                                         columns='qid')


In [6]:
# Expand the pivot so it has one row per user in `udata` and one column per
# question in `qdata`; cells without an observed invitation become NaN.
# A single reindex on both axes replaces the previous trick of concatenating
# an empty DataFrame, which could change column dtypes and relies on
# pd.concat's handling of empty frames.
ratings_mtx_df = ratings_mtx_df.reindex(index=udata.uid, columns=qdata.qid)

In [8]:
# Replace every missing entry of the rating matrix with 0.
data = ratings_mtx_df.fillna(value=0)

In [23]:
# Dense uid x qid rating matrix as floats. `.values` replaces
# `DataFrame.as_matrix()`, which no longer exists in current pandas.
user_preferences = data.values.astype(float)

# `pairwise_distances(metric='cosine')` returns cosine *distance*
# (1 - cosine similarity). `predict` uses this array as neighbour weights,
# so convert it back to a similarity where larger means "more alike".
user_similarity = 1.0 - pairwise_distances(user_preferences, metric='cosine')

In [7]:
def predict(ratings, similarity, type='user'):
    """Predict ratings by similarity-weighted collaborative filtering.

    Parameters
    ----------
    ratings : array-like, shape (n_users, n_items)
        Rating matrix.
    similarity : array-like
        Square similarity matrix: (n_users, n_users) for ``type='user'``,
        (n_items, n_items) for ``type='item'``.
    type : {'user', 'item'}
        Which neighbourhood to use. (The name shadows the builtin but is kept
        so existing keyword calls continue to work.)

    Returns
    -------
    numpy.ndarray, shape (n_users, n_items)
        Predicted ratings. For ``'user'`` the prediction is the user's mean
        rating plus the similarity-weighted average of the neighbours'
        mean-centred ratings. For ``'item'`` it is the similarity-weighted
        average of the user's own ratings.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.

    Notes
    -----
    Where the absolute similarities used as a normaliser sum to zero, the
    weighted term is set to 0 instead of dividing by zero, so the result
    falls back to the user's mean (``'user'``) or to 0 (``'item'``).
    """
    ratings = np.asarray(ratings, dtype=float)
    similarity = np.asarray(similarity, dtype=float)

    if type == 'user':
        # keepdims keeps the means as a column so they broadcast over items.
        mean_user_rating = ratings.mean(axis=1, keepdims=True)
        ratings_diff = ratings - mean_user_rating
        weighted = similarity.dot(ratings_diff)
        norm = np.abs(similarity).sum(axis=1, keepdims=True)
        adjustment = np.divide(weighted, norm,
                               out=np.zeros_like(weighted),
                               where=norm != 0)
        return mean_user_rating + adjustment

    if type == 'item':
        weighted = ratings.dot(similarity)
        # Row vector of per-item normalisers, broadcast over users.
        norm = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        return np.divide(weighted, norm,
                         out=np.zeros_like(weighted),
                         where=norm != 0)

    raise ValueError("type must be 'user' or 'item', got %r" % (type,))

In [8]:
# Score every (user, question) pair with the user-based predictor.
user_prediction = predict(ratings=user_preferences,
                          similarity=user_similarity,
                          type='user')

In [9]:
# Wrap the raw prediction array with the row/column labels of `data`
# so that predictions can be looked up by label.
user_predictions_df = pd.DataFrame(
    user_prediction,
    index=data.index,
    columns=data.columns,
)

In [11]:
# Create (or reset) the `label` column with empty-string placeholders
# before it is filled with predictions.
valdata["label"] = ""

In [19]:
n_validation_users = len(valdata.index)

# Translate each validation (uid, qid) pair into row/column positions of the
# prediction matrix in one pass, instead of one .iloc/.loc round trip per row.
# get_indexer returns -1 for labels that are absent (and requires the
# index/columns of the prediction frame to be unique).
row_pos = user_predictions_df.index.get_indexer(valdata['uid'])
col_pos = user_predictions_df.columns.get_indexer(valdata['qid'])
known = (row_pos >= 0) & (col_pos >= 0)

# Pairs with no prediction available are left as NaN rather than raising.
labels = np.full(n_validation_users, np.nan)
labels[known] = user_predictions_df.values[row_pos[known], col_pos[known]]

# The output column is `label`; the former 'labels' lookup raised KeyError.
valdata['label'] = labels

In [21]:
# Write the user-based predictions as comma-separated values.
# `to_csv` takes the delimiter as `sep`; `separator` is not an accepted
# keyword and raises TypeError.
# NOTE(review): the row index is written as the first column, as in the
# original call; pass index=False if it is not wanted in the output file.
valdata.to_csv('user-based_results.csv', sep=",")

In [22]:
# Low-rank reconstruction of the rating matrix with a truncated SVD
from scipy.sparse.linalg import svds

# Factor the training matrix into k singular triplets (k is a free choice).
u, s, vt = svds(user_preferences, k=20)
s_diag_matrix = np.diag(s)
# Multiply the factors back together: U . diag(s) . Vt
pred = u.dot(s_diag_matrix).dot(vt)

In [24]:
# Wrap the SVD reconstruction with the row/column labels of `data`
# so that predictions can be looked up by label.
svd_predictions_df = pd.DataFrame(
    pred,
    index=data.index,
    columns=data.columns,
)

In [25]:
# Reset the `label` column to empty-string placeholders before it is
# filled with the SVD-based predictions.
valdata["label"] = ""

In [26]:
n_validation_users = len(valdata.index)

# Translate each validation (uid, qid) pair into row/column positions of the
# SVD prediction matrix in one pass, instead of one .iloc/.loc round trip per
# row. get_indexer returns -1 for labels that are absent (and requires the
# index/columns of the prediction frame to be unique).
row_pos = svd_predictions_df.index.get_indexer(valdata['uid'])
col_pos = svd_predictions_df.columns.get_indexer(valdata['qid'])
known = (row_pos >= 0) & (col_pos >= 0)

# Pairs with no prediction available are left as NaN rather than raising.
labels = np.full(n_validation_users, np.nan)
labels[known] = svd_predictions_df.values[row_pos[known], col_pos[known]]

# The output column is `label`; the former 'labels' lookup raised KeyError.
valdata['label'] = labels

In [27]:
# Write the SVD-based predictions as comma-separated values.
# `to_csv` takes the delimiter as `sep`; `separator` is not an accepted
# keyword and raises TypeError.
# NOTE(review): the row index is written as the first column, as in the
# original call; pass index=False if it is not wanted in the output file.
valdata.to_csv('svd-based_results.csv', sep=",")