In [4]:
import pandas as pd
import numpy as np
import nmf

udata = pd.read_table('ml-100k/u.data', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

R = pd.DataFrame(index=range(0,943),columns=range(0,1682))

for row in udata.itertuples():
    R[row[1]-1][row[2]-1] = row[3]

R = R.fillna(0)

W = R.copy()
W = W.fillna(0)
W[W > 0] = 1

In [5]:
### Problem 1 ###
k = [10, 50, 100]

U10, V10 = nmf.nmf(R, k[0])
U50, V50 = nmf.nmf(R, k[1])
U100, V100 = nmf.nmf(R, k[2])
# NMF_model10 = decomposition.NMF(k[0])
# NMF_model50 = decomposition.NMF(k[1])
# NMF_model100 = decomposition.NMF(k[2])

# U10 = NMF_model10.fit_transform(R, W=W)
# V10 = NMF_model10.components_
# U50 = NMF_model50.fit_transform(R, W=W)
# V50 = NMF_model50.components_
# U100 = NMF_model100.fit_transform(R, W=W)
# V100 = NMF_model100.components_

def lse(w, r, u, v):
    error = w * (r - u.dot(v))**2
    return error.sum().sum()

err10 = lse(W, R, U10, V10)
err50 = lse(W, R, U50, V50)
err100 = lse(W, R, U100, V100)

In [6]:
print("For k = %d, the least squared error is %f" % (k[0], err10))
print("For k = %d, the least squared error is %f" % (k[1], err50))
print("For k = %d, the least squared error is %f" % (k[2], err100))

For k = 10, the least squared error is 3733.325285
For k = 50, the least squared error is 68.479936
For k = 100, the least squared error is 6.340439


In [None]:
### Problem 2 ###
# k-fold
from scipy.sparse import csr_matrix
from sklearn.cross_validation import KFold
from sklearn.metrics import auc
import matplotlib.pyplot as plt

A = csr_matrix(R.values)
N = A.getnnz()
nonzero_indices = zip(A.nonzero()[0], A.nonzero()[1])

def abs_err(actual, u, v, w, size):
    error = abs(w * (u.dot(v) - actual))
    return error.sum().sum() / size

def set_values(M, rows, columns, value):
    for index in zip(rows, columns):
        M.ix[index] = value

kf = KFold(n=N, n_folds=10, shuffle=True)
errors = []
i = 1
for train_index, test_index in kf:
    ri = A.nonzero()[0][test_index]
    ci = A.nonzero()[1][test_index]
    train = R.copy()
    set_values(train, ri, ci, 0)
    w = pd.DataFrame(index=range(0,943),columns=range(0,1682))
    w = w.fillna(0)
    set_values(w, ri, ci, 1)
    test_size = len(test_index)
    u,v = nmf.nmf(train, 100)
    errors.append(abs_err(R, u, v, w, test_size))
    print(errors[-1])
    
    ### Problem 3 ###
    precs = []
    recalls = []
    for threshold in range(1,6):
        print("-------------------")
        print("\tThreshold = %d" % threshold)
        test_result = pd.DataFrame(u.dot(v))
        pred = test_result.lookup(A.nonzero()[0][test_index], A.nonzero()[1][test_index])
        true = R.lookup(A.nonzero()[0][test_index], A.nonzero()[1][test_index])
        precision = 0 if np.sum(pred > threshold) == 0 \
                      else np.sum(true[pred > threshold] > threshold) / float(np.sum(pred > threshold)) * 100
        print("\tprecision: %.2f%%" % precision)
        recall = 0 if np.sum(true > threshold) == 0 \
                   else np.sum(pred[true > threshold] > threshold) / float(np.sum(true > threshold)) * 100
        print("\trecall: %.2f%%" % recall)
        precs.append(precision)
        recalls.append(recall)
    print("\tarea under curve: %.2f" % auc(recalls, precs))
    plt.cla()
    plt.xlim(xmin=0, xmax=np.max(recalls)*1.1)
    plt.ylim(ymin=0, ymax=np.max(precs)*1.1)
    plt.scatter(recalls, precs, s=60, marker='o')
    plt.plot(recalls, precs)
    plt.fill_between(recalls, precs, 0)
    plt.title("ROC Precision over Recall " + str(i))
    plt.xlabel('Recall (%)')
    plt.ylabel('Precision (%)')
    plt.savefig('graphs/problem3_ROC' + str(i))
    plt.close()
    i = i + 1

print("Highest absolute error is %f" % max(errors))
print("Lowest absolute error is %f" % min(errors))

1.04997694103
-------------------
	Threshold = 1
	precision: 94.98%
	recall: 98.56%
-------------------
	Threshold = 2
	precision: 87.45%
	recall: 92.70%
-------------------
	Threshold = 3
	precision: 70.82%
	recall: 67.73%
-------------------
	Threshold = 4
	precision: 40.00%
	recall: 34.67%
-------------------
	Threshold = 5
	precision: 0.00%
	recall: 0.00%
	area under curve: 5035.88
1.09236301626
-------------------
	Threshold = 1
	precision: 94.53%
	recall: 98.56%
-------------------
	Threshold = 2
	precision: 87.02%
	recall: 92.38%
-------------------
	Threshold = 3
	precision: 69.01%
	recall: 68.20%
-------------------
	Threshold = 4
	precision: 37.07%
	recall: 27.05%
-------------------
	Threshold = 5
	precision: 0.00%
	recall: 0.00%
	area under curve: 5131.41
1.08051959449
-------------------
	Threshold = 1
	precision: 94.04%
	recall: 99.03%
-------------------
	Threshold = 2
	precision: 87.20%
	recall: 93.45%
-------------------
	Threshold = 3
	precision: 68.28%
	recall: 68.86

In [None]:
### Problem 4 ###
new_W = R.copy()
new_R = W.copy()

U10, V10 = nmf.nmf(R, k[0])
U50, V50 = nmf.nmf(R, k[1])
U100, V100 = nmf.nmf(R, k[2])
# U10 = NMF_model10.fit_transform(new_R, W=new_W)
# V10 = NMF_model10.components_
# U50 = NMF_model50.fit_transform(new_R, W=new_W)
# V50 = NMF_model50.components_
# U100 = NMF_model100.fit_transform(new_R, W=new_W)
# V100 = NMF_model100.components_

new_err10 = lse(W, new_R, U10, V10)
new_err50 = lse(W, new_R, U50, V50)
new_err100 = lse(W, new_R, U100, V100)

In [None]:
print("For k = %d, the least squared error is %f" % (k[0], new_err10))
print("For k = %d, the least squared error is %f" % (k[1], new_err50))
print("For k = %d, the least squared error is %f" % (k[2], new_err100)) 
# Total squared error decreased

In [None]:
new_errors = []
new_NMF_model100 = decomposition.NMF(100, alpha=l, l1_ratio=0.0)
lambdas = [0.01, 0.1, 1]

i = 1
for train_index, test_index in kf:
    ri = A.nonzero()[0][test_index]
    ci = A.nonzero()[1][test_index]
    # set 10% test ratings to 0
    train = new_R.copy()
    set_values(train, ri, ci, 0)
    # set 10% test weights to 0
    w = new_W.copy()
    set_values(w, ri, ci, 0)
    # create filter for the 10% testing samples
    testw = pd.DataFrame(index=range(0,943),columns=range(0,1682))
    testw = w.fillna(0)
    set_values(testw, ri, ci, 1)
    test_size = len(test_index)
    # run alternating least square for each lambda value
    for l in lambdas:
        err = []
        u = new_NMF_model100.fit_transform(train, W = w)
        v = new_NMF_model100.components_
        err.append(abs_err(R, u, v, testw, test_size))
        print("lambda = %f; error = %f" % (l, err[-1]))
    new_errors.append(err)
    
    new_precs = []
    new_recalls = []
    for threshold in range(1,6):
        print("-------------------")
        print("\tThreshold = %d" % threshold)
        test_result = pd.DataFrame(u.dot(v))
        pred = test_result.lookup(A.nonzero()[0][test_index], A.nonzero()[1][test_index])
        true = R.lookup(A.nonzero()[0][test_index], A.nonzero()[1][test_index])
        precision = 0 if np.sum(pred > threshold) == 0 \
                      else np.sum(true[pred > threshold] > threshold) / float(np.sum(pred > threshold)) * 100
        print("\tprecision: %.2f%%" % precision)
        recall = 0 if np.sum(true > threshold) == 0 \
                   else np.sum(pred[true > threshold] > threshold) / float(np.sum(true > threshold)) * 100
        print()"\trecall: %.2f%%" % recall) 
        new_precs.append(precision)
        new_recalls.append(recall)
    print("\tarea under curve: %.2f" % auc(new_recalls, new_precs))
    plt.cla()
    plt.xlim(xmin=0, xmax=np.max(new_recalls)*1.1)
    plt.ylim(ymin=0, ymax=np.max(new_precs)*1.1)
    plt.scatter(new_recalls, new_precs, s=60, marker='o')
    plt.plot(new_recalls, new_precs)
    plt.fill_between(new_recalls, new_precs, 0)
    plt.title("new ROC Precision over Recall " + str(i))
    plt.xlabel('Recall (%)')
    plt.ylabel('Precision (%)')
    plt.savefig('graphs/problem4_ROC' + str(i))
    plt.close()
    i = i + 1

print "Highest absolute error is %f" % max(errors)
print "Lowest absolute error is %f" % min(errors)

In [None]:
### Problem 5 ###
type(R)

