In [68]:
import pandas as pd
import numpy as np
from numpy import *

In [6]:
def load_jokes(file):
    jokes = np.genfromtxt(file, delimiter=',', dtype=str)
    jokes = np.array(jokes[:,1])
    return jokes

In [7]:
jokes = load_jokes('jokes.csv')

In [8]:
jokes[2]

"Q. What's 200 feet long and has 4 teeth? A. The front row at a Willie Nelson Concert."

In [42]:
data = np.genfromtxt('modified_jester_data.csv',delimiter=',')
data

array([[ 3.18, 19.79,  1.34, ...,  0.  ,  0.  ,  0.  ],
       [15.08, 10.71, 17.36, ..., 11.34,  6.68, 12.07],
       [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       ...,
       [16.58, 16.63, 15.85, ...,  0.  ,  0.  ,  0.  ],
       [ 3.67,  4.45,  3.67, ...,  3.77,  3.77,  3.28],
       [ 9.88, 11.73,  9.16, ...,  0.  ,  0.  ,  0.  ]])

In [43]:
data[2]

array([ 0.  ,  0.  ,  0.  ,  0.  , 20.03, 20.27, 20.03, 20.27,  0.  ,
        0.  , 18.33, 18.57, 20.37, 17.17,  4.64,  4.11,  3.14, 20.03,
       20.03, 20.03, 18.28,  0.  , 19.25,  0.  ,  0.  , 18.48, 18.28,
       18.28, 19.93,  0.  , 17.17, 18.28,  0.  ,  0.  , 19.98, 18.33,
        0.  , 17.17, 20.08, 18.33, 18.52, 20.27, 20.27,  0.  , 20.27,
        0.  , 17.17, 18.33, 20.08, 18.28,  0.  ,  0.  , 18.28, 18.33,
        0.  , 18.23,  0.  ,  0.  ,  0.  , 20.27, 17.46, 18.28,  0.  ,
        0.  , 18.04, 18.28,  0.  , 18.28, 19.25,  0.  ,  0.  ,  0.  ,
        0.  ,  0.  ,  0.  , 19.93,  0.  ,  0.  ,  0.  , 20.08,  0.  ,
        0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  , 20.03,
        0.  ,  0.  ,  0.  , 20.08,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        0.  ])

In [69]:
def pearsSim(inA,inB):
    if len(inA) < 3 : return 1.0
    return 0.5 + 0.5 * corrcoef(inA, inB, rowvar = 0)[0][1]

In [45]:
def standEst(dataMat, user, simMeas, item):
    n = shape(dataMat)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    for j in range(n):
        userRating = dataMat[user,j]
        if userRating == 0: continue
        overLap = nonzero(logical_and(dataMat[:,item]>0, \
                                      dataMat[:,j]>0))[0]
        if len(overLap) == 0: similarity = 0
        else: similarity = simMeas(dataMat[overLap,item], \
                                   dataMat[overLap,j])
        #print 'the %d and %d similarity is: %f' % (item, j, similarity)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: return 0
    else: return ratSimTotal/simTotal

In [46]:
def svdEst(dataMat, user, simMeas, item):
    n = shape(dataMat)[1]
    simTotal = 0.0; ratSimTotal = 0.0
    data=mat(dataMat)
    U,Sigma,VT = la.svd(data)
    Sig4 = mat(eye(4)*Sigma[:4]) #arrange Sig4 into a diagonal matrix
    xformedItems = data.T * U[:,:4] * Sig4.I  #create transformed items
    for j in range(n):
        userRating = data[user,j]
        if userRating == 0 or j==item: continue
        similarity = simMeas(xformedItems[item,:].T,\
                             xformedItems[j,:].T)
        #print 'the %d and %d similarity is: %f' % (item, j, similarity)
        simTotal += similarity
        ratSimTotal += similarity * userRating
    if simTotal == 0: return 0
    else: return ratSimTotal/simTotal

In [59]:
def cross_validate_user(dataMat, user, test_ratio, estMethod=standEst, simMeas=pearsSim):
    number_of_items = np.shape(dataMat)[1]
    rated_items_by_user = np.array([i for i in range(number_of_items) if dataMat[user,i]>0])
    test_size = test_ratio * len(rated_items_by_user)
    test_indices = np.random.randint(0, len(rated_items_by_user), test_size)
    withheld_items = rated_items_by_user[test_indices]
    original_user_profile = np.copy(dataMat[user])
    dataMat[user, withheld_items] = 0 # So that the withheld test items is not used in the rating estimation below
    error_u = 0.0
    count_u = len(withheld_items)

    # Compute absolute error for user u over all test items
    for item in withheld_items:
        # Estimate rating on the withheld item
        estimatedScore = estMethod(dataMat, user, simMeas, item)
        error_u = error_u + abs(estimatedScore - original_user_profile[item])

    # Now restore ratings of the withheld items to the user profile
    for item in withheld_items:
        dataMat[user, item] = original_user_profile[item]
    # Return sum of absolute errors and the count of test cases for this user
    # Note that these will have to be accumulated for each user to compute MAE
    return error_u, count_u

In [91]:
def test(dataMat, test_ratio, estMethod):
    
    tot_error = 0
    tot_count = 0
    
    for i in range(dataMat.shape[0]):
        if estMethod == 'standEst':
            error_u, count_u = cross_validate_user(dataMat, i, test_ratio, standEst)
            tot_error = tot_error + error_u
            tot_count = tot_count + count_u
        
        if estMethod == 'svdEst':
            error_u, count_u = cross_validate_user(dataMat, i, test_ratio, svdEst)
            tot_error = tot_error + error_u
            tot_count = tot_count + count_u
            
    MAE = tot_error/tot_count
            
    print ('Mean Absoloute Error for ',estMethod,' : ', MAE)

In [None]:
test(data, 0.2, 'standEst')

In [None]:
test(data, 0.2, 'svdEst')

In [86]:
def print_most_similar_jokes(dataMat, jokes, queryJoke, k, metric=pearsSim):
    
    print(" Query Joke Number:", queryJoke,":", jokes[queryJoke], "\n")
    
    data_transposed = dataMat.T
    joke_list = []
    for i in range(data_transposed.shape[0]):
        
        distance = metric(data_transposed[queryJoke], data_transposed[i])
        
        user_dist = [distance, i]
        
        joke_list.append(user_dist)
    
    joke_list.sort()
    
    k_jokes = joke_list[1:k+1]
    
    print("The best", k, "similar jokes are:", "\n")
    
    for i in range(len(k_jokes)):
        print(jokes[k_jokes[i][1]])

In [87]:
print_most_similar_jokes(data, jokes, 1, 4)

 Query Joke Number: 1 : This couple had an excellent relationship going until one day he came home from work to find his girlfriend packing. He asked her why she was leaving him and she told him that she had heard awful things about him. "What could they possibly have said to make you move out?" "They told me that you were a pedophile." He replied "That's an awfully big word for a ten year old." 

The best 4 similar jokes are: 

Q: If a person who speaks three languages is called "tri-lingual" and a person who speaks two languages is called "bi-lingual" what do calla person who only speaks one language?A: American!
Q. What's O. J. Simpson's Internet address? A.	Slash slash backslash slash slash escape.
What's the difference between a MacIntosh and anEtch-A-Sketch? You don't have to shake the Mac to clear the screen.
Q:  What did the blind person say when given some matzah?A:  Who the hell wrote this?
