In [1]:
import dask.dataframe as dd
import numpy as np
import pandas as pd

In [2]:
ratings = dd.read_table("ml-100k/u.data",names = ["userID","movieID","rating","_"],usecols = ["userID","movieID","rating"],
                    dtype = {"rating":np.float64}).set_index("userID")

In [3]:
ratings.head()

Unnamed: 0_level_0,movieID,rating
userID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,55,5.0
1,203,4.0
1,183,5.0
1,150,5.0
1,68,4.0


In [4]:
joinedRatings = ratings.join(ratings,lsuffix='_1', rsuffix='_2')
joinedRatings.head()

Unnamed: 0_level_0,movieID_1,rating_1,movieID_2,rating_2
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,55,5.0,55,5.0
1,55,5.0,203,4.0
1,55,5.0,183,5.0
1,55,5.0,150,5.0
1,55,5.0,68,4.0


In [5]:
joinedRatings.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 4 entries, movieID_1 to rating_2
dtypes: float64(2), int64(2)

In [6]:
uniqueJoinedRatings = joinedRatings.query("movieID_1 < movieID_2")
uniqueJoinedRatings.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 4 entries, movieID_1 to rating_2
dtypes: float64(2), int64(2)

In [7]:
moviePairs = uniqueJoinedRatings.set_index("movieID_1")
moviePairs.head()
moviePairs.shape

(Delayed('int-c03bca2f-26e7-4cfc-bba8-77e7bcd5860f'), 3)

In [8]:
moviePairRatings = moviePairs.groupby(["movieID_1","movieID_2"])

In [13]:
moviePairRatings.count().info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 2 entries, rating_1 to rating_2
dtypes: int64(2)

In [9]:
moviePairRatings.size().shape

(dd.Scalar<size-ag..., dtype=int32>,)

In [10]:
from math import sqrt
def computeCosineSimilarity(ratingPairs):
    x = ratingPairs["rating_1"]
    y = ratingPairs["rating_2"]
    
    #sum_xx = x @ x.T
    #sum_yy = y @ y.T
    #sum_xy = x @ y.T
    sum_xx = x.dot(x.T)
    sum_yy = y.dot(y.T)
    sum_xy = x.dot(y.T)
    
    numerator = sum_xy
    denominator = sqrt(sum_xx) * sqrt(sum_yy)

    score = 0
    if (denominator):
        score = (numerator / (float(denominator)))

    return pd.Series({"score":score, "numPairs":ratingPairs.shape[0]})

In [17]:
moviePairSimilarities = moviePairRatings.apply(computeCosineSimilarity, meta={'score': 'f8', 'numPairs': 'i8'})

moviePairSimilarities = moviePairSimilarities.compute()

In [18]:
movieNames = pd.read_table("ml-100k/u.item",names = ["movieID","title"],usecols = ["movieID","title"],
                    sep ="|",index_col = "movieID", encoding = "cp1252")
movieNames.head()


Unnamed: 0_level_0,title
movieID,Unnamed: 1_level_1
1,Toy Story (1995)
2,GoldenEye (1995)
3,Four Rooms (1995)
4,Get Shorty (1995)
5,Copycat (1995)


In [19]:
movieID = 50
scoreThreshold = 0.97
coOccurenceThreshold = 50


In [20]:
filteredResults = moviePairSimilarities.query(
    "((movieID_1 == @movieID) or (movieID_2 == @movieID)) and (score>@scoreThreshold) and (numPairs>@coOccurenceThreshold)"
    ).sort_values(by="score",ascending=False) 
filteredResults

Unnamed: 0_level_0,Unnamed: 1_level_0,score,numPairs
movieID_1,movieID_2,Unnamed: 2_level_1,Unnamed: 3_level_1
50,172,0.989552,345.0
50,181,0.985723,480.0
50,174,0.98176,380.0
50,141,0.978939,68.0
50,178,0.977658,109.0
50,408,0.977595,92.0
50,498,0.976469,138.0
50,194,0.975151,204.0
50,169,0.974868,103.0
50,114,0.974182,58.0


In [21]:
for i,v in filteredResults.iterrows():
    if i[0] == movieID:
        recommended = i[1]
    else:
        recommended = i[0]
    nR = movieNames.loc[recommended,"title"]
    print(nR,v["score"])

Empire Strikes Back, The (1980) 0.9895522078385338
Return of the Jedi (1983) 0.9857230861253026
Raiders of the Lost Ark (1981) 0.981760098872619
20,000 Leagues Under the Sea (1954) 0.9789385605497993
12 Angry Men (1957) 0.9776576120448436
Close Shave, A (1995) 0.9775948291054827
African Queen, The (1951) 0.9764692222674887
Sting, The (1973) 0.9751512937740359
Wrong Trousers, The (1993) 0.9748681355460885
Wallace & Gromit: The Best of Aardman Animation (1996) 0.9741816128302572
Indiana Jones and the Last Crusade (1989) 0.9735394829992481
North by Northwest (1959) 0.9734534315266805
Philadelphia Story, The (1940) 0.9734294611633468
Bridge on the River Kwai, The (1957) 0.9727591639531913
Casablanca (1942) 0.9726570623726027
L.A. Confidential (1997) 0.9725071588724558
Usual Suspects, The (1995) 0.9724956031333988
Around the World in 80 Days (1956) 0.972453952686859
Right Stuff, The (1983) 0.9716966796009308
Glory (1989) 0.9716728035350273
Princess Bride, The (1987) 0.9713874963443219
Pinoc

In [1]:
moviePairSimilarities.to_cvs("similarities.json")

NameError: name 'moviePairSimilarities' is not defined