In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the ratings file (read_table splits on tabs by default). Four columns are
# named, but the last one ("_") is left out of usecols and therefore discarded.
# userID becomes the index so the self-join further down can match on it.
ratings = pd.read_table(
    "ml-100k/u.data",
    names=["userID", "movieID", "rating", "_"],
    usecols=["userID", "movieID", "rating"],
    dtype={"rating": np.float64},
    index_col="userID",
)

In [4]:
# Preview the first rows to check that the columns and the userID index parsed as intended.
ratings.head()

Unnamed: 0_level_0,movieID,rating
userID,Unnamed: 1_level_1,Unnamed: 2_level_1
196,242,3.0
186,302,3.0
22,377,1.0
244,51,2.0
166,346,1.0


In [5]:
# Self-join on the userID index: every rating of a user is paired with every
# rating of that same user (including itself and both orderings of each pair).
# The suffixes tell the two sides of each pair apart.
joinedRatings = ratings.join(
    ratings,
    lsuffix="_1",
    rsuffix="_2",
)
joinedRatings.head()

Unnamed: 0_level_0,movieID_1,rating_1,movieID_2,rating_2
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,61,4.0,61,4.0
1,61,4.0,189,3.0
1,61,4.0,33,4.0
1,61,4.0,160,4.0
1,61,4.0,20,4.0


In [6]:
# Report the row count, dtypes and memory footprint of the self-join result.
joinedRatings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20200812 entries, 1 to 943
Data columns (total 4 columns):
movieID_1    int64
rating_1     float64
movieID_2    int64
rating_2     float64
dtypes: float64(2), int64(2)
memory usage: 770.6 MB


In [7]:
# Keep each unordered movie pair exactly once per user: the strict "<" drops
# both the self-pairs (A, A) and the mirrored duplicates (B, A) of (A, B).
uniqueJoinedRatings = joinedRatings.query(
    "movieID_1 < movieID_2"
)
uniqueJoinedRatings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10050406 entries, 1 to 943
Data columns (total 4 columns):
movieID_1    int64
rating_1     float64
movieID_2    int64
rating_2     float64
dtypes: float64(2), int64(2)
memory usage: 383.4 MB


In [8]:
# Re-index the rating pairs by (movieID_1, movieID_2); the two rating columns
# remain as the only data columns.
moviePairs = uniqueJoinedRatings.set_index(["movieID_1","movieID_2"])
# Only the last expression of a cell is rendered, so the shape is shown here.
# To preview rows, evaluate `moviePairs.head()` as the last line of its own cell.
moviePairs.shape

(10050406, 2)

In [9]:
# Group the rows by movie pair (the two index levels). Each group holds the
# (rating_1, rating_2) rows for one pair of movies.
moviePairRatings = moviePairs.groupby(["movieID_1","movieID_2"])

In [10]:
# Count the rows in each movie-pair group; info() on the result summarises
# how many groups there are and how much memory the counts take.
moviePairRatings.count().info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 983206 entries, (1, 2) to (1679, 1680)
Data columns (total 2 columns):
rating_1    983206 non-null int64
rating_2    983206 non-null int64
dtypes: int64(2)
memory usage: 18.8 MB


In [11]:
# size() yields one entry per group, so its shape is the number of movie-pair groups.
moviePairRatings.size().shape

(983206,)

In [None]:
import random
def chivato(x):
    """Debug probe for ``groupby(...).apply``: occasionally print what it receives.

    Roughly one call in 300001 prints its argument, so running this over every
    group shows a few sample inputs without flooding the output.

    Parameters
    ----------
    x : object
        Whatever ``apply`` passes in; it is only printed, never modified.

    Returns
    -------
    pandas.Series
        Always the same placeholder Series with entries ``a`` = 2 and ``b`` = 3.0.
    """
    # randint is inclusive at both ends, hence 300001 equally likely outcomes.
    shouldPrint = random.randint(0, 300000) == 0
    if shouldPrint:
        print(x)
    placeholder = {"a": 2, "b": 3.0}
    return pd.Series(placeholder)

In [13]:
# Run the debug probe over every group: the occasional prints show what apply()
# hands to the function, and the result shows how the returned Series are combined.
moviePairRatings.apply(chivato)

                     rating_1  rating_2
movieID_1 movieID_2                    
169       1091            4.0       2.0
          1091            5.0       4.0
          1091            5.0       2.0
          1091            5.0       4.0
          1091            3.0       2.0
          1091            5.0       3.0
          1091            3.0       4.0
          1091            5.0       1.0
          1091            3.0       2.0
          1091            1.0       1.0
          1091            3.0       3.0
          1091            5.0       4.0
          1091            4.0       2.0
          1091            4.0       1.0
                     rating_1  rating_2
movieID_1 movieID_2                    
240       1355            1.0       1.0
          1355            3.0       1.0
          1355            4.0       4.0
                     rating_1  rating_2
movieID_1 movieID_2                    
273       1183            3.0       3.0
          1183            5.0       1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
movieID_1,movieID_2,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2,2.0,3.0
1,3,2.0,3.0
1,4,2.0,3.0
1,5,2.0,3.0
1,6,2.0,3.0
1,7,2.0,3.0
1,8,2.0,3.0
1,9,2.0,3.0
1,10,2.0,3.0
1,11,2.0,3.0


In [16]:
from math import sqrt
def computeCosineSimilarity(ratingPairs):
    """Cosine similarity between the two rating columns of one movie pair.

    Parameters
    ----------
    ratingPairs : pandas.DataFrame
        Frame with columns ``rating_1`` and ``rating_2``; each row contributes
        one component to the two rating vectors.

    Returns
    -------
    pandas.Series
        ``score``: dot(rating_1, rating_2) / (|rating_1| * |rating_2|), or 0
        when either vector has zero norm.
        ``numPairs``: number of rows in ``ratingPairs``.
    """
    first = ratingPairs["rating_1"]
    second = ratingPairs["rating_2"]

    dotProduct = first.dot(second)
    normProduct = sqrt(first.dot(first)) * sqrt(second.dot(second))

    # A zero norm would mean dividing by zero; report 0 for that case.
    similarity = dotProduct / normProduct if normProduct else 0

    return pd.Series({"score": similarity, "numPairs": ratingPairs.shape[0]})

In [17]:
# Compute the similarity score and row count for every movie pair.
# NOTE(review): this makes one Python-level call per group, so it is presumably
# the slowest cell in the notebook - consider caching the result.
moviePairSimilarities = moviePairRatings.apply(computeCosineSimilarity)

In [19]:
# Lookup table from movie ID to title. The item file is pipe-separated and is
# decoded as cp1252; only the movieID and title columns are kept.
movieNames = pd.read_table(
    "ml-100k/u.item",
    sep="|",
    names=["movieID", "title"],
    usecols=["movieID", "title"],
    index_col="movieID",
    encoding="cp1252",
)
movieNames.head()


Unnamed: 0_level_0,title
movieID,Unnamed: 1_level_1
1,Toy Story (1995)
2,GoldenEye (1995)
3,Four Rooms (1995)
4,Get Shorty (1995)
5,Copycat (1995)


In [20]:
# Parameters for the recommendation query that follows.
# ID of the movie to find similar movies for.
movieID = 50
# Keep only pairs whose similarity score is strictly greater than this.
scoreThreshold = 0.97
# Keep only pairs whose numPairs is strictly greater than this.
coOccurenceThreshold = 50


In [21]:
# Select every pair that contains the target movie on EITHER side, then apply the
# score and co-occurrence thresholds and rank by score. Pairs are stored with
# movieID_1 < movieID_2, so a movie whose ID is smaller than the target's only
# ever appears together with the target in the movieID_2 position; both index
# levels therefore have to be checked.
filteredResults = moviePairSimilarities.query(
    "((movieID_1 == @movieID) or (movieID_2 == @movieID)) and (score>@scoreThreshold) and (numPairs>@coOccurenceThreshold)"
    ).sort_values(by="score",ascending=False) 
filteredResults

Unnamed: 0_level_0,Unnamed: 1_level_0,numPairs,score
movieID_1,movieID_2,Unnamed: 2_level_1,Unnamed: 3_level_1
50,172,345.0,0.989552
50,181,480.0,0.985723
50,174,380.0,0.98176
50,141,68.0,0.978939
50,178,109.0,0.977658
50,408,92.0,0.977595
50,498,138.0,0.976469
50,194,204.0,0.975151
50,169,103.0,0.974868
50,114,58.0,0.974182


In [22]:
# Each index entry is a (movieID_1, movieID_2) pair; the recommendation is
# whichever side of the pair is not the target movie.
for (firstID, secondID), row in filteredResults.iterrows():
    recommended = secondID if firstID == movieID else firstID
    print(movieNames.loc[recommended, "title"], row["score"])

Empire Strikes Back, The (1980) 0.989552207839
Return of the Jedi (1983) 0.985723086125
Raiders of the Lost Ark (1981) 0.981760098873
20,000 Leagues Under the Sea (1954) 0.97893856055
12 Angry Men (1957) 0.977657612045
Close Shave, A (1995) 0.977594829105
African Queen, The (1951) 0.976469222267
Sting, The (1973) 0.975151293774
Wrong Trousers, The (1993) 0.974868135546
Wallace & Gromit: The Best of Aardman Animation (1996) 0.97418161283
Indiana Jones and the Last Crusade (1989) 0.973539482999
North by Northwest (1959) 0.973453431527
Philadelphia Story, The (1940) 0.973429461163
Bridge on the River Kwai, The (1957) 0.972759163953
Casablanca (1942) 0.972657062373
L.A. Confidential (1997) 0.972507158872
Around the World in 80 Days (1956) 0.972453952687
Right Stuff, The (1983) 0.971696679601
Glory (1989) 0.971672803535
Princess Bride, The (1987) 0.971387496344
Pinocchio (1940) 0.971249981891
Good Will Hunting (1997) 0.971057126565
Vertigo (1958) 0.970218559546


In [1]:
# Persist the similarity table as CSV. The (movieID_1, movieID_2) index is
# written as the first two columns. This cell needs `moviePairSimilarities`
# from the similarity computation cell, so run it in the same kernel session.
moviePairSimilarities.to_csv("similarities.csv")

NameError: name 'moviePairSimilarities' is not defined