In [1]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [2]:
def cos_sim(A, B):
    """Return the cosine similarity between vectors ``A`` and ``B``.

    Computed as ``dot(A, B) / (norm(A) * norm(B))``.

    Parameters
    ----------
    A, B : array-like
        Numeric vectors of equal length (this notebook passes 1-D arrays).

    Returns
    -------
    float
        Cosine of the angle between the two vectors. Floating-point
        rounding can push the value marginally outside ``[-1, 1]``.
        If either vector has zero norm the angle is undefined; ``0.0`` is
        returned in that case instead of dividing by zero (which would
        yield ``nan`` plus a RuntimeWarning).
    """
    denominator = norm(A) * norm(B)
    # A zero vector has no direction; treat it as "no similarity".
    if denominator == 0:
        return 0.0
    return dot(A, B) / denominator

In [3]:
# Toy term-count vectors for the cosine-similarity demo.
doc1, doc2 = np.array([0, 1, 1, 1]), np.array([1, 0, 1, 1])
# doc3 is doc2 scaled by 2: same direction, different magnitude.
doc3 = 2 * doc2

In [4]:
cos_sim(doc1, doc2)

0.6666666666666667

In [5]:
cos_sim(doc1, doc3)

0.6666666666666667

In [6]:
cos_sim(doc2, doc3)

1.0000000000000002

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Load the movie metadata table from the working directory.
# low_memory=False has pandas process the file in one pass instead of in
# chunks, which avoids mixed-dtype inference on columns with irregular values.
data = pd.read_csv('movies_metadata.csv', low_memory=False)
# Preview the first two rows.
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [9]:
# Keep only the first 20,000 rows (presumably to bound the size of the
# pairwise similarity matrix computed later). .copy() makes this an
# independent frame, so the later column assignment on `data` does not write
# into a slice of the frame returned by read_csv (SettingWithCopyWarning).
data = data.head(20000).copy()

In [10]:
data['overview'].isnull().sum()

135

In [11]:
# Replace missing overviews with empty strings so every entry is a str before
# vectorizing (the vectorizer expects text documents, not NaN).
data['overview'] = data['overview'].fillna('')

In [12]:
# Build TF-IDF vectors from the overviews, dropping scikit-learn's built-in
# English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# One row per overview, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(data['overview'])
tfidf_matrix.shape

(20000, 47487)

In [14]:
# Pairwise cosine similarity between every pair of overviews.
# NOTE: the result is a dense n x n array; for 20,000 rows that is 4e8 entries
# (about 3.2 GB assuming float64), so this cell is memory-hungry.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

(20000, 20000)

In [15]:
# Map each movie title to its index label in `data`. When a title occurs more
# than once, the later row overwrites the earlier one.
title_to_index = {title: label for title, label in zip(data['title'], data.index)}

In [16]:
# Look up the index label of one example title and display it.
idx = title_to_index['Father of the Bride Part II']
idx


4

In [18]:
cosine_sim

array([[1.        , 0.01575748, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01575748, 1.        , 0.04907345, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.04907345, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.08375766],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.08375766, 0.        ,
        1.        ]])

In [22]:
# Pair every movie position with its similarity to the example movie. Uses
# `idx` from the lookup cell instead of a hard-coded row number, so this cell
# follows whichever title was selected there.
temp = list(enumerate(cosine_sim[idx]))
# Show the first 11 (position, score) pairs.
temp[:11]

[(0, 0.0),
 (1, 0.0),
 (2, 0.025004916790732457),
 (3, 0.0),
 (4, 1.0),
 (5, 0.0),
 (6, 0.03297982155878723),
 (7, 0.0),
 (8, 0.032751274283663236),
 (9, 0.0),
 (10, 0.0)]

In [23]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : hashable
        Key looked up in the module-level ``title_to_index`` mapping.
    cosine_sim : 2-D indexable, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the matrix bound to ``cosine_sim``
        at the time this function was defined.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``data['title']`` entries for the ``top_n`` highest-scoring movies
        other than the query itself, most similar first. Ties keep their
        original row order.

    Raises
    ------
    KeyError
        If ``title`` is not a key of ``title_to_index``.
    """
    # The looked-up value is used as a row position in `cosine_sim` and, via
    # .iloc below, in `data`, so title_to_index must hold positional indices.
    idx = title_to_index[title]

    # Drop the query movie explicitly rather than assuming it sorts first:
    # another movie with an identical overview ties with it at the top, and a
    # movie with an empty overview scores 0 against everything, itself included.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    # list.sort is stable, so equal scores stay in row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in sim_scores[:top_n]]
    return data['title'].iloc[movie_indices]

In [24]:
get_recommendations('The Dark Knight Rises')

12481                            The Dark Knight
150                               Batman Forever
1328                              Batman Returns
15511                 Batman: Under the Red Hood
585                                       Batman
9230          Batman Beyond: Return of the Joker
18035                           Batman: Year One
19792    Batman: The Dark Knight Returns, Part 1
3095                Batman: Mask of the Phantasm
10122                              Batman Begins
Name: title, dtype: object

In [25]:
def dist(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    of the same shape); the result is the square root of the summed squared
    differences.
    """
    delta = x - y
    squared = delta ** 2
    return np.sqrt(np.sum(squared))

In [26]:
# Toy count vectors for the Euclidean-distance demo; docQ is the query that
# the other three documents are compared against.
doc1, doc2, doc3, docQ = map(
    np.array,
    [
        (2, 3, 0, 1),
        (1, 2, 3, 1),
        (2, 1, 2, 2),
        (1, 1, 0, 1),
    ],
)

In [27]:
dist(doc1, docQ)

2.23606797749979

In [28]:
dist(doc2, docQ)


3.1622776601683795

In [29]:
dist(doc3, docQ)


2.449489742783178

In [30]:
# Two toy documents (whitespace-separated words) for the Jaccard-similarity demo.
doc1 = "apple banana everyone like likey watch card holder"
doc2 = "apple banana coupon passport love you"

In [31]:
# Tokenize both documents on whitespace.
tokenized_doc1, tokenized_doc2 = doc1.split(), doc2.split()

In [32]:
tokenized_doc1

['apple', 'banana', 'everyone', 'like', 'likey', 'watch', 'card', 'holder']

In [35]:
tokenized_doc2

['apple', 'banana', 'coupon', 'passport', 'love', 'you']

In [36]:
# Every distinct word that appears in at least one of the two documents.
union = set(tokenized_doc1) | set(tokenized_doc2)
union

{'apple',
 'banana',
 'card',
 'coupon',
 'everyone',
 'holder',
 'like',
 'likey',
 'love',
 'passport',
 'watch',
 'you'}

In [37]:
# Distinct words that the two documents have in common.
intersection = set(tokenized_doc1) & set(tokenized_doc2)
intersection

{'apple', 'banana'}

In [38]:
# Jaccard similarity: size of the intersection divided by size of the union.
len(intersection) / len(union)

0.16666666666666666