# Content-based filtering

Working with a smaller database for the sake of example and learning.

In [2]:
import pandas

# Load the reduced movie catalogue; the file is semicolon-delimited.
movies = pandas.read_csv(
    filepath_or_buffer="movies_small.csv",
    delimiter=";",
)

In [3]:
# Display the loaded frame as the cell output.
movies

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,190000000,"[{""id"": 28, ""name"": ""Action""}]",http://www.furious7.com/,168259,"[{""id"": 830, ""name"": ""car race""}, {""id"": 3428,...",en,Furious 7,Deckard Shaw seeks revenge against Dominic Tor...,102.322217,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...","[{""iso_3166_1"": ""JP"", ""name"": ""Japan""}, {""iso_...",2015-04-01,1506249360,137,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Vengeance Hits Home,Furious 7,7.3,4176
1,200000000,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...",http://www.disney.go.com/cars/,49013,"[{""id"": 830, ""name"": ""car race""}, {""id"": 9663,...",en,Cars 2,Star race car Lightning McQueen and his pal Ma...,49.98659,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2011-06-11,559852396,106,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Ka-ciao!,Cars 2,5.8,2033
2,170000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 878, ""na...",http://marvel.com/guardians,118340,"[{""id"": 8828, ""name"": ""marvel comic""}, {""id"": ...",en,Guardians of the Galaxy,"Light years from Earth, 26 years after being a...",481.098624,"[{""name"": ""Marvel Studios"", ""id"": 420}, {""name...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2014-07-30,773328629,121,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,All heroes start somewhere.,Guardians of the Galaxy,7.9,9742
3,145000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.kungfupanda.com/,140300,"[{""id"": 478, ""name"": ""china""}, {""id"": 779, ""na...",en,Kung Fu Panda 3,"Continuing his ""legendary adventures of awesom...",56.747978,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""CN"", ""name"": ""China""}, {""iso_...",2016-01-23,521170825,95,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Grab destiny by the rice dumplings.,Kung Fu Panda 3,6.7,1603
4,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
5,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


To do content-based filtering we will work with the "overview" column, which contains a description of each movie.

Our objective is to recommend movies similar to one a user has selected - e.g. on Amazon a user is looking at Kung Fu Panda 3 and we recommend similar titles based on the descriptions of the movies. We will do this by comparing every description in our db to every other description in our db. To do this we will use the scikit-learn library (sklearn), which will vectorize our text - turning words into numerical values that can be compared with each other.

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing overviews become empty strings so the vectorizer only ever receives text.
movies["overview"] = movies["overview"].fillna("")

# Vectorizer configured to ignore common English stop words.
tfidf = TfidfVectorizer(stop_words="english")

In [6]:
# Fit the vectorizer on the overviews and transform them in one step:
# one row per movie, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(movies["overview"])

In [7]:
# Convert the matrix to a dense NumPy array so the raw values can be inspected:

tfidf_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.33333333, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.33333333, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.33333333,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.33333333, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.33333333,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [8]:
# Wrap the dense array in a DataFrame for a more readable table
# (rows are movies, columns are still just numbered positions):

pandas.DataFrame(tfidf_matrix.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,120
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.190097,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155883,0.0
2,0.237761,0.237761,0.237761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.237761,0.0,0.0,0.0,0.389934
3,0.0,0.0,0.0,0.0,0.270444,0.0,0.0,0.270444,0.0,0.0,...,0.0,0.270444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.13565,0.271301,0.0,0.13565,0.0,...,0.13565,0.0,0.0,0.0,0.13565,0.0,0.0,0.0,0.0,0.111235
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353458,...,0.0,0.0,0.0,0.176729,0.0,0.0,0.176729,0.176729,0.14492,0.0


In [10]:
# Label the columns with the vocabulary terms (feature names) instead of
# their numeric positions:

pandas.DataFrame(tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())

Unnamed: 0,26,abducted,accuser,adventure,adventures,assumes,attorney,awesomeness,bane,barsoom,...,terrorist,threats,toretto,transported,villainous,wanted,war,weary,world,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.190097,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.155883,0.0
2,0.237761,0.237761,0.237761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.237761,0.0,0.0,0.0,0.389934
3,0.0,0.0,0.0,0.0,0.270444,0.0,0.0,0.270444,0.0,0.0,...,0.0,0.270444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.13565,0.271301,0.0,0.13565,0.0,...,0.13565,0.0,0.0,0.0,0.13565,0.0,0.0,0.0,0.0,0.111235
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353458,...,0.0,0.0,0.0,0.176729,0.0,0.0,0.176729,0.176729,0.14492,0.0


## some explaining

The floats in the columns above are tf-idf values. They are calculated from the frequency of the word in that movie overview and also the rarity of the word across all the overviews in the dataframe. So a higher value means the word appears often in that overview and/or is rare in the other overviews. tf-idf stands for "term frequency–inverse document frequency" - see https://en.wikipedia.org/wiki/Tf%E2%80%93idf . Wiki says:  "A high weight in tf–idf is reached by a high term frequency (in the given document) and a low document frequency of the term in the whole collection of documents"

A simple way of putting it is that it represents, or attempts to calculate, the importance of a word for that document (a movie description in our case). For example: Furious 7 has a relatively low number of non-stop words in its overview (10, in fact, by my count), so any words it has that no other movies have will get a high coefficient - .333333 given our df. If it were to have one of those words twice in the overview, that word's coefficient would be even higher. If we were studying Paul's epistle to the Romans compared to all Roman-era Greek literature, there would likely be high tf-idf coefficients for words like "righteousness", "justification", "faith", etc.

# Similarity matrix

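To compare the overview vectors we will use a function called linear kernel, which takes the dot product of each pair of vectors. Because TfidfVectorizer scales every row to unit length by default, that dot product is the same as the cosine similarity of the two overviews. There are other similarity measures as well, but this one is pretty fast.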

Since we are doing a pairwise comparison, we give the linear_kernel function the same matrix twice, so we are crossing the matrix with itself.

In [11]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise comparison of the overviews: the same matrix is passed as both operands.
similarity_matrix = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [12]:
# Display the movie-by-movie similarity scores.
similarity_matrix

array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , 0.        , 0.        ,
        0.02259057],
       [0.        , 0.        , 1.        , 0.        , 0.0433744 ,
        0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.        ,
        0.03213867],
       [0.        , 0.        , 0.0433744 , 0.        , 1.        ,
        0.01612024],
       [0.        , 0.02259057, 0.        , 0.03213867, 0.01612024,
        1.        ]])

The matrix compares every movie overview to every other movie overview. It's a 2-D array with one row per movie. The first row (Furious 7) does not share any words with any other movie, so it has zeros... other than in the first slot, which is the movie compared with itself - so there is a 1 there. Let's look at the second movie though.

In [13]:
# Row 1: the second movie's similarity scores against every movie.
similarity_matrix[1]

array([0.        , 1.        , 0.        , 0.        , 0.        ,
       0.02259057])

In the second movie (Cars 2) we have some similarity with the sixth movie, so there must be word(s) in common. Notice though that there is no similarity between Cars 2 and Furious 7... which you might expect, since they're both about cars... but the overviews don't have any words in common, so we are limited by only taking into account one column - one group of data. For a better comparison we could also take into account the keywords column, joining it to the overview before creating our matrix/vector.

## find most similar movies

In [14]:
# Title of the movie we want recommendations for.
movie_title = "John Carter"

In [22]:
# Apply a boolean mask over the titles to the frame's index and keep the
# first matching label.
idx = movies.index[(movies["title"] == movie_title).to_numpy()][0]
idx

5

In [15]:
sorted(similarity_matrix[idx], reverse=True)  # idx is 5 for John Carter, so this sorts that movie's row, highest score first.

[1.0000000000000004,
 0.032138674066915646,
 0.022590565795326856,
 0.016120240648257757,
 0.0,
 0.0]

so we sorted the list by similarity...but now we lost track of which movie is which - we lost the indexing. So we must use the enumerate function.

In [24]:
# Pair every similarity score with its position in the row so the movie it
# belongs to is still known after sorting.
scores = [(position, score) for position, score in enumerate(similarity_matrix[idx])]
scores

[(0, 0.0),
 (1, 0.022590565795326856),
 (2, 0.0),
 (3, 0.032138674066915646),
 (4, 0.016120240648257757),
 (5, 1.0000000000000004)]

now that we have the indices we can sort the scores:

In [25]:
# No key given: tuples are compared element by element, so this orders by
# the first item of each tuple (the index), not by the score.
sorted(scores, reverse=True)

[(5, 1.0000000000000004),
 (4, 0.016120240648257757),
 (3, 0.032138674066915646),
 (2, 0.0),
 (1, 0.022590565795326856),
 (0, 0.0)]

BUT look - it sorted the tuples by the first value, the index value... that's not helpful; we want to sort by the second value, the similarity value.

So we will pass an argument to sorted() that makes it sort by the similarity score: key=lambda x: x[1]. The key argument takes a function that sorted() calls on each element to get the value to sort by; here that function is a lambda that returns the second item of each tuple. A lambda function is a one-line anonymous function.

In [None]:
# key picks the second item of each (index, score) tuple, so the list is
# ordered by similarity score, highest first.
sorted(scores, key=lambda x: x[1], reverse=True)

In [None]:
def similar_movies(movie_title, nr_movies):
    """Return the titles of the movies most similar to ``movie_title``.

    Packages the steps shown cell by cell in this notebook: find the movie's
    row, pair each similarity score with its position, sort by score, and
    keep the best ones. Uses the notebook-level ``movies`` DataFrame and
    ``similarity_matrix``; both must already exist and be in the same row
    order.

    Args:
        movie_title: Exact value of the "title" column to look up.
        nr_movies: Maximum number of recommendations to return.

    Returns:
        A list of up to ``nr_movies`` titles, most similar first. The
        selected movie itself is left out.

    Raises:
        ValueError: If no movie has the given title.
    """
    # Row *positions* (not index labels) of the matching titles, so the value
    # can be used directly against the rows of similarity_matrix.
    positions = (movies["title"] == movie_title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise ValueError(f"no movie titled {movie_title!r} found")
    idx = positions[0]

    # Keep the position next to each score so it survives the sort.
    scores = list(enumerate(similarity_matrix[idx]))
    ranked = sorted(scores, key=lambda x: x[1], reverse=True)

    # A movie always matches itself best; it is not a useful recommendation.
    best = [position for position, _ in ranked if position != idx]
    best = best[:max(nr_movies, 0)]

    return movies["title"].iloc[best].tolist()

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=77af85d6-9c30-46af-a4d4-97aefa89991e' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>