## Movie Recommender
Given a movie plot from the movie corpus, recommend the top-k most similar movies.

In [12]:
import re
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
def get_similarity_scores(search_vector, corpus_vectors):
    """Compute the cosine similarity between one vector and every corpus vector.

    Parameters
    ----------
    search_vector : array-like
        The query vector; it is flattened into a single row.
    corpus_vectors : sequence of array-like
        One vector per corpus entry (for example the ``.values`` of a
        DataFrame column that stores one array per row). Each entry is
        flattened into a row and must have as many elements as
        ``search_vector``.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per corpus entry, in corpus order. Empty
        when ``corpus_vectors`` is empty.
    """
    ### nothing to compare against: np.vstack would raise on an empty sequence
    if len(corpus_vectors) == 0:
        return np.array([])

    ### cosine_similarity expects 2D arrays of shape (n_samples, n_features)
    search_matrix = np.asarray(search_vector).reshape(1, -1)
    corpus_matrix = np.vstack(
        [np.asarray(vector).reshape(1, -1) for vector in corpus_vectors]
    )

    ### a single call scores the whole corpus instead of one call per row;
    ### the result has shape (1, n_corpus), so keep only its single row
    return cosine_similarity(search_matrix, corpus_matrix)[0]


In [14]:
### TEST CODE for get_similarity_scores - with small sample data

# from sklearn.feature_extraction.text import TfidfVectorizer

# corpus_text = ["Sun is out and its nice",
#                "Sun is bright and I feel so good", 
#                "Earth is the only planet to sustain life",
#                "Sun is shining and I feed good", 
# ]
# search_text = "Sun is bright and I feel so good"

# v = TfidfVectorizer(stop_words="english")
# corpus_tfidf_vectors = v.fit_transform(corpus_text).toarray()
# df_test = pd.DataFrame({"plot":corpus_text, "tf_idf":list(corpus_tfidf_vectors)})
# display(df_test.head())
# df_test["tf_idf"][0]

# test_search_vector = df_test.loc[3, "tf_idf"]
# test_corpus_vectors = df_test["tf_idf"].values
# test_scores = get_similarity_scores(search_vector=test_search_vector, 
#                                     corpus_vectors=test_corpus_vectors)
# test_scores

In [15]:
def get_similar_movies(movie_index, df, k=3, use_embed='sbert'):
  """Return the `k` movies most similar to the movie at `movie_index`.

  Parameters
  ----------
  movie_index : index label
      Row label (looked up with ``df.loc``) of the movie to search for.
  df : pandas.DataFrame
      Movie corpus holding one embedding vector per row in the `use_embed`
      column.
  k : int, default 3
      Number of matches to return in addition to the searched movie.
  use_embed : str, default 'sbert'
      Name of the embedding column to compare, e.g. "tfidf" or "sbert".

  Returns
  -------
  pandas.DataFrame
      Copy of up to ``k + 1`` rows of `df`, ordered by descending similarity,
      with an added "score" column. The "index" column is dropped when present.
  """
  ### Get the embedding vectors (column `use_embed`) for the given movie index
  ### and for the whole corpus
  search_vector = df.loc[movie_index, use_embed]
  corpus_vectors = df[use_embed].values

  ### Calculate the similarity score for given movie against whole corpus of movies
  scores = get_similarity_scores(search_vector, corpus_vectors)

  ### Sort the scores in descending order and grab the sorted positions.
  ### The searched movie is part of the corpus, so it is expected to come first
  ### (rows with an identical vector tie with it and may be ordered ahead of it).
  sorted_idx = np.flip(scores.argsort())[:k+1]

  ### errors="ignore": frames without an "index" column (e.g. small test frames)
  ### must not raise a KeyError here
  df_matches = df.iloc[sorted_idx].copy().drop(columns="index", errors="ignore")
  df_matches["score"] = scores[sorted_idx]
  return df_matches

In [16]:
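### TEST CODE for get_similar_movies logic - with small sample data
### NOTE: stale scratch code - get_similar_movies returns a DataFrame of matches,
### not an array of scores, so the lines below need rewriting before being re-enabled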

# scores = get_similar_movies(movie_index=1, df=df_test, k=4)
# ### sort the scores in descending order and grab the indices
# scores = np.array(scores)
# sorted_idx = np.flip(scores.argsort())

# ### Return top k indices 
# k = 4
# print(scores)
# print(sorted_idx)
# print(scores[sorted_idx])
# display(df_test)
# df_test.iloc[sorted_idx[0:k+1]]

### Find similar movies using TF-IDF and SBERT embeddings

In [17]:
### Load the movie data with precomputed embeddings from the parquet artifact.
### `df` is the notebook-level corpus: `search_by_title` reads it and the test
### cells below pass it to `get_similar_movies`.
df = pd.read_parquet("../artifacts/movie_plots_embeddings.parquet")
### Show the loaded frame as the cell output
df

Unnamed: 0,index,url,title,plot,tfidf,sbert
0,0,https://en.wikipedia.org/wiki/White_Noise_(200...,White Noise (2005 film),Jonathan Rivers is an architect and lives with...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.07947263, 0.03986683, 0.015421576, 0.00145..."
1,1,https://en.wikipedia.org/wiki/Coach_Carter,Coach Carter,"Ken Carter lives in Richmond, California. He b...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.035826106, 0.05858802, 0.009567295, -0.051..."
2,2,https://en.wikipedia.org/wiki/Elektra_(2005_film),Elektra (2005 film),"After being killed,[a] Elektra Natchios is rev...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.020833697, 0.029291159, -0.04200214, 0.009..."
3,3,https://en.wikipedia.org/wiki/Racing_Stripes,Racing Stripes,"During a thunderstorm, a traveling circus, Cir...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.10842049, -0.0009454515, -0.026789859, -0...."
4,4,https://en.wikipedia.org/wiki/Tom_and_Jerry:_B...,Tom and Jerry: Blast Off to Mars,Tom chases Jerry as usual from their house and...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.033220917, -0.0066618803, 0.015374336, -0...."
...,...,...,...,...,...,...
3503,4038,https://en.wikipedia.org/wiki/Whitney_Houston:...,Whitney Houston: I Wanna Dance with Somebody,"In 1983, 19-year-old Whitney Houston is being ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.015936706, -0.045859165, 0.038560156, -0.0..."
3504,4039,https://en.wikipedia.org/wiki/The_Pale_Blue_Eye,The Pale Blue Eye,"In October 1830, alcoholic retired detective A...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.054327104, -0.025792062, -0.04702887, 0.00..."
3505,4040,https://en.wikipedia.org/wiki/Women_Talking_(f...,Women Talking (film),"A young woman sleeps alone, in bed. There are ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.022267314, 0.009472288, -0.030728644, 0.055..."
3506,4041,https://en.wikipedia.org/wiki/A_Man_Called_Otto,A Man Called Otto,"Otto Anderson is a 63-year-old widower, living...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0014724878, 0.071103156, -0.013785284, 0.0..."


In [35]:
def search_by_title(search_term, movies=None, regex=True):
  """Search the corpus for movies whose title contains `search_term`.

  The match is case-insensitive. Rows with a missing title never match.

  Parameters
  ----------
  search_term : str
      Text to look for in the "title" column.
  movies : pandas.DataFrame, optional
      Frame with a "title" column to search. Defaults to the notebook-level
      `df` so existing ``search_by_title("...")`` calls keep working.
  regex : bool, default True
      When True, `search_term` is treated as a regular expression. Pass False
      to match it literally, e.g. for titles containing "(" or ".".

  Returns
  -------
  tuple[list, list]
      The index labels and the titles of the matching rows, in corpus order.
  """
  ### Fall back to the notebook-level corpus only when no frame is passed in
  corpus = df if movies is None else movies
  titles = corpus["title"]

  ### na=False keeps the mask strictly boolean: a missing title would otherwise
  ### put NaN in the mask and make the boolean indexing below fail
  if regex:
    matches = titles.str.contains(search_term, flags=re.IGNORECASE, regex=True, na=False)
  else:
    ### `flags` only applies to regex patterns; literal matching uses `case`
    matches = titles.str.contains(search_term, case=False, regex=False, na=False)

  df_filtered = corpus[matches]
  return df_filtered.index.tolist(), df_filtered["title"].tolist()

In [19]:
### Look up the row indices and titles of all movies whose title contains
### "batman" (case-insensitive); the indices feed `get_similar_movies` below
search_by_title("batman")

([66, 130, 1973, 2066, 3278],
 ['Batman Begins',
  'The Batman vs. Dracula',
  'Batman: The Killing Joke (film)',
  'The Lego Batman Movie',
  'The Batman (film)'])

In [20]:
### TEST CODE - TF-IDF

### Find similar movies to "Batman Begins" (index = 66) by comparing the
### vectors stored in the "tfidf" column; k=7 returns the movie itself plus 7 matches
df_matches = get_similar_movies(movie_index=66, df=df, k=7, use_embed="tfidf")
### Show only the columns needed to judge the matches
df_matches[["title", "score", "url"]]

Unnamed: 0,title,score,url
66,Batman Begins,1.0,https://en.wikipedia.org/wiki/Batman_Begins
3278,The Batman (film),0.476559,https://en.wikipedia.org/wiki/The_Batman_(film)
2066,The Lego Batman Movie,0.425181,https://en.wikipedia.org/wiki/The_Lego_Batman_...
685,The Dark Knight,0.403091,https://en.wikipedia.org/wiki/The_Dark_Knight
130,The Batman vs. Dracula,0.359107,https://en.wikipedia.org/wiki/The_Batman_vs._D...
1973,Batman: The Killing Joke (film),0.29197,https://en.wikipedia.org/wiki/Batman:_The_Kill...
1889,I Smile Back,0.239213,https://en.wikipedia.org/wiki/I_Smile_Back
1291,A Dark Truth,0.198412,https://en.wikipedia.org/wiki/A_Dark_Truth


In [21]:
### TEST CODE - SBERT

### Find similar movies to "Batman Begins" (index = 66) by comparing the
### vectors stored in the "sbert" column; k=7 returns the movie itself plus 7 matches
df_matches = get_similar_movies(movie_index=66, df=df, k=7, use_embed="sbert")
### Show the full match rows, including the added "score" column
df_matches

Unnamed: 0,url,title,plot,tfidf,sbert,score
66,https://en.wikipedia.org/wiki/Batman_Begins,Batman Begins,"In Gotham City, a young Bruce Wayne falls down...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.03008927, 0.03546111, -0.108691104, 0.0958...",1.0
2066,https://en.wikipedia.org/wiki/The_Lego_Batman_...,The Lego Batman Movie,Within the DC superhero dimension of the Lego ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.037649337, -0.0074571595, -0.048542943, 0....",0.654825
685,https://en.wikipedia.org/wiki/The_Dark_Knight,The Dark Knight,A gang of masked criminals robs a mafia-owned ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.043647073, -0.004039985, -0.04245383, -0.0...",0.631577
3278,https://en.wikipedia.org/wiki/The_Batman_(film),The Batman (film),"On Halloween, Gotham City mayor Don Mitchell J...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.055248376, 0.013966305, -0.084762715, 0.01...",0.599469
1973,https://en.wikipedia.org/wiki/Batman:_The_Kill...,Batman: The Killing Joke (film),"In Gotham City, Batman investigates a crime sc...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0379011, -0.011636484, -0.041242454, -0.02...",0.552321
130,https://en.wikipedia.org/wiki/The_Batman_vs._D...,The Batman vs. Dracula,"The Joker and the Penguin break out of Arkham,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.009792135, 0.017339878, -0.0689215, 0.08963...",0.523882
2738,https://en.wikipedia.org/wiki/Birds_of_Prey_(2...,Birds of Prey (2020 film),Four years after the defeat of the Enchantress...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.05983926, -0.022711568, -0.0588209, -0.041...",0.47768
1227,https://en.wikipedia.org/wiki/Setup_(2011_film),Setup (2011 film),"In Detroit three friends, Sonny (50 Cent), Dav...","[0.0, 0.015891056698329152, 0.0, 0.0, 0.0, 0.0...","[-0.06026122, 0.023599347, -0.080508195, -0.02...",0.460975
