Starting with importing some modules

In [1]:
import gc
import os
from ast import literal_eval

import numpy as np
import pandas as pd
import psycopg2
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer

import warnings; warnings.simplefilter('ignore')

  from tqdm.autonotebook import tqdm, trange


Reading data for movies metadata and associated keywords

In [2]:
# Read both raw CSV exports: the per-movie metadata and the per-movie keyword lists.
metadata, keywords = (
    pd.read_csv(csv_path)
    for csv_path in ("data/movies_metadata.csv", "data/keywords.csv")
)

Now I am going to filter out low-rated movies from my dataset. It would not make sense to recommend a movie that is simply not good, even if its similarity score is high.

We will use the IMDb weighted-rating formula to score each movie:

$$ 
Weighted\ Rating\ (WR) =  \left(\frac{v}{v+m}\right) R + \left(\frac{m}{v+m}\right) C
$$

where,

- v is the number of votes for the movie
- m is the minimum votes required to be listed in the chart
- R is the average rating of the movie
- C is the mean vote across the whole report

In [3]:
# Keep only movies that have an average rating. The .copy() makes the result an
# independent frame, so the later `metadata["WR"] = ...` assignment does not
# write into a slice of the original frame.
metadata = metadata[metadata["vote_average"].notna()].copy()

# v: number of votes for each movie (one value per row).
v = metadata["vote_count"]
# C: mean rating over all remaining movies.
C = metadata["vote_average"].mean()

vote_counts = metadata["vote_count"].astype(int)

# m: minimum number of votes, taken as the 75th percentile of the vote counts.
m = vote_counts.quantile(0.75)

# Filtering on m is currently disabled; every rated movie is kept.
# metadata = metadata[metadata["vote_count"]>m]

# The first value printed is C (the mean rating), so it is labelled "C".
print("C: ", C, "\nm: ", m)

v:  5.618207215134184 
m:  34.0


In [4]:
# Weighted rating: each movie's own average is weighted by v/(v+m) and the
# global mean C by m/(v+m).
own_weight = v / (v + m)
mean_weight = m / (v + m)
WR = own_weight * metadata["vote_average"] + mean_weight * C
metadata["WR"] = WR

Applying two data transformations,

- stemming keywords before concatenating them
- dropping rows from keywords dataset if keywords are not present

In [5]:
# Single Porter stemmer instance; getkeys() below reads it as a global.
ps = PorterStemmer()

def stemmer(ps_instance, word):
    """Return whatever `ps_instance.stem` produces for `word`."""
    stem_of = ps_instance.stem
    return stem_of(word)

def getkeys(X):
    """Join the stemmed "name" of every keyword dict in X with commas.

    X is an iterable of dicts that each carry a "name" entry. Each name is
    stemmed with the module-level `ps` stemmer via `stemmer`. An empty list
    yields the empty string.
    """
    return ",".join(stemmer(ps, entry["name"]) for entry in X)

# Parse each stringified list of keyword dicts, then collapse it into one
# comma-separated string of stemmed keyword names.
keywords["keywords"] = keywords["keywords"].astype(str).apply(literal_eval)
keywords["keywords"] = keywords["keywords"].apply(getkeys)

# Drop rows whose keyword string is empty. The .copy() makes the filtered frame
# independent, so the "id" assignment below does not write into a slice of the
# unfiltered frame (a chained assignment that pandas would warn about).
keywords = keywords[keywords["keywords"] != ""].copy()
# ids become strings so they compare equal to the string ids used in the merge.
keywords["id"] = keywords["id"].astype(str)

Now I will work on the movies_metadata to only retain required columns

In [6]:
# Keep only the columns used downstream and cast "id" to str in the same step.
# .assign returns a new frame, so nothing is written into a slice of `metadata`.
movies = metadata[
    ["imdb_id", "id", "genres", "original_language", "original_title",
     "overview", "tagline", "title", "WR"]
].assign(id=lambda frame: frame["id"].astype(str))
# Left join: every movie is kept; movies without a keywords row get NaN there.
movies = pd.merge(movies, keywords, on='id', how='left')

# The source frames are no longer needed once merged.
del metadata
del keywords

In [7]:
def getGenres(X):
    """Return the lower-cased "name" of every genre dict in X, in order."""
    return [entry["name"].lower() for entry in X]

# Parse each stringified genre list and reduce it to lower-case genre names.
movies["genres"] = movies["genres"].map(lambda raw: getGenres(literal_eval(raw)))

# Keep only movies that have an overview, then renumber the rows from 0.
movies = movies.dropna(subset=["overview"]).reset_index(drop=True)

I applied one-hot encoding to the genres. I found that there are 21 genres in total in my movies dataset.

In [8]:
# One-hot encode the genre lists: one 0/1 column per genre label found by the binarizer.
mlb = MultiLabelBinarizer()
genres_one_hot = pd.DataFrame(
    mlb.fit_transform(movies["genres"]),
    index=movies.index,
    columns=mlb.classes_,
)
genres_encoded = pd.concat([movies['id'], genres_one_hot], axis=1)

# Fixed column order of the per-movie genre vector.
genre_columns = [
    'action', 'adventure', 'animation', 'comedy', 'crime',
    'documentary', 'drama', 'family', 'fantasy', 'foreign', 'history',
    'horror', 'music', 'mystery', 'romance', 'science fiction', 'thriller',
    'tv movie', 'war', 'western',
]
# Collapse the selected 0/1 columns of each row into a single list.
movies["embd_genres"] = genres_encoded[genre_columns].apply(list, axis=1)

embeddings = pd.concat([movies[col] for col in ("id", "embd_genres")], axis=1)

Now my data is ready! I am going to insert all embeddings into a PostgreSQL database, where I have enabled vector storage by installing the pgvector extension.

In [11]:
# Sentence-embedding model used by encode() for the overview, tagline and keyword text.
model = SentenceTransformer("all-MiniLM-L6-v2")

def encode(X):
    """Embed X with the module-level `model`, falling back to the empty string.

    Missing values (None, or the float NaN that pandas stores for an absent
    tagline or keyword string) are embedded as "" so the caller always gets a
    vector back. If encoding any other input fails, "" is embedded instead as
    a best-effort fallback.
    """
    # `X != X` is only true for NaN.
    if X is None or (isinstance(X, float) and X != X):
        return model.encode("")
    try:
        return model.encode(X)
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit can still stop a long-running encoding loop.
        return model.encode("")
    
# docker inspect -f '{{range.NetworkSettings.Networks}}{{.IPAddress}}{{end}}' pgvector-container

# The password comes from the PGPASSWORD environment variable so that no
# credential is stored in the notebook. If the variable is unset, None is
# passed and psycopg2 leaves the password out of the connection string.
conn = psycopg2.connect(
    database="projects",
    user="postgres",
    password=os.environ.get("PGPASSWORD"),
    host="127.0.0.1",
    port=5432,
)

cursor = conn.cursor()

# Number of rows between two intermediate commits.
COMMIT_EVERY = 1000

INSERT_EMBEDDINGS_SQL = """INSERT INTO movies_recommend.embeddings (id,
                        embd_overview,
                        embd_tagline,
                        embd_keywords,
                        embd_genres) 
                        VALUES (%s, %s, %s, %s,%s)"""

INSERT_MOVIE_SQL = """INSERT INTO movies_recommend.movies(id,
                        title,
                        weighted_rating)
                        VALUES (%s, %s, %s)"""


def _to_vector_literal(values):
    """Format an iterable of numbers as a bracketed, comma-separated string such as "[0.1,0.2]"."""
    return "[{}]".format(",".join(map(str, values)))


# itertuples yields one namedtuple per movie, replacing repeated iloc[row][col] lookups.
for row, movie in enumerate(movies.itertuples(index=False)):

    movie_id = movie.id
    title = str(movie.title)
    wr = movie.WR

    embd_overview_str = _to_vector_literal(encode(movie.overview))
    embd_tagline_str = _to_vector_literal(encode(movie.tagline))
    embd_keywords_str = _to_vector_literal(encode(movie.keywords))
    embd_genres_str = _to_vector_literal(movie.embd_genres)

    # A savepoint confines a failed insert to this one movie. A plain
    # conn.rollback() would also discard every successfully inserted row
    # since the last commit.
    cursor.execute("SAVEPOINT movie_row")
    try:
        cursor.execute(
            INSERT_EMBEDDINGS_SQL,
            (movie_id, embd_overview_str, embd_tagline_str, embd_keywords_str, embd_genres_str),
        )
        cursor.execute(INSERT_MOVIE_SQL, (movie_id, title, str(wr)))
    except psycopg2.Error as e:
        print(f"Error inserting row {row}: {e}")
        cursor.execute("ROLLBACK TO SAVEPOINT movie_row")
    else:
        cursor.execute("RELEASE SAVEPOINT movie_row")

    if row % COMMIT_EVERY == 0:
        conn.commit()

# Commit the rows inserted after the last intermediate commit; without this
# the tail of the table would never be persisted.
conn.commit()

: 

In [202]:
# select base_movie, id, title, 1-cosine_distance as cosine_similarity
# from (
# select b.title as base_movie, a.id, a.title, a.embd_overview <=> b.embd_overview as cosine_distance
# from movies_recommend.embeddings a
# left join movies_recommend.embeddings b
# on 1=1
# and b.id = 216015
# )
# order by cosine_similarity desc
# limit 50;