In [1]:
import time
import torch
import joblib
import psycopg2
import numpy as np
import pandas as pd
from fuzzywuzzy import process
from collections import Counter
from scipy.sparse import csr_matrix
from tqdm.autonotebook import tqdm, trange
from psycopg2.extras import execute_values
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer


conn = psycopg2.connect(
    dbname="mov_db_1",
    user="bulldogg",
    password="21101314",
    host="localhost",
    port="5433"
)

  from tqdm.autonotebook import tqdm, trange


In [2]:

conn.close()

In [2]:
print("Started loading data")
start_load = time.time()
Q = np.load('data/Q.npy')
Q_b = np.load('bayesian_data/Q_b.npy')

movie_mapper = joblib.load('data/movie_mapper.joblib')
movie_inv_mapper = joblib.load('data/movie_inv_mapper.joblib')

movie_mapper_b = joblib.load('bayesian_data/movie_mapper_b.joblib')
user_inv_mapper_b = joblib.load('bayesian_data/user_inv_mapper_b.joblib')
movie_inv_mapper_b = joblib.load('bayesian_data/movie_inv_mapper_b.joblib')

titles = pd.read_csv('upgraded_movielens_latest/titles.csv')
ratings = pd.read_csv('upgraded_movielens_latest/filtered_ratings.csv')
tags = pd.read_csv('upgraded_movielens_latest/upgraded_movies.csv')
end_load = time.time()
print("Completed loading data")
print(f'Total Data Loadung Time : {end_load-start_load}')

Started loading data
Completed loading data
Total Data Loadung Time : 8.176977157592773


In [3]:
print(Q.shape)
print(Q_b.shape)

(82129, 300)
(82129, 300)


In [4]:
titles.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [5]:
tags.head()

Unnamed: 0,movieId,tag
0,1.0,"overview: Led by Woody, Andy's toys live happi..."
1,2.0,overview: When siblings Judy and Peter discove...
2,3.0,overview: A family wedding reignites the ancie...
3,4.0,"overview: Cheated on, mistreated and stepped o..."
4,5.0,overview: Just when George Banks has recovered...


In [6]:
movie_id = 3  # Replace with the desired movieId
tag = tags.loc[tags['movieId'] == movie_id, 'tag'].values[0]
print(tag)

overview: A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max. tagline: Still Yelling. Still Fighting. Still Ready for Love. keywords: fishing sequel old man best friend wedding italian restaurant old friends duringcreditsstinger pranks casts: MaxGoldman JohnGustafson ArielGustafson MariaSophiaColettaRagetti MelanieGustafson director: HowardDeutch vote_average: 6.478 vote_count: 365 popularity: 30.376 Genres : Comedy Romance


In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [8]:
unique_movie_ids_rat = ratings['movieId'].unique().tolist()
print(len(unique_movie_ids_rat))

82129


In [9]:
unique_movie_ids_tag = tags['movieId'].unique().tolist()
print(len(unique_movie_ids_tag))

85407


In [10]:
unique_movie_ids_rat = ratings['movieId'].unique()

tags_filtered = tags[tags['movieId'].isin(unique_movie_ids_rat)]

tags = tags_filtered

print(len(tags))

82129


In [11]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device")
else:
    device = torch.device("cpu")
    print("MPS device not found, using CPU")

model = SentenceTransformer('all-mpnet-base-v2')
model = model.to(device)


Using MPS device


In [12]:
sentences = "This is an example sentence"

with torch.no_grad():
    embeddings = model.encode(sentences, convert_to_tensor=True)
    embeddings = embeddings.cpu().numpy() 

In [13]:
print(embeddings.shape)
print(type(embeddings))

(768,)
<class 'numpy.ndarray'>


In [14]:
def id_to_all(id):
    movie_indices = movie_mapper[id]
    usr_vec = Q_b[movie_indices]
    title = titles.loc[titles['movieId'] == id, 'title'].values[0]
    tag = tags.loc[tags['movieId'] == id, 'tag'].values[0]
    with torch.no_grad():
        embeddings = model.encode(tag, convert_to_tensor=True)
        embeddings = embeddings.cpu().numpy()
    return movie_indices, usr_vec, title, tag, embeddings

In [15]:
# cur = conn.cursor()

# cur.execute("""
# CREATE TABLE IF NOT EXISTS movies (
#     movieId INTEGER PRIMARY KEY,
#     movie_indices INTEGER,
#     usr_vec VECTOR(300),
#     title TEXT,
#     tag TEXT,
#     embeddings VECTOR(768)
# );
# """)

In [16]:
cur = conn.cursor()

insert_query = """
INSERT INTO movies (movieId, movie_indices, usr_vec, title, tag, embeddings)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (movieId) DO UPDATE SET
    movie_indices = EXCLUDED.movie_indices,
    usr_vec = EXCLUDED.usr_vec,
    title = EXCLUDED.title,
    tag = EXCLUDED.tag,
    embeddings = EXCLUDED.embeddings
"""

cur.execute("""
CREATE TABLE IF NOT EXISTS movies (
    movieId INTEGER PRIMARY KEY,
    movie_indices INTEGER,
    usr_vec VECTOR(300),
    title TEXT,
    tag TEXT,
    embeddings VECTOR(768)
);
""")
    

In [17]:
for i in tqdm(unique_movie_ids_rat):
    movie_indices, usr_vec, title, tag, embeddings = id_to_all(i)
    movie_id = int(i) if isinstance(i, np.integer) else i
    movie_indices = int(movie_indices) if isinstance(movie_indices, np.integer) else movie_indices
    usr_vec_str = ','.join(map(str, usr_vec))
    embeddings_str = ','.join(map(str, embeddings))
    usr_vec_str = "["+usr_vec_str+"]"
    embeddings_str = "["+embeddings_str+"]"

    try:
        cur.execute(insert_query, (movie_id, movie_indices, usr_vec_str, title, tag, embeddings_str))
        conn.commit()
    except Exception as e:
        print(f"Error inserting movie_id {movie_id}: {str(e)}")
        conn.rollback()
        
cur.close()
conn.close()

print("Data insertion completed.")

  0%|          | 0/82129 [00:00<?, ?it/s]

Data insertion completed.


In [30]:
conn = psycopg2.connect(
    dbname="mov_db_1",
    user="bulldogg",
    password="21101314",
    host="localhost",
    port="5433"
)
cur = conn.cursor()
cur.execute("""
CREATE INDEX ON movies USING ivfflat (embeddings vector_cosine_ops);
""")

cur.execute("""
CREATE INDEX ON movies USING ivfflat (usr_vec vector_cosine_ops);
""")

In [82]:
def get_content_based_recommendation(movie_id, k):
    embeddings_query = "SELECT embeddings FROM movies WHERE movieId = %s;"
    cur.execute(embeddings_query, (movie_id,))
    result = cur.fetchone()

    if result:
        embedding = result[0]
    else:
        print(f"No embedding found for movieId {movie_id}")

    knn_query = """
    SELECT movieId, title, embeddings <-> %s::vector AS distance
    FROM movies
    WHERE movieId != %s  -- Exclude the query movie itself
    ORDER BY embeddings <-> %s::vector
    LIMIT %s;
    """

    cur.execute(knn_query, (embedding, movie_id, embedding, k))
    results = cur.fetchall()
    return results


def get_collaborative_recommendation(movie_id, k):
    usr_vec_query = "SELECT usr_vec, title FROM movies WHERE movieId = %s;"
    cur.execute(usr_vec_query, (movie_id,))
    result = cur.fetchone()
    title = result[1]
    if result:
        usr_vec = result[0]
    else:
        print(f"No usr_vec found for movieId {movie_id}")

    query = """
    SELECT movieId, title, usr_vec <-> %s::vector AS distance
    FROM movies
    WHERE movieId != %s  -- Exclude the query movie itself
    ORDER BY usr_vec <-> %s::vector
    LIMIT %s;
    """

    cur.execute(query, (usr_vec, movie_id, usr_vec, k))
    results = cur.fetchall()
    return results, title

def min_max_normalize(distances):
    min_dist = min(distances)
    max_dist = max(distances)
    if min_dist == max_dist:
        return [1.0 for _ in distances]  # All distances are the same
    return [(d - min_dist) / (max_dist - min_dist) for d in distances]

def merge_recommendations(content_recs, collab_recs, k):
    content_distances = [rec[2] for rec in content_recs]
    collab_distances = [rec[2] for rec in collab_recs]
    
    norm_content_distances = min_max_normalize(content_distances)
    norm_collab_distances = min_max_normalize(collab_distances)
    
    content_recs_norm = [(rec[0], rec[1], norm_dist) for rec, norm_dist in zip(content_recs, norm_content_distances)]
    collab_recs_norm = [(rec[0], rec[1], norm_dist) for rec, norm_dist in zip(collab_recs, norm_collab_distances)]
    for i in content_recs_norm:
        print(i)
    rec_dict = {}
    for rec in content_recs_norm + collab_recs_norm:
        movie_id, title, distance = rec
        if movie_id in rec_dict:
            rec_dict[movie_id]['distance_sum'] += distance
            rec_dict[movie_id]['count'] += 1
            if rec in content_recs_norm:
                rec_dict[movie_id]['content_distance'] = distance
            if rec in collab_recs_norm:
                rec_dict[movie_id]['collaborative_distance'] = distance
        else:
            rec_dict[movie_id] = {
                'title': title, 
                'distance_sum': distance, 
                'count': 1,
                'content_distance': distance if rec in content_recs_norm else None,
                'collaborative_distance': distance if rec in collab_recs_norm else None
            }

    merged_recs = []
    for movie_id, data in rec_dict.items():
        avg_distance = data['distance_sum'] / data['count']
        merged_recs.append({
            'movie_id': movie_id,
            'title': data['title'],
            'avg_distance': avg_distance,
            'count': data['count'],
            'content_distance': data['content_distance'],
            'collaborative_distance': data['collaborative_distance']
        })
    merged_recs.sort(key=lambda x: (-x['count'], x['avg_distance']))

    return merged_recs[:k]

def get_hybrid_recommendations(id, k):
    content_recs = get_content_based_recommendation(movie_id=id, k=k)
    collab_recs, title = get_collaborative_recommendation(movie_id=id, k=k)
    merged_recs = merge_recommendations(content_recs, collab_recs, k)
    return merged_recs, title

recs, title = get_hybrid_recommendations(1, 20)


(3114, 'Toy Story 2 (1999)', 0.0)
(78499, 'Toy Story 3 (2010)', 0.26176473617419777)
(201588, 'Toy Story 4 (2019)', 0.5059667623297233)
(115875, 'Toy Story Toons: Hawaiian Vacation (2011)', 0.7457663329631926)
(120474, 'Toy Story That Time Forgot (2014)', 0.836783951341208)
(115879, 'Toy Story Toons: Small Fry (2011)', 0.857455786289913)
(282863, 'Minions & More 1 (2022)', 0.8671423388345981)
(119456, "Bugs Bunny's Looney Christmas Tales (1979)", 0.8903141083469309)
(131130, 'Tom and Jerry: A Nutcracker Tale (2007)', 0.8937138841641549)
(263819, 'My Friends Tigger & Pooh: Friendly Tails (2008)', 0.9001606804175745)
(208112, 'Rudolph the Red-Nosed Reindeer & the Island of Misfit Toys (2001)', 0.9108474576456742)
(5765, 'Looney, Looney, Looney Bugs Bunny Movie, The (1981)', 0.9108618724743925)
(278688, "Yogi Bear's All-Star Comedy Christmas Caper (1982)", 0.9208856067579309)
(176299, 'Tom and Jerry: Willy Wonka and the Chocolate Factory (2017)', 0.9546905332414606)
(198331, "Buddy's Day 

In [83]:
print(f"Because you watched {title}")
for i, recs in enumerate(recs):
    print(f"Recommendation {i+1} ==> {recs}")

Because you watched Toy Story (1995)
Recommendation 1 ==> {'movie_id': 3114, 'title': 'Toy Story 2 (1999)', 'avg_distance': 0.052727205255106796, 'count': 2, 'content_distance': 0.0, 'collaborative_distance': 0.10545441051021359}
Recommendation 2 ==> {'movie_id': 78499, 'title': 'Toy Story 3 (2010)', 'avg_distance': 0.5926946700664113, 'count': 2, 'content_distance': 0.26176473617419777, 'collaborative_distance': 0.9236246039586248}
Recommendation 3 ==> {'movie_id': 736, 'title': 'Twister (1996)', 'avg_distance': 0.0, 'count': 1, 'content_distance': None, 'collaborative_distance': 0.0}
Recommendation 4 ==> {'movie_id': 780, 'title': 'Independence Day (a.k.a. ID4) (1996)', 'avg_distance': 0.17601200134544315, 'count': 1, 'content_distance': None, 'collaborative_distance': 0.17601200134544315}
Recommendation 5 ==> {'movie_id': 648, 'title': 'Mission: Impossible (1996)', 'avg_distance': 0.38699681117745244, 'count': 1, 'content_distance': None, 'collaborative_distance': 0.3869968111774524

In [None]:

# def merge_recommendations(content_recs, collab_recs, k):
#     combined_recs = content_recs + collab_recs
#     counts = Counter([rec[0] for rec in combined_recs])
#     rec_dict = {}
#     for rec in combined_recs:
#         movie_id, title, distance = rec
#         if movie_id in rec_dict:
#             rec_dict[movie_id]['distance_sum'] += distance
#             rec_dict[movie_id]['count'] += 1
#             if 'content_distance' in rec_dict[movie_id]:
#                 rec_dict[movie_id]['collaborative_distance'] = distance
#             else:
#                 rec_dict[movie_id]['content_distance'] = distance
#         else:
#             rec_dict[movie_id] = {
#                 'title': title, 
#                 'distance_sum': distance, 
#                 'count': 1,
#                 'content_distance': distance if rec in content_recs else None,
#                 'collaborative_distance': distance if rec in collab_recs else None
#             }

#     merged_recs = []
#     for movie_id, data in rec_dict.items():
#         avg_distance = data['distance_sum'] / data['count']
#         merged_recs.append({
#             'movie_id': movie_id,
#             'title': data['title'],
#             'avg_distance': avg_distance,
#             'count': data['count'],
#             'content_distance': data['content_distance'],
#             'collaborative_distance': data['collaborative_distance']
#         })
#     merged_recs.sort(key=lambda x: (-x['count'], x['avg_distance']))

#     return merged_recs

In [None]:
# cur = conn.cursor()

# cur.execute("""
# CREATE TABLE IF NOT EXISTS movies (
#     movieId INTEGER PRIMARY KEY,
#     movie_indices INTEGER,
#     usr_vec VECTOR(300),
#     title TEXT,
#     tag TEXT,
#     embeddings VECTOR(768)
# );
# """)

# cur.execute("""
# INSERT INTO items (embedding) VALUES 
#     ('[0.02, 0.02, 0.03]'),
#     ('[0.06, 0.05, 0.06]'),
#     ('[0.08, 0.08, 0.09]');
# """)

# conn.commit()

# cur.execute("SELECT * FROM items;")
# rows = cur.fetchall()

# for row in rows:
#     print(f"ID: {row[0]}, Embedding: {row[1]}")

# cur.execute("DROP TABLE IF EXISTS items;")

# cur.close()
# conn.close()