In [3]:
import pandas as pd
import numpy as np
from numpy.linalg import norm

from transformers import DistilBertTokenizer, DistilBertModel
import torch
from tqdm import tqdm

from langchain_community.vectorstores.faiss import FAISS
from langchain.embeddings.base import Embeddings
from langchain.schema import Document

In [8]:
# Prefix prepended to data file locations used later in the notebook.
# NOTE(review): presumably the repository root relative to this notebook — confirm.
rootpath = "../"

In [2]:
# Load the pretrained DistilBERT tokenizer and encoder once at module level;
# MyEmbeddings reads both as globals. Both come from the same checkpoint name
# so they cannot drift apart.
_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(_checkpoint)
model = DistilBertModel.from_pretrained(_checkpoint)



In [4]:
class MyEmbeddings(Embeddings):
    """Embeddings backend that encodes text with the module-level DistilBERT model.

    Each input string is treated as several fields joined by the literal
    ``<CONCAT>`` marker. Every field position is encoded separately, pooled to a
    single vector per text, and the per-field vectors are concatenated, so the
    output width is (number of fields) x (model hidden size).

    Relies on the module-level ``tokenizer`` and ``model`` globals.
    """

    def embed_documents(self, docs):
        """Embed a batch of ``<CONCAT>``-joined texts.

        Args:
            docs: iterable of strings; fields inside each string are separated
                by ``<CONCAT>``. Texts with fewer fields than the longest text
                in the call are padded with empty-string fields.

        Returns:
            A 2-D ``torch`` tensor with one row per input text. The number of
            columns depends on the largest field count seen in *this call*, so
            a text with fewer fields embedded on its own yields a narrower
            vector than the same text embedded alongside longer ones.
            An empty ``docs`` yields an empty ``(0, 0)`` tensor.

        Notes:
            Pooling is an attention-mask-weighted mean, so padding tokens do
            not contribute and a text's vector no longer depends on the other
            texts in its batch. Vectors produced by a plain mean over all
            positions (padding included) will differ for padded batches; an
            index built that way should be rebuilt for exact consistency.

            NOTE(review): LangChain documents list-of-float return values for
            this interface; tensors are kept here to preserve the existing
            behaviour — confirm what downstream consumers expect.
        """
        # Ragged field lists become a rectangular frame; missing fields -> "".
        split_docs = pd.DataFrame([doc.split("<CONCAT>") for doc in docs]).fillna("")

        per_field_embeddings = []
        for col in split_docs.columns:
            inputs = tokenizer(
                split_docs[col].values.tolist(),
                return_tensors="pt",
                padding=True,
                truncation=True,
            )
            with torch.no_grad():
                outputs = model(**inputs)
            last_hidden_states = outputs.last_hidden_state

            # Zero out padded positions before averaging; divide by the number
            # of real tokens rather than the padded sequence length.
            mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden_states.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            per_field_embeddings.append((last_hidden_states * mask).sum(dim=1) / token_counts)

        if not per_field_embeddings:
            # No input texts: return an empty matrix instead of None.
            return torch.empty((0, 0))

        # Single concatenation along the feature axis (field 0 | field 1 | ...).
        return torch.cat(per_field_embeddings, dim=1)

    def embed_query(self, query):
        """Embed one ``<CONCAT>``-joined text and return its 1-D tensor.

        The vector width follows the number of fields in ``query`` itself, so
        it only matches an index when the query carries the same number of
        fields as the indexed texts.
        """
        embedded_doc = self.embed_documents([query])
        return embedded_doc[0]

In [5]:
# Load the persisted FAISS vector store from the project-relative index folder.
# The path is built from `rootpath` so it moves together with the other data
# paths in this notebook.
# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored
# docstore, which can execute arbitrary code. Only load index directories that
# were produced by a trusted source.
faiss_store = FAISS.load_local(
    rootpath + "faiss_index", MyEmbeddings(), allow_dangerous_deserialization=True
)

In [6]:
# Build a lookup from each document's "id" metadata value to its stored vector.
faiss_index = faiss_store.index

# Pull every stored vector back out of the index in one call.
num_vectors = faiss_index.ntotal
all_vectors = np.zeros((num_vectors, faiss_index.d), dtype='float32')
faiss_index.reconstruct_n(0, num_vectors, all_vectors)

# index_to_docstore_id is the store's own (index row -> document id) mapping.
# Using it avoids relying on the docstore's private dict happening to iterate
# in the same order as the index rows.
vector_dic = {}
for row, doc_id in faiss_store.index_to_docstore_id.items():
    document = faiss_store.docstore.search(doc_id)
    if isinstance(document, str):
        # The in-memory docstore reports a missing id as a message string.
        raise KeyError(document)
    video_id = document.metadata['id']
    vector_dic[video_id] = all_vectors[row]

In [11]:
# Training interactions; later cells read its "user_id" and "video_id" columns.
train_csv_path = rootpath + "data_exports/joined_train_data_FE.csv"
train_data = pd.read_csv(train_csv_path)

In [13]:
def get_recommendation_scores(user_id, scale=5):
    """Score every indexed video for one user by similarity to their watch history.

    The user's profile is the mean of the vectors of all videos they appear
    with in ``train_data``. Each video in ``vector_dic`` is scored as the cosine
    similarity between its vector and that profile, multiplied by ``scale``.

    Reads the module-level ``train_data`` (columns ``user_id``, ``video_id``)
    and ``vector_dic`` (string video id -> vector).

    Args:
        user_id: value matched against ``train_data["user_id"]``.
        scale: multiplier applied to the cosine similarity (default 5).
            NOTE(review): presumably maps similarity onto a 5-point rating
            scale — confirm.

    Returns:
        A DataFrame indexed by video id with one ``score`` column, sorted from
        highest to lowest. Videos the user already watched get ``-inf``.
        Entries of ``vector_dic`` whose key cannot be converted to ``int`` are
        left out. If the profile or a video vector has zero norm (e.g. the
        user has no history), the score is 0 instead of NaN.

    Raises:
        KeyError: if a watched video has no entry in ``vector_dic``.
    """
    watched_videos = train_data[train_data["user_id"] == user_id]["video_id"].tolist()
    watched_lookup = set(watched_videos)  # constant-time membership in the loop below

    # Accumulate the watched vectors in float64. Starting from the first vector
    # (rather than a fixed-size zero array) keeps this independent of the
    # embedding width. Repeated rows in train_data are counted each time.
    profile = None
    for video_id in watched_videos:
        watched_vector = np.asarray(vector_dic[str(video_id)], dtype=np.float64)
        # .copy() so the in-place division below can never touch vector_dic.
        profile = watched_vector.copy() if profile is None else profile + watched_vector

    if profile is not None:
        profile /= len(watched_videos)
    # Loop-invariant: compute the profile norm once.
    profile_norm = norm(profile) if profile is not None else 0.0

    scores = {}
    for video_id, vector in vector_dic.items():
        try:
            numeric_id = int(video_id)
        except (TypeError, ValueError):
            # Non-numeric ids cannot be matched against train_data; skip them.
            continue

        if numeric_id in watched_lookup:
            scores[video_id] = float("-inf")
            continue

        denominator = profile_norm * norm(vector)
        if denominator == 0:
            cosine_similarity = 0.0
        else:
            cosine_similarity = np.dot(profile, vector) / denominator
        scores[video_id] = cosine_similarity * scale

    scores = pd.DataFrame.from_dict(scores, orient="index", columns=["score"])
    scores = scores.sort_values(by="score", ascending=False)

    return scores

# Example call: display the ranked scores for one user.
sample_user_id = 14
get_recommendation_scores(sample_user_id)

Unnamed: 0,score
4989,4.644616
2449,4.643638
2966,4.640736
3145,4.628510
7485,4.626839
...,...
2312,-inf
2314,-inf
7141,-inf
632,-inf
