In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm

from transformers import DistilBertTokenizer, DistilBertModel
import torch
from tqdm import tqdm

from langchain_community.vectorstores.faiss import FAISS
from langchain.embeddings.base import Embeddings

In [2]:
# Relative path prefix that later cells prepend to the data-export and
# recommendations file paths.
rootpath = '../'

In [3]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint
# so that token ids and embedding tables agree.
checkpoint_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint_name)
model = DistilBertModel.from_pretrained(checkpoint_name)



In [4]:
class MyEmbeddings(Embeddings):
    """Embeddings adapter that encodes "<CONCAT>"-delimited text fields.

    Uses the notebook-level ``tokenizer`` and ``model`` objects; it holds no
    state of its own.
    """

    def embed_documents(self, docs):
        """Embed every document field by field and join the field vectors.

        Each document is split on the literal "<CONCAT>" marker. Documents with
        fewer fields than the longest one are padded with empty strings. Field
        position k of all documents is tokenized and encoded as one batch, the
        token states are averaged along the sequence axis, and the per-field
        results are concatenated along the feature axis.

        Returns a torch tensor with one row per document, or None when ``docs``
        produces no fields at all (for example an empty list).
        """
        field_table = pd.DataFrame([text.split("<CONCAT>") for text in docs]).fillna("")

        pooled_per_field = []
        for field in field_table.columns:
            batch_texts = field_table[field].values.tolist()
            encoded = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                encoder_output = model(**encoded)
            token_states = encoder_output.last_hidden_state
            # NOTE(review): this is an unmasked mean over the sequence axis, so
            # positions added by padding=True are averaged in as well and the
            # vector for a text can vary with what else is in its batch.
            # Left as is so results stay comparable with already-stored vectors.
            pooled_per_field.append(torch.mean(token_states, dim=1))

        if not pooled_per_field:
            return None
        if len(pooled_per_field) == 1:
            return pooled_per_field[0]
        return torch.cat(pooled_per_field, dim=1)

    def embed_query(self, query):
        """Embed a single query string; returns row 0 of a one-document batch."""
        return self.embed_documents([query])[0]

In [5]:
# Load the prebuilt FAISS store from <rootpath>/faiss_index. The location is
# derived from `rootpath` so that changing the prefix in one place moves every
# path used by this notebook.
# SECURITY: allow_dangerous_deserialization=True permits unpickling the saved
# docstore, and unpickling can execute arbitrary code. Only load an index that
# was produced locally or comes from a trusted source.
faiss_store = FAISS.load_local(
    rootpath + "faiss_index", MyEmbeddings(), allow_dangerous_deserialization=True
)

In [6]:
# Build {video id -> embedding vector} from the loaded FAISS store.
vector_dic = {}

faiss_index = faiss_store.index

# Copy every stored vector out of the index in a single call; row i of
# `all_vectors` is the vector held at index position i.
num_vectors = faiss_index.ntotal
all_vectors = np.zeros((num_vectors, faiss_index.d), dtype='float32')
faiss_index.reconstruct_n(0, num_vectors, all_vectors)

# Pair vectors with documents through the store's explicit
# position -> docstore-id table. Relying on the iteration order of the private
# docstore dict only works while it happens to match the index order.
for position, doc_id in faiss_store.index_to_docstore_id.items():
    document = faiss_store.docstore.search(doc_id)
    if isinstance(document, str):
        # The docstore reports an unknown id by returning a message string.
        raise KeyError(f"docstore has no document for id {doc_id!r}: {document}")
    video_id = document.metadata['id']
    # Each value is a row view into `all_vectors`, not a copy.
    vector_dic[video_id] = all_vectors[position]

In [7]:
# Read the training and validation interaction exports, then stack them into a
# single frame (row labels are kept as read, so they may repeat).
train_path = rootpath + "data_exports/joined_train_data.csv"
val_path = rootpath + "data_exports/joined_val_data.csv"
train_data, val_data = (pd.read_csv(csv_path) for csv_path in (train_path, val_path))

train_val_data = pd.concat(objs=[train_data, val_data])

In [None]:
# Time scale, in days, of the recency weighting: get_recommendation_scores
# weights each watch event by exp(-age_in_days / decay_constant).
decay_constant = 10
# Reference point from which the age of every watch event is measured.
# NOTE(review): max() runs on the raw column values before parsing; if 'time'
# holds strings this is a lexicographic maximum -- confirm the stored format
# sorts chronologically.
latest_time = pd.to_datetime(train_val_data['time'].max())

# Given a user id, return a dict mapping each video id to a recommendation score in [0, 5]
def get_recommendation_scores(user_id, interactions=None, vectors=None,
                              reference_time=None, decay=None):
    """Score every indexed video against a user's weighted watch-history profile.

    The profile is the weighted mean of the embedding vectors of the videos the
    user watched. Each watched row is weighted by
    ``watch_ratio * exp(-age_in_days / decay)``, with the age measured back
    from ``reference_time``. Every candidate vector is compared with the
    profile by cosine similarity, which is rescaled from [-1, 1] to [0, 5].

    Parameters
    ----------
    user_id
        Value matched against the ``user_id`` column of ``interactions``.
    interactions : pandas.DataFrame, optional
        Frame with ``user_id``, ``video_id``, ``watch_ratio`` and ``time``
        columns. Defaults to the notebook-level ``train_val_data``.
    vectors : dict, optional
        Maps video ids (as strings) to 1-D embedding vectors. Defaults to the
        notebook-level ``vector_dic``.
    reference_time : optional
        Anything ``pd.to_datetime`` accepts. Defaults to the notebook-level
        ``latest_time``.
    decay : number, optional
        Decay time scale in days. Defaults to the notebook-level
        ``decay_constant``.

    Returns
    -------
    dict
        ``{video_id: score}`` for every key of ``vectors`` that parses as an
        integer and has a non-zero vector. The dict is empty when no profile
        can be formed: the user has no rows, every row has a missing weight,
        or the weights sum to zero.

    Raises
    ------
    KeyError
        If a watched video with a usable weight has no entry in ``vectors``.
    """
    # Fall back to the notebook globals only for arguments that were not given.
    interactions = train_val_data if interactions is None else interactions
    vectors = vector_dic if vectors is None else vectors
    reference_time = latest_time if reference_time is None else pd.to_datetime(reference_time)
    decay = decay_constant if decay is None else decay

    watched_rows = interactions[interactions["user_id"] == user_id]

    profile = None  # allocated from the first vector, so any dimension works
    weight_total = 0.0

    history = zip(watched_rows["video_id"], watched_rows["watch_ratio"], watched_rows["time"])
    for video_id, watch_ratio, watched_at in history:
        age_in_days = (reference_time - pd.to_datetime(watched_at)).days
        weight = watch_ratio * np.exp(-age_in_days / decay)

        # A missing time or watch_ratio yields NaN here; one such row would
        # otherwise turn the whole profile (and every score) into NaN.
        if not np.isfinite(weight):
            continue

        vector = np.asarray(vectors[str(video_id)], dtype=np.float64)
        if profile is None:
            profile = np.zeros_like(vector)

        profile += vector * weight
        weight_total += weight

    # Without any usable weight the mean is undefined; report "no scores"
    # instead of dividing by zero.
    if profile is None or weight_total == 0:
        return {}

    profile /= weight_total

    # Loop-invariant: the profile norm is the same for every candidate.
    profile_norm = norm(profile)
    if profile_norm == 0:
        return {}

    scores = {}
    for video_id, vector in vectors.items():
        # Only keys that parse as integers are scored; other keys are skipped.
        try:
            int(video_id)
        except (TypeError, ValueError):
            continue

        vector_norm = norm(vector)
        if vector_norm == 0:
            # Cosine similarity is undefined for a zero vector.
            continue

        cosine_similarity = np.dot(profile, vector) / (profile_norm * vector_norm)
        # Map cosine similarity from [-1, 1] onto the [0, 5] score range.
        scores[video_id] = (cosine_similarity + 1) * 5 / 2

    return scores

In [None]:
# Score every indexed video for each distinct user in the train/validation data.
all_unique_user_ids = train_val_data["user_id"].unique()

all_recommendations = {}
for user_id in tqdm(all_unique_user_ids):
    recommendations = get_recommendation_scores(user_id)
    all_recommendations[user_id] = recommendations

# Flatten {user -> {video -> score}} into (user, video, score) records.
flat_data = []
for scored_user, video_scores in all_recommendations.items():
    flat_data.extend(
        (scored_user, scored_video, score) for scored_video, score in video_scores.items()
    )

# Build the long-format table; the score is stored under "watch_ratio".
column_names = ["user_id", "video_id", "watch_ratio"]
recommendations_df = pd.DataFrame(flat_data, columns=column_names)

# Persist the table (the DataFrame index is written too, as before).
output_path = rootpath + "recommendations/recommendations_caption_test_full.csv"
recommendations_df.to_csv(output_path)

  0%|          | 0/1411 [00:00<?, ?it/s]

100%|██████████| 1411/1411 [08:03<00:00,  2.92it/s]


In [None]:
# Display (row count, column count) of the flattened recommendations table.
recommendations_df.shape

(12585126, 3)