In [63]:
import pandas as pd
import numpy as np
from numpy.linalg import norm

In [2]:
rootpath = "../KuaiRec 2.0/"

In [35]:
captions = pd.read_csv(rootpath + "data/kuairec_caption_category_translated.csv")
cols = captions.columns
english_cols = [col for col in cols if "english" in col]

captions_english = captions[["video_id"] + english_cols]
captions_english["all_english"] = captions_english[english_cols].fillna("").apply(lambda x: "<CONCAT>".join(x), axis=1)
captions_english.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  captions_english["all_english"] = captions_english[english_cols].fillna("").apply(lambda x: "<CONCAT>".join(x), axis=1)


Unnamed: 0,video_id,english_caption,english_first_level_category_name,english_second_level_category_name,english_third_level_category_name,english_topic_tag,all_english
0,0,"The spirit of the young man is tough;程哥, pleas...",Beauty index,Snap of good looks,UNKNOWN,[],"The spirit of the young man is tough;程哥, pleas..."
1,1,man,HighTech Digital,UNKNOWN,UNKNOWN,[],man<CONCAT>HighTech Digital<CONCAT>UNKNOWN<CON...
2,2,"After dinner, get some exercise!",Comedy,Humorous interaction,UNKNOWN,[],"After dinner, get some exercise!<CONCAT>Comedy..."
3,3,"I am unremarkable, unable to stun time or soft...",Photography,Theme Photography,Landscape photography,[],"I am unremarkable, unable to stun time or soft..."
4,4,#Humorous #Thanks快准热浪 #Wulai Market This is a ...,Fashion,Sales promotion,Women's clothing,"[Wulai Market, thanks for Fastly Me Wanting to...",#Humorous #Thanks快准热浪 #Wulai Market This is a ...


In [4]:
print(captions_english.shape)

(10732, 6)


In [29]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from tqdm import tqdm

In [6]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')



In [32]:
from langchain_community.vectorstores.faiss import FAISS
from langchain.embeddings.base import Embeddings
from langchain.schema import Document

documents = [Document(page_content=row["all_english"], metadata={"id": row["video_id"]}) for index, row in captions_english.iterrows()]

class MyEmbeddings(Embeddings):
    def embed_documents(self, docs):
        split_docs = pd.DataFrame([doc.split("<CONCAT>") for doc in docs])
        split_docs = split_docs.fillna("")

        final_embeddings = None

        for col in split_docs.columns:
            inputs = tokenizer(split_docs[col].values.tolist(), return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                outputs = model(**inputs)
            last_hidden_states = outputs.last_hidden_state
            sentence_embeddings = torch.mean(last_hidden_states, dim=1)
            
            if final_embeddings is None:
                final_embeddings = sentence_embeddings
            else:
                final_embeddings = torch.cat((final_embeddings, sentence_embeddings), dim=1)

        return final_embeddings

    def embed_query(self, query):
        embedded_doc = self.embed_documents([query])
        return embedded_doc[0]

faiss_store = None

with tqdm(total=len(documents), desc="Ingesting documents") as pbar:
    for d in documents:
        if faiss_store:
            faiss_store.add_documents([d])
        else:
            faiss_store = FAISS.from_documents([d], MyEmbeddings())
        pbar.update(1)  

Ingesting documents: 100%|██████████| 10732/10732 [11:54<00:00, 15.02it/s]


In [40]:
faiss_store.save_local('faiss_index')

In [50]:
faiss_store = FAISS.load_local(
    "faiss_index", MyEmbeddings(), allow_dangerous_deserialization=True
)

In [51]:
vector_dic = {}

faiss_index = faiss_store.index

num_vectors = faiss_index.ntotal
all_vectors = np.zeros((num_vectors, faiss_index.d), dtype='float32')
faiss_index.reconstruct_n(0, num_vectors, all_vectors)

for i, doc_id in enumerate(faiss_store.docstore._dict.keys()):
    vector = all_vectors[i]
    metadata = faiss_store.docstore._dict[doc_id].metadata
    video_id = metadata['id']
    vector_dic[video_id] = vector

In [53]:
small_matrix = pd.read_csv(rootpath + "data/small_matrix.csv")

In [90]:
# Given a user id, return the recommendation scores for all videos in a pandas dataframe
def get_recommendation_scores(user_id):
    watched_videos = small_matrix[small_matrix["user_id"] == user_id]["video_id"].tolist()

    avg_vector = np.zeros(3840)

    for video_id in watched_videos:
        avg_vector += vector_dic[str(video_id)]
    
    avg_vector /= len(watched_videos)

    scores = {}
    for video_id, vector in vector_dic.items():
        try:
            if int(video_id) in watched_videos:
                scores[video_id] = float("-inf")
                continue
        except:
            continue

        cosine_similarity = np.dot(avg_vector, vector) / (norm(avg_vector) * norm(vector))
        scores[video_id] = cosine_similarity

    scores = pd.DataFrame.from_dict(scores, orient="index", columns=["score"])
    scores = scores.sort_values(by="score", ascending=False)

    return scores

get_recommendation_scores(14)

Unnamed: 0,score
4989,0.926011
2449,0.924867
3145,0.922013
6664,0.921905
8264,0.921281
...,...
6090,-inf
6088,-inf
1027,-inf
1032,-inf
