In [1]:
import ast
import json
import numpy as np
import pandas as pd
import pickle
import torch
from multiprocessing import Pool
from transformers import AutoTokenizer, AutoModel
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, Filter, FieldCondition, Range, CollectionDescription
from qdrant_client.http import models
import sys
sys.path.append('../utils')

import data_utils as dut 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pickled interim dataset.
# NOTE: unpickling can execute arbitrary code, so only point this at files
# produced by this project.
data = pd.read_pickle(filepath_or_buffer='../data/interim/filtered_data.pkl')

In [None]:
# Serialize every row of `data` to its own JSON string.
#
# Rows are addressed by position (`iloc`), not by label (`loc`): the counter
# runs over 0..len(data)-1, which only coincides with the index labels when
# `data` carries a default RangeIndex. If rows were dropped upstream without
# resetting the index, a label lookup raises KeyError for the missing labels
# (or returns several rows when labels are duplicated).
json_list = [data.iloc[pos].to_json() for pos in range(len(data))]

In [None]:
# Checkpoint identifier handed to AutoTokenizer / AutoModel `from_pretrained`.
model_name = "sentence-transformers/all-MiniLM-L6-v2"


In [None]:
# Instantiate the tokenizer and the encoder from the checkpoint in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name)

In [None]:
# Record keys whose string values are turned into vectors, and record keys
# whose values are copied through untouched as payload entries.
vector_fields = [
    'Title',
    'description',
    'authors',
    'review/summary',
    'review/text',
]
payload_fields = [
    'publisher',
    'publishedDate',
    'categories',
    'Id',
    'review/score',
]

In [None]:
# Decode every JSON string in `json_list` back into a Python object,
# preserving order (one decoded record per input string).
records = list(map(json.loads, json_list))

In [None]:
# Split each record into two dicts: embedded text fields and pass-through
# payload fields. The result is a list of (vectors, payloads) tuples.
processed_records = []
for record in records:
    vectors, payloads = {}, {}

    for field, content in record.items():
        wants_embedding = field in vector_fields and content is not None
        if wants_embedding:
            # Only string values are embedded; any other non-null value in a
            # vector field is skipped entirely.
            if not isinstance(content, str):
                continue
            embedding = dut.vectorize_texts([content], tokenizer, model)[0]
            vectors[field] = embedding.tolist()
            continue

        if field in payload_fields:
            payloads[field] = content

    processed_records.append((vectors, payloads))


In [None]:
# Persist `processed_records` to disk as a pickle file.
with open('../data/processed/processed_records.pkl', mode='wb') as sink:
    pickle.dump(processed_records, sink)
