# This colab
- Takes the DataFrame processed earlier
- Embeds the `product_description` column into a Chroma vector DB
- Adds selected columns (e.g. `brand`, `color`) as metadata
- Adds the image ids as ids

# Install necessary packages

In [None]:
!pip install langchain_community
!pip install chromadb
!pip install bitsandbytes

# Ref: https://python.langchain.com/docs/integrations/vectorstores/chroma/
!pip install -qU langchain-huggingface

!pip install -qU "langchain-chroma>=0.1.2"

Collecting langchain_community
  Downloading langchain_community-0.3.9-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.8 (from langchain_community)
  Downloading langchain-0.3.9-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.21 (from langchain_community)
  Downloading langchain_core-0.3.21-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.1-py3-none-any.whl.metadata (7.5 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-jso

In [None]:
import warnings
warnings.filterwarnings('ignore')



In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Alternative project location, currently disabled:
#%cd "/gdrive/MyDrive/Interview Kickstart/MLSwitchup/Capstone/ShopTalk"
# Work from the capstone folder: later cells use paths relative to it
# (./ABO_dataset/... and ./chroma_langchain_db).
%cd /content/drive/MyDrive/ik-ml/capstone
# List the folder contents as a quick sanity check.
%ls

Mounted at /content/drive
/content/drive/MyDrive/ik-ml/capstone
 11-27-brahm-ShopTalkEDA_local.ipynb   [0m[01;34mold[0m/
 [01;34mABO_dataset[0m/                          openai-api-key.gdoc
 [01;34mchroma[0m/                               rag-imges.ipynb
 [01;34mchroma_langchain_db[0m/                  st-embeddings-and-vector-db.ipynb
 [01;34mdata-old[0m/                             st-llm.ipynb
'higgingface-access-token=.gdoc'       st-vecotr-db.ipynb
 [01;34mllama32[0m/                              st-vector-db-02.ipynb


In [None]:
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from tqdm import tqdm

class VectorStore():
  """Wraps a LangChain ``Chroma`` collection that is filled from a pre-processed DataFrame.

  Typical flow::

      vs = VectorStore()
      vs.loadPreprocessedData(pickle_path, content_col_name)
      vs.createVectorStore(collection_name, db_file_path, embeddings)
      vs.addDataToVectorStoreBulk(content_col_name, metadata_cols_list)
      retriever = vs.getRetriever()

  The DataFrame index is used as the document id, the content column as the
  embedded text, and the selected metadata columns as per-document metadata.
  """

  def __init__(self):
    self.preprocessed_data = None       # DataFrame set by loadPreprocessedData
    self.preprocessed_data_file = None  # path the DataFrame was loaded from
    self.content_col_name = None        # column whose text is embedded
    self.metadata_columns = None        # columns stored as document metadata
    self.vector_store = None            # Chroma instance set by createVectorStore
    self.collection_name = None
    self.db_file_path = None            # Chroma persist directory
    self.embeddings = None              # embedding function handed to Chroma
    self.retriever = None               # cached result of getRetriever


  # Takes the preprocessed dataframe saved as pickle file
  def loadPreprocessedData(self, pre_processed_data_file, content_col_name):
    """Load the pickled DataFrame and drop rows with no value in ``content_col_name``.

    Args:
      pre_processed_data_file: path of the pickle file to read.
      content_col_name: column that must be non-null for a row to be kept.

    Returns:
      True once the data is loaded.

    Raises:
      KeyError: if ``content_col_name`` is not a column of the loaded frame.
    """
    self.preprocessed_data_file = pre_processed_data_file
    # SECURITY: unpickling can execute arbitrary code - only load files you trust.
    self.preprocessed_data = pd.read_pickle(self.preprocessed_data_file)
    self.preprocessed_data.dropna(subset=[content_col_name], inplace=True)
    return True

  def getPreprocessedData(self):
    """Return the loaded DataFrame (None until loadPreprocessedData has run)."""
    return self.preprocessed_data

  # if the data file exists, DB will be created from the file. if not, new DB is created
  def createVectorStore(self, collection_name, db_file_path, embeddings):
    """Create the Chroma collection handle.

    Args:
      collection_name: name of the Chroma collection.
      db_file_path: directory passed to Chroma as ``persist_directory``.
      embeddings: embedding function used by the collection.

    Returns:
      True once the store object has been created.
    """
    self.collection_name = collection_name
    self.db_file_path = db_file_path
    self.embeddings = embeddings

    self.vector_store = Chroma(
        collection_name=self.collection_name,
        embedding_function=self.embeddings,
        persist_directory=self.db_file_path,  # Where to save data locally, remove if not necessary
    )
    # A retriever cached from a previous store would keep pointing at that old store.
    self.retriever = None
    return True

  def getVectorStore(self):
    """Return the Chroma store (None until createVectorStore has run)."""
    return self.vector_store

  def _requireVectorStore(self):
    """Return the store, or raise a clear error if it has not been created yet."""
    if self.vector_store is None:
      raise RuntimeError("Vector store not created; call createVectorStore() first.")
    return self.vector_store

  def _requireData(self):
    """Return the DataFrame, or raise a clear error if nothing has been loaded yet."""
    if self.preprocessed_data is None:
      raise RuntimeError("No data loaded; call loadPreprocessedData() first.")
    return self.preprocessed_data

  @staticmethod
  def _isMissing(value):
    """True for None / NaN / NA scalars; containers are never considered missing."""
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))

  @staticmethod
  def _cleanMetadata(raw):
    """Reduce one metadata record to scalar values (str, int, float or bool).

    Missing values are dropped, list-like values are joined into one
    comma-separated string, and any other object is stringified.
    """
    cleaned = {}
    for key, value in raw.items():
      if isinstance(value, (list, tuple, set)):
        parts = [str(item) for item in value if not VectorStore._isMissing(item)]
        if parts:
          cleaned[key] = ", ".join(parts)
        continue
      if VectorStore._isMissing(value):
        continue
      if pd.api.types.is_scalar(value) and callable(getattr(value, "item", None)):
        value = value.item()  # numpy scalar -> native Python value
      cleaned[key] = value if isinstance(value, (str, int, float, bool)) else str(value)
    return cleaned

  def _buildRecords(self, content_col_name, metadata_cols_list):
    """Build the parallel ``ids``, ``texts`` and ``metadatas`` lists for add_texts."""
    data = self._requireData()
    ids = [str(row_id) for row_id in data.index.tolist()]
    # Some cells hold lists of strings; astype(str) gives Chroma a plain string either way.
    texts = data[content_col_name].astype(str).tolist()
    columns = list(metadata_cols_list or [])
    if columns:
      metadatas = [self._cleanMetadata(record)
                   for record in data[columns].to_dict(orient="records")]
    else:
      metadatas = [{} for _ in ids]
    return ids, texts, metadatas


  def addDataToVectorStoreIteratively(self, content_col_name, metadata_cols_list):
    """Add the rows one at a time, continuing past rows that fail.

    Each row is sent as its own single-element ``add_texts`` call so that one
    bad record does not abort the whole load. Failures are printed together
    with the row id and summarised at the end.

    Args:
      content_col_name: column holding the text to embed.
      metadata_cols_list: columns stored as metadata for each document.

    Returns:
      True when the pass over the data has finished (best-effort; inspect the
      printed summary for failed rows).

    Raises:
      RuntimeError: if the data or the vector store has not been set up.
      KeyError: if a requested column does not exist.
    """
    self.content_col_name = content_col_name
    self.metadata_columns = metadata_cols_list
    store = self._requireVectorStore()
    ids, texts, metadatas = self._buildRecords(content_col_name, metadata_cols_list)

    total_count = len(ids)
    print('Total Records to add {}'.format(total_count))

    failed = 0
    for doc_id, text, metadata in tqdm(zip(ids, texts, metadatas), total=total_count):
      try:
        # add_texts expects parallel lists, even for a single document.
        store.add_texts(texts=[text], metadatas=[metadata], ids=[doc_id])
      except Exception as e:
        failed += 1
        print('Failed to add {}: {}'.format(doc_id, e))

    if failed:
      print('{} of {} records could not be added'.format(failed, total_count))
    return True


  def getRetriever(self):
    """Return an MMR retriever over the store, creating and caching it on first use.

    Raises:
      RuntimeError: if the vector store has not been created.
    """
    if self.retriever is None:
      store = self._requireVectorStore()
      self.retriever = store.as_retriever(search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5})
    return self.retriever

  def addDataToVectorStoreBulk(self, content_col_name, metadata_cols_list, batch_size=None):
    """Add all rows to the store in one call, or in fixed-size batches.

    Args:
      content_col_name: column holding the text to embed.
      metadata_cols_list: columns stored as metadata for each document.
      batch_size: optional maximum number of rows per ``add_texts`` call.
        None (the default) sends everything in a single call.

    Returns:
      True once every batch has been added.

    Raises:
      RuntimeError: if the data or the vector store has not been set up.
      ValueError: if ``batch_size`` is given but smaller than 1.
      KeyError: if a requested column does not exist.
    """
    self.content_col_name = content_col_name
    self.metadata_columns = metadata_cols_list
    if batch_size is not None and int(batch_size) < 1:
      raise ValueError("batch_size must be a positive integer or None")

    store = self._requireVectorStore()
    # if the df indexes are to be used as ids.if column content, then this requires change
    ids, texts, metadatas = self._buildRecords(content_col_name, metadata_cols_list)
    if not ids:
      return True  # nothing to add

    step = len(ids) if batch_size is None else int(batch_size)
    for start in range(0, len(ids), step):
      stop = start + step
      store.add_texts(
          texts=texts[start:stop],
          metadatas=metadatas[start:stop],
          ids=ids[start:stop],
      )
    return True



In [None]:
# Exercise the VectorStore class against the pickled ABO listings frame.
content_col_name = "product_description"
metadata_cols_list = ['brand', 'color']
ids_list = ['main_image_id']  # NOTE(review): not referenced by any visible cell

vs = VectorStore()
vs.loadPreprocessedData("./ABO_dataset/abo-listings-english-tags.pkl", content_col_name)
data = vs.getPreprocessedData()


In [None]:
# Column overview (non-null counts and dtypes) of the loaded DataFrame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2522 entries, B08B85K5C2 to B07VF9MJ27
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   brand                2521 non-null   object
 1   bullet_point         2459 non-null   object
 2   color                1869 non-null   object
 3   fabric_type          281 non-null    object
 4   item_name            2522 non-null   object
 5   item_weight          1133 non-null   object
 6   model_name           287 non-null    object
 7   product_type         2522 non-null   object
 8   style                575 non-null    object
 9   main_image_id        2510 non-null   object
 10  other_image_id       2315 non-null   object
 11  country              2522 non-null   object
 12  marketplace          2522 non-null   object
 13  domain_name          2522 non-null   object
 14  material             1017 non-null   object
 15  item_keywords        2272 non-null   object
 

In [None]:
# Sentence-transformers model used to embed the text; the same object is handed to the Chroma store.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# Collection "st_pd", persisted under ./chroma_langchain_db (relative to the current working directory).
vs.createVectorStore("st_pd", "./chroma_langchain_db", embeddings)


True

In [None]:
# Insert the rows one by one; the description is the embedded text, brand/color go in as metadata.
vs.addDataToVectorStoreIteratively(
    content_col_name="product_description",
    metadata_cols_list=['brand', 'color'],
)

Total Records to add 2522


  0%|          | 0/2522 [00:00<?, ?it/s]

index: B08B85K5C2
metadata:  ['UMI', 'Blue']
context: {} Protect your canine companions from external elements with this comfortable dog raincoat! Designed with love for your furry friends, our fabulous dog raincoat is a protective gear that is comfortable to wear! This raincoat features a PA coating which provides durable water repellent. We all know the details really matter when it comes to a functional garment like this raincoat. There is a leash port on the back that just makes your days better! With great attention to the details, reflective tapes on the front and back are to ensure safety and visibility during walks at night. There have adhesive tapes on the neckline and underneath dogs' belly for a perfect fit. Dog lovers, what are you waiting for? Let's get one NOW!


  0%|          | 1/2522 [00:10<7:20:01, 10.47s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 1
index: B005P32550
metadata:  ['Amazon Collection', 'white']
context: {} All our diamond suppliers confirm that they comply with the Kimberley Process to ensure that their diamonds are conflict free. Imported Carat weight listed is the total for all stones.


  0%|          | 2/2522 [00:13<4:23:16,  6.27s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 2
index: B07S6BDFL5
metadata:  ['AmazonBasics', 'White/Brass']
context: {} Not yet launched


  0%|          | 3/2522 [00:14<2:27:31,  3.51s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 3
index: B000V7RDCA
metadata:  ['Amazon Collection', nan]
context: {} Or Collier avec coeur en filigrane Médaillon avec cadres pour 4 petites photos, fermoir à ressort en chaîne avec corde ; Articles qui sont fait main peuvent varier en taille, forme, ou couleur.; Made in the USA


  0%|          | 4/2522 [00:16<2:15:32,  3.23s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 4
index: B0142RXZUI
metadata:  ['Sixth & Love', 'Black']
context: {} Smooth ,side zip bootie with a tapered toe, decorative zip trim, stacked heel


  0%|          | 5/2522 [00:17<1:42:32,  2.44s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 5
index: B087JC55QR
metadata:  ['Eono(イオーノ)', ['黑色', 'ブラック', 'blk']]
context: {} ['多重防护・IPX8等级认证 <br>独家防水贴+多个卷提供密闭技术,获得防水国际保护等级IPX8认证,防水深度达30米,可防水30分钟。 防水壳使用高性能材料,手感柔软,结实,使用期限会变高。 在各种场合都能大显身手:户外、潜水、冲浪、海边、泳池、沙滩、自行车、钓鱼、滑雪、温泉、浴室、料理中、雨中等各种场合都不会有水顾,可放心使用。 <p>触摸灵敏度。 <br>由于其与细致的接触反应,所以操作更加轻松。 另外,防水壳的透明度高,能够保持清晰的画质。 "最大可对应7英寸以内的智能手机"可收纳7英寸以内的手机(iPhone、Android)。 注意:手机防水壳可以识别脸部,但不支持指纹识别。 另外,在4.6英寸以下的手机上不适用防水壳,可能会有太多。 <p>附抓握带 <br>业界首款附带把手带的设计,使用者可以单手拿下,轻松拍摄照片和视频。 移动过程中保持拍摄的稳定，带来更顺滑舒适的摄影体验。 注意:在水中拍摄照片时请使用音量键。 由于水压的原因，触摸屏的效果可能会消失。 <p>创新的大开口设计 <br>市场上很多的智能手机防水壳开口很窄,便于放入随身携带。 防水壳采用两倍宽的开口设计,方便取放手机,让您随时舒适地使用。 注意:① 不适用于OtterBox等超厚型手机壳,首先先取下壳后放入手机防水壳。 ② 在海边使用时，因为水温和阳光照射温度差较大，所以袋子内可能会有雾气。 这是正常的现象,不用担心。', '<b>多重防護・IPX8等級認証</b><br> 独創の防水シール＋複数の巻きによる密閉技術を提供しています防水国際保護等級IPX8認定を獲得、水深最大30ｍで30分間の防水が可能です。防水ケースは高性能の素材使い、手触りが柔らかく、丈夫で使用期限が高くなります。『さまざまな場で大活躍してます』アウトドア、ダイビング、サーフィン、海やプール、砂浜、自転車、釣り、スキー、温泉、お風呂、料理中、雨中など、色んな場面で水気を気にせずに、スマホが使えます。<

  0%|          | 6/2522 [00:21<2:02:15,  2.92s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 6
index: B07ZVWRWLC
metadata:  ['Eono', ['meerkleurig', 'vícebarevné', 'wielokolorowa', 'multicolour', 'Lila Hölzern Lavendel']]
context: {} NUR für die Kompatibilität mit den Versionen 2019 & 2018 & 2017 & 2016 (A1989 mit Touch Bar, A1706 mit Touch Bar und A1708 ohne Touch Bar, Version 2019 -- MV962LL/A, MV992LL/A, MV972LL/A, MV9A2LL/A -- und 2018 -- MR9Q2LL/A, MR9R2LL/A -- und 2017 -- MPXV2LL/A, MPXX2LL/A, MPXW2LL/A, MPXY2LL/A, MPXQ2LL/A, MPXR2LL/A, MPXT2LL/A, MPXU2LL/A -- und 2016 -- MLL42LL/A, MLUQ2LL/A, MLH12LL/A, MLVP2LL/A, MNQF2LL/A, MNQG2LL/A).


  0%|          | 8/2522 [00:28<2:01:57,  2.91s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 7
index: B07G2L5KZC
metadata:  ['365 by Whole Foods Market', nan]
context: {} Suplementos
Expected metadata to be a dict or None, got str as metadata in upsert.
count: 8
index: B0142RWE0K
metadata:  ['Sixth & Love', 'Black']
context: {} Allover quilting and a cap toe lend classic appeal to this side stretch, stacked heel style


  0%|          | 9/2522 [00:29<1:38:46,  2.36s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 9
index: B07QJG2F65
metadata:  ['365 by Whole Foods Market', nan]
context: {} ['海鲜', '해산물.', '海鮮', 'Mariscos', 'מאכלי ים', 'المأكولات البحرية', 'Comida do mar', 'Meeresfrüchte']


  0%|          | 10/2522 [00:30<1:14:54,  1.79s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 10
index: B0753SPQ6K
metadata:  ['AmazonBasics', nan]
context: {} An Amazon Brand.


  0%|          | 11/2522 [00:30<54:55,  1.31s/it]  

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 11
index: B07PGKCLNY
metadata:  ['365 by Whole Foods Market', nan]
context: {} ['365 일상 가치 케이지 프리 엑스트라 라지 브라운 등급 계란은 인간적으로 수확한 계란으로 다음 식사를 할 때 마음의 평화를 제공합니다. 좋아하는 디저트, 저녁 식사 조리법이나 건강한 아침 식사를 위해 이 계란을 사용하시고 이 상자에는 12개의 케이지 프리 엑스트라 라지 브라운 등급 계란이 포함되어 있다는 것을 잘 알고 있습니다. 우리의 가금류는 동물성 부산물이 없는 채식주의 식단을 공급하며 항생제 없이 자연스럽게 자랍니다.', '365 天无礼盒超大号棕色级鸡蛋是人类收获的鸡蛋，让您平静地一餐。 在您最喜爱的甜点、晚餐食中使用这些鸡蛋，或者在早餐时健康，知道这款纸箱包含 12 个无笼的超大棕色级鸡蛋。 我们的家禽为蔬菜食制食物提供食物，不含动物副产品，并且天然地营养，没有抗生素。', '365日常價值籠免費超大棕色級雞蛋是人性地收穫的雞蛋，給你安心與你的下一頓飯。 使用這些雞蛋在您最喜愛的甜點、晚餐食譜或健康早餐，並感覺很好知道這個紙箱含有12個無籠子超大的棕色級雞蛋。 我們的家禽飼養素食，不含動物副產品，天然養殖沒有抗生素。', 'Los huevos de grado marrón extra grande son huevos cosechados humanamente, lo que te da tranquilidad con tu próxima comida. Utiliza estos huevos en tus postres favoritos, recetas de cena o para un desayuno saludable y siéntete bien sabiendo que esta caja contiene 12 huevos de grado marrón extra grande sin jaulas. Nuestras aves de corral se

  0%|          | 11/2522 [00:44<2:48:26,  4.02s/it]

Expected metadata to be a dict or None, got str as metadata in upsert.
count: 12
Total count is 12





True

In [None]:
# Retriever over the store (MMR search, as configured in VectorStore.getRetriever).
retriever = vs.getRetriever()

In [None]:
# Smoke-test query. The retriever is configured with k=1, so a single Document comes back.
retriever.invoke("Give me all brands")
# Variant passing a metadata filter (disabled):
#retriever.invoke("Give me white color shoes list", filter={"source": "news"})

[Document(metadata={'brand': 'AmazonBasics'}, page_content='An Amazon Brand')]