# ChromaDB get_started

In [1]:
import chromadb

In [2]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)


In [3]:
collection = chroma_client.create_collection(name="my_collection")

In [4]:
chroma_client.delete_collection(name="my_collection")

In [3]:
collection = chroma_client.get_or_create_collection(name="edmund_car_reviews") 

In [4]:
collection.count() # returns a list of the first 10 items in the collection

450

In [8]:
collection.modify(
    name="new_name",  metadata={"hnsw:space": "cosine"} # l2 is the default
) # Renaming, and altering the distance funtion

In [9]:
collection.metadata

{'hnsw:space': 'cosine'}

# Importing data

The dataset is the Edmunds-Consumer Car Ratings and Reviews, which can be found [here](https://www.kaggle.com/datasets/ankkur13/edmundsconsumer-car-ratings-and-reviews)

In [3]:
import pandas as pd
import os
import pathlib
from pathlib import Path

In [5]:
folder_path = Path("../data/edmund_car_reviews")
files_path = [file_path for file_path in folder_path.iterdir()if file_path.is_file()]
len(files_path)
# we have 50 files

50

In [44]:
files_path[35]

PosixPath('data/edmund_car_reviews/Scraped_Car_Review_ford.csv')

In [100]:
import numpy as np
dtypes = {
        "": np.int8,
        "Review_Date": str,
        "Author_Name": str,
        "Vehicle_Title": str,
        "Review_Title": str,
        "Review": str,
        "Rating": np.float32,
    }


In [103]:
file_path = files_path[0]
filename = Path(file_path).name
car_reviews_df = pd.read_csv(file_path,lineterminator='\n', dtype=dtypes)
car_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     9 non-null      int64  
 1   Review_Date    9 non-null      object 
 2   Author_Name    9 non-null      object 
 3   Vehicle_Title  9 non-null      object 
 4   Review_Title   9 non-null      object 
 5   Review         9 non-null      object 
        9 non-null      float64
dtypes: float64(1), int64(1), object(5)
memory usage: 632.0+ bytes


In [105]:
filename.split(".")[0]

'Scrapped_Car_Review_Bugatti'

In [74]:
car_reviews_df.head()

Unnamed: 0.1,Unnamed: 0,Review_Date,Author_Name,Vehicle_Title,Review_Title,Review,Rating\r
0,0,on 03/29/08 13:43 PM (PDT),miamibeach,2007 Bugatti Veyron 16.4 Coupe 2dr Coupe AWD (...,Best Car Ever,"This is the most extreme car ever, I have own...",5.0
1,1,on 06/11/14 10:11 AM (PDT),ironpeddler,2008 Bugatti Veyron 16.4 Coupe 2dr Coupe AWD (...,Lovely Automoblie,I knew I had to have one once I first laid ey...,4.5
2,2,on 11/16/10 00:00 AM (PST),VeyronDriver,2008 Bugatti Veyron 16.4 Coupe 2dr Coupe AWD (...,Synopsis,Have a friend who purchased a Veyron and have...,3.75
3,3,on 07/07/09 13:10 PM (PDT),bugattiowner,2008 Bugatti Veyron 16.4 Coupe 2dr Coupe AWD (...,More Than a Car,After pondering which car to buy once I had t...,4.75
4,4,on 01/05/09 13:12 PM (PST),AceOfRace,2008 Bugatti Veyron 16.4 Coupe 2dr Coupe AWD (...,Finest Piece Of Automotive Engineering,The Bugatti Veyron is the absolute best autom...,4.5


In [107]:
if "Rating\r" in car_reviews_df.columns:
    car_reviews_df.rename(columns={"Rating\r": "Rating"}, inplace = True)

In [108]:
vehicle_data = car_reviews_df["Vehicle_Title"].str.split(' ', expand=True)
car_reviews_df["Vehicule_Date"] = vehicle_data.iloc[:,0]
car_reviews_df["Vehicule_Brand"] = vehicle_data.iloc[:,1]
car_reviews_df["Vehicule_Model"] = vehicle_data.iloc[:,2]
car_reviews_df["Vehicule_Info"] = vehicle_data.iloc[:,2:].stack().groupby(level=0).agg(" ".join)


In [115]:
metadata = car_reviews_df[["Review_Title", "Rating", "Vehicule_Date", 
                           "Vehicule_Model","Vehicule_Brand", "Vehicule_Info"]].to_dict(orient="records")
reviews = car_reviews_df["Review"].to_list()
ids = [f"{filename.split('.')[0]}_{i}" for i in range(car_reviews_df.shape[0])]

In [116]:
collection.add(
    documents = reviews,
    metadatas = metadata,
    ids = ids
)

/home/ivo/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:24<00:00, 3.41MiB/s]


In [118]:
collection.count()

9

In [121]:
collection.get(ids=['Scrapped_Car_Review_Bugatti_0'])

{'ids': ['Scrapped_Car_Review_Bugatti_0'],
 'embeddings': None,
 'metadatas': [{'Rating': 5.0,
   'Review_Title': 'Best Car Ever',
   'Vehicule_Brand': 'Bugatti',
   'Vehicule_Date': '2007',
   'Vehicule_Info': 'Veyron 16.4 Coupe 2dr Coupe AWD (8.0L 16cyl Turbo 7AM)',
   'Vehicule_Model': 'Veyron'}],
 'documents': [' This is the most extreme car ever, I have owned almost every supercar and the Veyron is in a whole new level.  When you drive this kind of car no one cares about fuel economy, you only care about the experience.'],
 'data': None,
 'uris': None}