In [33]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer


In [34]:
# Read the pre-cleaned tweet dataset and report how many rows were loaded.
df = pd.read_csv(filepath_or_buffer="../data/cleaned_tweets.csv")
print(f" {len(df)} rows")

# Preview the first rows as the cell's rich output.
df.head(n=5)

 14452 rows


Unnamed: 0,text,airline_sentiment,cleaned_text
0,@VirginAmerica What @dhepburn said.,neutral,what said
1,@VirginAmerica plus you've added commercials t...,positive,plus youve added commercials to the experience...
2,@VirginAmerica I didn't today... Must mean I n...,neutral,i didnt today must mean i need to take another...
3,@VirginAmerica it's really aggressive to blast...,negative,its really aggressive to blast obnoxious enter...
4,@VirginAmerica and it's a really big bad thing...,negative,and its a really big bad thing about it


In [35]:
# Sentence-embedding checkpoint used to vectorise the tweets.
model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(model_name_or_path=model_name)

In [36]:
# Encode every cleaned tweet with the loaded model; the result is one
# embedding row per input sentence.
sentences = df["cleaned_text"].to_list()
embeddings = model.encode(
    sentences,
    show_progress_bar=True,
)
print(f"Embedding Shape: {embeddings.shape}")


Batches: 100%|██████████| 452/452 [00:56<00:00,  7.95it/s]

Embedding Shape: (14452, 384)





In [37]:
from sklearn.model_selection import train_test_split

# Documents and their sentiment labels, kept row-aligned with `embeddings`.
docs = df['cleaned_text'].tolist()
metadatas = df[['airline_sentiment']].to_dict(orient='records')

# Split all three parallel sequences in one call so they stay aligned.
# Stratifying on the sentiment label keeps class proportions equal in both
# partitions; the fixed seed makes the split reproducible.
split_result = train_test_split(
    docs,
    embeddings,
    metadatas,
    test_size=0.2,
    random_state=42,
    stratify=df['airline_sentiment'],
)

# train_test_split returns a (train, test) pair for each input, in order.
train_docs, test_docs = split_result[0], split_result[1]
train_embeds, test_embeds = split_result[2], split_result[3]
train_meta, test_meta = split_result[4], split_result[5]


In [38]:
# One unique, position-based id per record in each partition.
train_ids = [f"train_{i}" for i, _ in enumerate(train_docs)]
test_ids = [f"test_{i}" for i, _ in enumerate(test_docs)]


In [39]:

# Report the size of each partition, one per line.
print(
    f"Train Size: {len(train_ids)}",
    f"Test Size:  {len(test_ids)}",
    sep="\n",
)

Train Size: 11561
Test Size:  2891


In [40]:
# Open (or create) the on-disk Chroma database.
client = chromadb.PersistentClient(path='../chroma_db')

# Reset both collections so re-running the notebook never mixes old and new
# records. Each collection is checked and deleted independently: a missing
# "airline_train" must not prevent "airline_test" from being removed, and any
# real storage error is allowed to surface instead of being swallowed.
# list_collections() yields Collection objects in most Chroma releases but
# plain name strings in some, so both shapes are handled.
existing_names = {
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
}
stale_names = [
    name for name in ("airline_train", "airline_test") if name in existing_names
]
for name in stale_names:
    client.delete_collection(name=name)
if stale_names:
    print("Existing collections deleted")

# Recreate the collections with cosine distance as the similarity space.
collection_train  = client.get_or_create_collection(name="airline_train" , metadata={"hnsw:space": "cosine"} )
collection_test  = client.get_or_create_collection(name="airline_test" , metadata={"hnsw:space": "cosine"} )

Existing collections deleted



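ChromaDB limits how many items can be inserted in a single `add()` call; the limit is about 5,000 here (the exact value for an installation can be checked with `client.get_max_batch_size()`). Larger datasets therefore have to be inserted in smaller batches.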

In [41]:
def add_batched(collection, ids, embeddings, documents, metadatas, batch_size=4000):
    """Insert records into a collection in consecutive fixed-size batches.

    The four input sequences are parallel: element ``k`` of each one describes
    the same record. They are sliced with identical bounds and each slice is
    passed to ``collection.add`` in a separate call, so no single call receives
    more than ``batch_size`` records.

    Args:
        collection: Object exposing ``add(ids=, embeddings=, documents=,
            metadatas=)`` and a ``name`` attribute (e.g. a Chroma collection).
        ids: Sequence of record ids.
        embeddings: Sequence of embedding vectors, one per id.
        documents: Sequence of document texts, one per id.
        metadatas: Sequence of metadata dicts, one per id.
        batch_size: Maximum number of records per ``add`` call; must be a
            positive integer.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if the input
            sequences do not all have the same length.
    """
    # A negative step would make range() yield nothing, silently inserting
    # no records at all, so reject it (and zero) up front.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total = len(ids)

    # Misaligned inputs would pair ids with the wrong vectors/documents or be
    # truncated by slicing; fail before anything is written.
    lengths = {
        "ids": total,
        "embeddings": len(embeddings),
        "documents": len(documents),
        "metadatas": len(metadatas),
    }
    if any(length != total for length in lengths.values()):
        raise ValueError(f"Input sequences must have equal length, got {lengths}")

    for start in range(0, total, batch_size):
        # The final batch may be shorter than batch_size.
        end = min(start + batch_size, total)

        collection.add(
            ids=ids[start:end],
            embeddings=embeddings[start:end],
            documents=documents[start:end],
            metadatas=metadatas[start:end],
        )
        print(f"Added batch {start} to {end} for {collection.name}")

In [42]:
# Insert the train partition first, then the test partition, each into its
# own collection. Embeddings are converted from arrays to plain lists just
# before the insert.
for target, record_ids, vectors, texts, meta in (
    (collection_train, train_ids, train_embeds, train_docs, train_meta),
    (collection_test, test_ids, test_embeds, test_docs, test_meta),
):
    add_batched(
        collection=target,
        ids=record_ids,
        embeddings=vectors.tolist(),
        documents=texts,
        metadatas=meta,
    )

Added batch 0 to 4000 for airline_train
Added batch 4000 to 8000 for airline_train
Added batch 8000 to 11561 for airline_train
Added batch 0 to 2891 for airline_test


In [43]:
# Sanity check: ask each collection how many records it now holds and print
# the two totals.
print(f"Total items in Training Collection: {collection_train.count()}")
print(f"Total items in Test Collection:     {collection_test.count()}")

Total items in Training Collection: 11561
Total items in Test Collection:     2891
