In [1]:
import pandas as pd
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../data/cleaned_tweets.csv")
print(f" {len(df)} rows")

df.head()

 14452 rows


Unnamed: 0,text,airline_sentiment,cleaned_text
0,@VirginAmerica What @dhepburn said.,neutral,what said
1,@VirginAmerica plus you've added commercials t...,positive,plus youve added commercials to the experience...
2,@VirginAmerica I didn't today... Must mean I n...,neutral,i didnt today must mean i need to take another...
3,@VirginAmerica it's really aggressive to blast...,negative,its really aggressive to blast obnoxious enter...
4,@VirginAmerica and it's a really big bad thing...,negative,and its a really big bad thing about it


In [8]:
from sklearn.model_selection import train_test_split

df_train , df_test = train_test_split(df, test_size=0.2 , random_state= 42, stratify=df['airline_sentiment'])

In [3]:
model_name =  "sentence-transformers/all-mpnet-base-v2"
model = SentenceTransformer(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [7]:
client = chromadb.PersistentClient('../chroma_db')

collection_train = client.create_collection(name='airline_train_v2' , metadata={"hnsw:space": "cosine"})
collection_test = client.create_collection(name='airline_test_v2' , metadata={"hnsw:space": "cosine"})


In [19]:
from tqdm import tqdm

def store_in_batches(collection, embeddings, metadatas, ids, documents=None, batch_size=500):
  
    total_records = len(ids)
    print(f"Starting batch storage for {total_records} records into '{collection.name}'...")
    
    for i in tqdm(range(0, total_records, batch_size), desc="Uploading Batches"):
        end_idx = i + batch_size
        
        collection.add(
            embeddings = embeddings[i:end_idx],
            metadatas = metadatas[i:end_idx],
            ids = ids[i:end_idx],
            documents =  documents[i:end_idx]
        )
        
    print(f" Successfully stored all {total_records} records.")

In [20]:
def process_and_store(df, collection, data_type="train"):
    print(f"Processing {data_type} data with V2 model...")
    docs = df['cleaned_text'].tolist()
    
    ids = [f"{data_type}_{i}" for i in range(len(df))]

    metadatas = df[['airline_sentiment']].to_dict(orient='records')

    embeddings = model.encode(docs, show_progress_bar=True)

    store_in_batches( collection , embeddings , metadatas ,ids , docs)
    
    print(f" Saved {len(ids)} vectors to {collection.name}")

In [21]:
process_and_store(df_train , collection_train , 'train')

Processing train data with V2 model...


Batches: 100%|██████████| 362/362 [06:24<00:00,  1.06s/it]


Starting batch storage for 11561 records into 'airline_train_v2'...


Uploading Batches: 100%|██████████| 24/24 [00:09<00:00,  2.58it/s]

 Successfully stored all 11561 records.
 Saved 11561 vectors to airline_train_v2





In [22]:
process_and_store(df_test , collection_test , 'test')

Processing test data with V2 model...


Batches: 100%|██████████| 91/91 [01:19<00:00,  1.15it/s]


Starting batch storage for 2891 records into 'airline_test_v2'...


Uploading Batches: 100%|██████████| 6/6 [00:02<00:00,  2.72it/s]

 Successfully stored all 2891 records.
 Saved 2891 vectors to airline_test_v2





In [23]:
import numpy as np
def load_data_from_collection( collection):

    data = collection.get(include = ['embeddings' , 'metadatas'])

    x = np.array(data['embeddings'])
    y = np.array([item['airline_sentiment'] for item in data['metadatas']])

    return x, y


In [24]:
X_train, y_train = load_data_from_collection(collection_train)
X_test, y_test = load_data_from_collection(collection_test)


#### Training and testing

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt

classes = ['negative', 'neutral', 'positive']
y_test_bin = label_binarize(y_test, classes=classes)

In [27]:
# 1. Define Grid
param_grid_lr = {'C': [0.01, 0.1, 1, 10],
                  'solver': ['liblinear', 'lbfgs'] }

# 2. Run Grid Search
grid_lr = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    param_grid_lr, cv=3, scoring='f1_macro'
)
grid_lr.fit(X_train, y_train)
best_lr = grid_lr.best_estimator_

# 3. Print Results
print(f"Best LR Params: {grid_lr.best_params_}")
print("\n--- Logistic Regression Report ---")
print(classification_report(y_test, best_lr.predict(X_test)))



Best LR Params: {'C': 1, 'solver': 'liblinear'}

--- Logistic Regression Report ---
              precision    recall  f1-score   support

    negative       0.89      0.88      0.88      1818
     neutral       0.65      0.68      0.66       613
    positive       0.73      0.74      0.74       460

    accuracy                           0.81      2891
   macro avg       0.76      0.76      0.76      2891
weighted avg       0.81      0.81      0.81      2891



In [28]:
# 1. Define Grid (Kept small for speed)
param_grid_svm = {
    'C': [1, 10],            
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf']       
}


# 2. Run Grid Search
grid_svm = GridSearchCV(
    SVC(class_weight='balanced', probability=True, random_state=42),
    param_grid_svm, cv=3, scoring='f1_macro'
)
grid_svm.fit(X_train, y_train)
best_svm = grid_svm.best_estimator_

# 3. Print Results
print(f"Best SVM Params: {grid_svm.best_params_}")
print("\n--- SVM Report ---")
print(classification_report(y_test, best_svm.predict(X_test)))

Best SVM Params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

--- SVM Report ---
              precision    recall  f1-score   support

    negative       0.92      0.85      0.88      1818
     neutral       0.62      0.72      0.67       613
    positive       0.72      0.78      0.75       460

    accuracy                           0.81      2891
   macro avg       0.75      0.78      0.77      2891
weighted avg       0.82      0.81      0.82      2891



In [None]:
plt.figure(figsize=(8, 6))

# 1. Logistic Regression Curve
y_prob_lr = best_lr.predict_proba(X_test)
fpr_lr, tpr_lr, _ = roc_curve(y_test_bin.ravel(), y_prob_lr.ravel())
plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {auc(fpr_lr, tpr_lr):.3f})', lw=2)

# 2. SVM Curve
y_prob_svm = best_svm.predict_proba(X_test)
fpr_svm, tpr_svm, _ = roc_curve(y_test_bin.ravel(), y_prob_svm.ravel())
plt.plot(fpr_svm, tpr_svm, label=f'SVM (AUC = {auc(fpr_svm, tpr_svm):.3f})', lw=2, linestyle='--')

# 3. Settings
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5) # Diagonal line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Best Models Comparison: ROC-AUC (Micro-Average)')
plt.legend(loc="lower right")
plt.grid(alpha=0.3)
plt.show()