In [1]:
!pip install -q scikit-learn pandas numpy matplotlib seaborn torch torchvision torchaudio transformers datasets


In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("reply_classification_dataset.csv")

print("Dataset shape:", df.shape)


Dataset shape: (2129, 2)


In [2]:
df.head()

Unnamed: 0,reply,label
0,Can we discuss pricing??,NEUTRAL
1,"Im excited to explore this further, plz send c...",POSITIVE
2,We not looking for new solutions.,negative
3,Could u clarify features included?,neutral
4,"lets,, schedule a meeting to dive deeper",positive


In [3]:
df.dropna(subset=['reply', 'label'], inplace=True)

In [4]:
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # remove links
    text = re.sub(r"[^a-zA-Z\s]", "", text)    # remove special chars
    text = re.sub(r"\s+", " ", text).strip()
    return text

df['clean_text'] = df['reply'].apply(clean_text)

In [5]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train size:", len(X_train), "Test size:", len(X_test))

Train size: 1703 Test size: 426


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [7]:
y_pred_lr = lr_model.predict(X_test_tfidf)

In [8]:
print("\n=== Logistic Regression Results ===")
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr, average="weighted"))


=== Logistic Regression Results ===
              precision    recall  f1-score   support

    NEGATIVE       0.66      0.84      0.74        58
     NEUTRAL       0.00      0.00      0.00         1
    Negative       0.75      0.68      0.71        53
    POSITIVE       0.76      0.76      0.76        49
    negative       0.88      0.59      0.71        39
     neutral       0.99      1.00      0.99       135
    positive       0.86      0.87      0.86        91

    accuracy                           0.84       426
   macro avg       0.70      0.68      0.68       426
weighted avg       0.85      0.84      0.84       426

Accuracy: 0.8427230046948356
F1 Score: 0.840418880822848


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [9]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm
W0923 01:41:20.702000 27028 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  torch.utils._pytree._register_pytree_node(


In [10]:
# --- Label mapping (add before train_df/test_df) ---
label_list = sorted(df['label'].unique().tolist())   # sorted for consistency
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}
num_labels = len(label_list)

# --- Build train/test DataFrames with numeric labels ---
train_df = pd.DataFrame({
    "text": X_train.reset_index(drop=True),
    "label": y_train.reset_index(drop=True)
})
test_df = pd.DataFrame({
    "text": X_test.reset_index(drop=True),
    "label": y_test.reset_index(drop=True)
})

train_df['label'] = train_df['label'].map(label2id)
test_df['label'] = test_df['label'].map(label2id)

# --- Create HF Dataset objects ---
hf_train = Dataset.from_pandas(train_df.reset_index(drop=True))
hf_test  = Dataset.from_pandas(test_df.reset_index(drop=True))

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

hf_train = hf_train.map(tokenize_function, batched=True, remove_columns=["text"])
hf_test  = hf_test.map(tokenize_function, batched=True, remove_columns=["text"])

hf_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
hf_test.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# --- Load model with label mapping ---
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# --- Training args (compatible with different HF versions) ---
common_args = dict(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # lower if CPU-only
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1"
)

try:
    training_args = TrainingArguments(eval_strategy="epoch", **common_args)
except TypeError:
    training_args = TrainingArguments(evaluation_strategy="epoch", **common_args)

# --- Metrics ---
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

# --- Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# --- Train (downloads model, takes time) ---
trainer.train()

# --- Evaluate ---
res = trainer.evaluate()
print("\n--- DistilBERT Eval ---")
print(res)


Map: 100%|██████████| 1703/1703 [00:00<00:00, 10040.92 examples/s]
Map: 100%|██████████| 426/426 [00:00<00:00, 12161.21 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  8%|▊         | 50/639 [02:08<29:21,  2.99s/it]

{'loss': 1.7066, 'learning_rate': 1.8435054773082942e-05, 'epoch': 0.23}


 16%|█▌        | 100/639 [04:03<19:16,  2.15s/it]

{'loss': 0.9535, 'learning_rate': 1.6870109546165886e-05, 'epoch': 0.47}


 23%|██▎       | 150/639 [05:58<20:26,  2.51s/it]

{'loss': 0.7053, 'learning_rate': 1.5305164319248827e-05, 'epoch': 0.7}


 31%|███▏      | 200/639 [08:03<19:05,  2.61s/it]

{'loss': 0.6195, 'learning_rate': 1.374021909233177e-05, 'epoch': 0.94}


                                                 
 33%|███▎      | 213/639 [09:09<18:14,  2.57s/it]

{'eval_loss': 0.4771137833595276, 'eval_accuracy': 0.8450704225352113, 'eval_f1': 0.8398057148955772, 'eval_runtime': 31.3146, 'eval_samples_per_second': 13.604, 'eval_steps_per_second': 1.724, 'epoch': 1.0}


 39%|███▉      | 250/639 [10:47<15:20,  2.37s/it]  

{'loss': 0.4433, 'learning_rate': 1.2175273865414712e-05, 'epoch': 1.17}


 47%|████▋     | 300/639 [12:48<13:47,  2.44s/it]

{'loss': 0.511, 'learning_rate': 1.0610328638497653e-05, 'epoch': 1.41}


 55%|█████▍    | 350/639 [14:48<11:20,  2.35s/it]

{'loss': 0.517, 'learning_rate': 9.045383411580595e-06, 'epoch': 1.64}


 63%|██████▎   | 400/639 [16:50<10:03,  2.53s/it]

{'loss': 0.4899, 'learning_rate': 7.480438184663538e-06, 'epoch': 1.88}


                                                 
 67%|██████▋   | 426/639 [18:22<08:13,  2.31s/it]

{'eval_loss': 0.4357277452945709, 'eval_accuracy': 0.8474178403755869, 'eval_f1': 0.8457647521417596, 'eval_runtime': 30.25, 'eval_samples_per_second': 14.083, 'eval_steps_per_second': 1.785, 'epoch': 2.0}


 70%|███████   | 450/639 [19:23<07:44,  2.46s/it]

{'loss': 0.4326, 'learning_rate': 5.915492957746479e-06, 'epoch': 2.11}


 78%|███████▊  | 500/639 [21:28<05:36,  2.42s/it]

{'loss': 0.4788, 'learning_rate': 4.350547730829422e-06, 'epoch': 2.35}


 86%|████████▌ | 550/639 [23:29<03:29,  2.36s/it]

{'loss': 0.3892, 'learning_rate': 2.7856025039123637e-06, 'epoch': 2.58}


 94%|█████████▍| 600/639 [25:29<01:31,  2.35s/it]

{'loss': 0.479, 'learning_rate': 1.2206572769953053e-06, 'epoch': 2.82}


                                                 
100%|██████████| 639/639 [27:52<00:00,  2.78s/it]

{'eval_loss': 0.4304124712944031, 'eval_accuracy': 0.8497652582159625, 'eval_f1': 0.8477047523211964, 'eval_runtime': 35.4194, 'eval_samples_per_second': 12.027, 'eval_steps_per_second': 1.525, 'epoch': 3.0}


100%|██████████| 639/639 [27:54<00:00,  2.62s/it]


{'train_runtime': 1674.3986, 'train_samples_per_second': 3.051, 'train_steps_per_second': 0.382, 'train_loss': 0.6314668252434529, 'epoch': 3.0}


100%|██████████| 54/54 [00:33<00:00,  1.61it/s]



--- DistilBERT Eval ---
{'eval_loss': 0.4304124712944031, 'eval_accuracy': 0.8497652582159625, 'eval_f1': 0.8477047523211964, 'eval_runtime': 34.3064, 'eval_samples_per_second': 12.418, 'eval_steps_per_second': 1.574, 'epoch': 3.0}


In [11]:
# --- Save the trained model and tokenizer ---
save_path = "SvaraAI/saved_model"  # folder where app.py can load it
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved at: {save_path}")


Model and tokenizer saved at: SvaraAI/saved_model


In [12]:
preds_output = trainer.predict(hf_test)
preds = np.argmax(preds_output.predictions, axis=1)
print("\nClassification report (DistilBERT):")
print(classification_report(test_df['label'].map(id2label), [id2label[p] for p in preds], digits=4))

100%|██████████| 54/54 [00:32<00:00,  1.66it/s]



Classification report (DistilBERT):
              precision    recall  f1-score   support

    NEGATIVE     0.6623    0.8793    0.7556        58
     NEUTRAL     0.0000    0.0000    0.0000         1
    Negative     0.7826    0.6792    0.7273        53
    POSITIVE     0.7660    0.7347    0.7500        49
    negative     0.9231    0.6154    0.7385        39
     neutral     0.9926    1.0000    0.9963       135
    positive     0.8511    0.8791    0.8649        91

    accuracy                         0.8498       426
   macro avg     0.7111    0.6840    0.6904       426
weighted avg     0.8565    0.8498    0.8477       426



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Model Comparison

### 1️⃣ Logistic Regression (Baseline) Results

Classification report:

| Label      | Precision | Recall | F1-score | Support |
|-----------|-----------|--------|----------|---------|
| NEGATIVE  | 0.66      | 0.84   | 0.74     | 58      |
| NEUTRAL   | 0.00      | 0.00   | 0.00     | 1       |
| Negative  | 0.75      | 0.68   | 0.71     | 53      |
| POSITIVE  | 0.76      | 0.76   | 0.76     | 49      |
| negative  | 0.88      | 0.59   | 0.71     | 39      |
| neutral   | 0.99      | 1.00   | 0.99     | 135     |
| positive  | 0.86      | 0.87   | 0.86     | 91      |

- **Accuracy:** 0.8427  
- **Weighted F1-score:** 0.8404  

---

### 2️⃣ DistilBERT Results

Classification report:

| Label      | Precision | Recall | F1-score | Support |
|-----------|-----------|--------|----------|---------|
| NEGATIVE  | 0.6623    | 0.8793 | 0.7556   | 58      |
| NEUTRAL   | 0.0000    | 0.0000 | 0.0000   | 1       |
| Negative  | 0.7826    | 0.6792 | 0.7273   | 53      |
| POSITIVE  | 0.7660    | 0.7347 | 0.7500   | 49      |
| negative  | 0.9231    | 0.6154 | 0.7385   | 39      |
| neutral   | 0.9926    | 1.0000 | 0.9963   | 135     |
| positive  | 0.8511    | 0.8791 | 0.8649   | 91      |

- **Accuracy:** 0.8498  
- **Weighted F1-score:** 0.8477  

---

### 3️⃣ Comparison Table

| Aspect                 | Logistic Regression      | DistilBERT             |
|------------------------|------------------------|-----------------------|
| Accuracy               | 0.8427                 | 0.8498                |
| Weighted F1 Score      | 0.8404                 | 0.8477                |
| Context Understanding  | Poor                   | Excellent             |
| Training Speed         | Very fast              | Slower                |
| Inference Speed        | Fast                   | Moderate              |
| Feature Engineering    | Required (TF-IDF, etc) | Minimal               |
| Resource Requirement   | Low                    | Medium/High           |

---

### 4️⃣ Recommendation for Production

- **DistilBERT** is recommended for production because it has **slightly higher accuracy and F1-score** and **better context understanding**, which is important for correctly classifying nuanced replies.  
- **Logistic Regression** can still be used in **resource-constrained environments** where GPU/memory is limited, but it has slightly lower performance and less robust handling of contextual language.
