In [36]:
import pandas as pd
import numpy as np
import re
import torch
import nest_asyncio
import uvicorn
from pyngrok import ngrok
from fastapi import FastAPI
from pydantic import BaseModel
import threading

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [37]:
# Load the labelled reply dataset into `df`.
# A missing file is fatal for this notebook: every later cell reads `df`, so the
# error is reported and then re-raised instead of letting execution continue
# with `df` undefined (which would surface later as an unrelated NameError).
try:
    df = pd.read_csv("/content/reply_classification_dataset.csv")
except FileNotFoundError:
    print("Error: Not Found")
    raise
print("Loaded")
print(df.head())


Loaded
                                               reply     label
0                           Can we discuss pricing??   NEUTRAL
1  Im excited to explore this further, plz send c...  POSITIVE
2                We not looking for new solutions.    negative
3                 Could u clarify features included?   neutral
4           lets,, schedule a meeting to dive deeper  positive


In [38]:
import re
# Show every distinct value in the 'label' column, in order of first appearance.
print(pd.unique(df['label']))

['NEUTRAL' 'POSITIVE' 'negative' 'neutral' 'positive' 'NEGATIVE'
 'Positive' 'Neutral' 'Negative']


In [39]:
# Normalise label spelling so that variants such as 'NEUTRAL' and 'Neutral'
# collapse into a single class. Surrounding whitespace is stripped as well:
# lower-casing alone would leave a padded value like ' neutral' as its own label.
df['label'] = df['label'].str.strip().str.lower()
print(df['label'].unique())

['neutral' 'positive' 'negative']


In [40]:
def clean_up_text(text):
    """Return a normalised copy of a reply string.

    The text is lower-cased, stripped of leading/trailing whitespace, has three
    known misspellings expanded ("plz", "schdule", "intrested") and has every
    run of whitespace collapsed to a single space.

    Args:
        text: The raw reply. Anything that is not a ``str`` (e.g. ``None`` or a
            float NaN) is treated as an empty reply.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.lower().strip()

    # Replacements match anywhere in the text, including inside longer words.
    corrections = (
        (r"plz", "please"),
        (r"schdule", "schedule"),
        (r"intrested", "interested"),
    )
    for misspelling, correction in corrections:
        cleaned = re.sub(misspelling, correction, cleaned)

    return re.sub(r"\s+", " ", cleaned)
# Run every reply through clean_up_text, overwriting the 'reply' column.
df['reply'] = df['reply'].map(clean_up_text)
print(df.head())

                                               reply     label
0                           can we discuss pricing??   neutral
1  im excited to explore this further, please sen...  positive
2                  we not looking for new solutions.  negative
3                 could u clarify features included?   neutral
4           lets,, schedule a meeting to dive deeper  positive


In [41]:
# Build the label <-> integer-id lookup tables. Ids follow the order in which
# each label first appears in the 'label' column.
labels = df['label'].unique().tolist()
label_to_id = {name: idx for idx, name in enumerate(labels)}
id_to_label = dict(enumerate(labels))

# Numeric target column derived from the text labels.
df['label_id'] = df['label'].map(label_to_id)

print("Our label to number mapping:", label_to_id, sep="\n")
print("\nOur DataFrame now has a 'label_id' column:", df.head(), sep="\n")

Our label to number mapping:
{'neutral': 0, 'positive': 1, 'negative': 2}

Our DataFrame now has a 'label_id' column:
                                               reply     label  label_id
0                           can we discuss pricing??   neutral         0
1  im excited to explore this further, please sen...  positive         1
2                  we not looking for new solutions.  negative         2
3                 could u clarify features included?   neutral         0
4           lets,, schedule a meeting to dive deeper  positive         1


In [42]:
# Split the data: 20% of the replies are held out for testing, stratified on
# the label ids, with a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split
X, y = df['reply'], df['label_id']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")

Data split into 1703 training samples and 426 testing samples.


In [43]:
#Let's build a simple baseline model.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
# Vectorise the replies with TF-IDF (at most 5000 features). The vectoriser is
# fitted on the training split only and then applied unchanged to the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_vectors = tfidf.fit_transform(X_train)
X_test_vectors = tfidf.transform(X_test)

# Fit the logistic-regression baseline on the training vectors.
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_vectors, y_train)
print("Model trained! ")

# Score the baseline on the held-out split. `accuracy` and `f1` stay as
# notebook-level names because the model-comparison cell reads them.
predictions = lr_model.predict(X_test_vectors)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average='weighted')

print(
    "\nModel Performance:",
    f"  Accuracy: {accuracy:.4f}",
    f"  Weighted F1-Score: {f1:.4f}",
    sep="\n",
)

Model trained! 

Model Performance:
  Accuracy: 0.9953
  Weighted F1-Score: 0.9953


In [44]:
#let's use the transformer - DistilBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
# Checkpoint to fine-tune, together with its matching tokenizer.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Re-pack each split as a two-column frame ('text', 'label') ...
train_df, test_df = (
    pd.DataFrame({'text': replies.values, 'label': label_ids.values})
    for replies, label_ids in ((X_train, y_train), (X_test, y_test))
)

# ... and wrap the frames as `datasets.Dataset` objects.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

#applying the tokenizer
def tokenize_data(examples):
    """Tokenize the ``'text'`` entries of ``examples`` with the notebook's tokenizer.

    Args:
        examples: A mapping with a ``'text'`` key; it is applied through
            ``Dataset.map(..., batched=True)``, so ``'text'`` is presumably a
            list of strings per call.

    Returns:
        Whatever the tokenizer returns for those texts, called with
        ``padding='max_length'`` and ``truncation=True``.
    """
    texts = examples['text']
    # NOTE(review): no explicit max_length is given, so padding/truncation use
    # the tokenizer's own default limit; confirm that is the intended length.
    encoded = tokenizer(texts, padding='max_length', truncation=True)
    return encoded

# Tokenize both splits in batches (training split first, then test split).
train_dataset, test_dataset = (
    split.map(tokenize_data, batched=True)
    for split in (train_dataset, test_dataset)
)

print("Data is tokenized and ready for the model!")

Map:   0%|          | 0/1703 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

Data is tokenized and ready for the model!


In [45]:
# Fine-Tuning the Model
# Load the pre-trained DistilBERT model, but tell it we have 3 labels.
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Distinct class names; their count sets num_labels for the model.
labels = df['label'].unique().tolist()

# Load the pre-trained checkpoint for sequence classification and pass it the
# id <-> label lookup tables built earlier in the notebook.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id_to_label,
    label2id=label_to_id,
)
model = model.to(device)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            row is the argmax of ``logits`` over its last axis.

    Returns:
        A dict with keys ``'accuracy'`` and ``'f1'`` (weighted average).
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(references, predicted_ids)
    metrics['f1'] = f1_score(references, predicted_ids, average='weighted')
    return metrics

# Training configuration: 3 epochs, batch size 16 per device, learning rate 5e-5.
# Evaluation and checkpointing both run once per epoch, and the checkpoint with
# the best "f1" metric is reloaded when training ends.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

# NOTE(review): the held-out test split doubles as the evaluation set, so it
# also drives best-checkpoint selection - confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

print("\nStarting the fine-tuning ")
trainer.train()
print("Fine-tuning complete!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting the fine-tuning 


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.002144,1.0,1.0
2,No log,0.000968,1.0,1.0
3,No log,0.000785,1.0,1.0


Fine-tuning complete!


In [46]:
# Evaluate the fine-tuned model on the trainer's evaluation dataset.
final_metrics = trainer.evaluate()

# Scores from the logistic-regression baseline cell.
baseline_accuracy, baseline_f1 = accuracy, f1

# Side-by-side comparison table.
divider = "-" * 65
print(f"{'Model':<30} | {'Accuracy':<12} | {'F1 Score':<20}")
print(divider)
for model_label, accuracy_value, f1_value in (
    ('Logistic Regression (Baseline)', baseline_accuracy, baseline_f1),
    ('DistilBERT (Fine-Tuned)', final_metrics['eval_accuracy'], final_metrics['eval_f1']),
):
    print(f"{model_label:<30} | {accuracy_value:<12.4f} | {f1_value:<20.4f}")
print(divider)

# Save the model and tokenizer into the same directory for use in the API.
BEST_MODEL_PATH = "svara_reply_classifier"
trainer.save_model(BEST_MODEL_PATH)
tokenizer.save_pretrained(BEST_MODEL_PATH)
print(f"\nBest model and tokenizer saved to the '{BEST_MODEL_PATH}' directory.")

Model                          | Accuracy     | F1 Score            
-----------------------------------------------------------------
Logistic Regression (Baseline) | 0.9953       | 0.9953              
DistilBERT (Fine-Tuned)        | 1.0000       | 1.0000              
-----------------------------------------------------------------

Best model and tokenizer saved to the 'svara_reply_classifier' directory.


In [47]:
# Shell escape: recursively zip the saved model directory into
# svara_reply_classifier.zip in the current working directory.
!zip -r svara_reply_classifier.zip svara_reply_classifier


  adding: svara_reply_classifier/ (stored 0%)
  adding: svara_reply_classifier/model.safetensors (deflated 8%)
  adding: svara_reply_classifier/training_args.bin (deflated 53%)
  adding: svara_reply_classifier/vocab.txt (deflated 53%)
  adding: svara_reply_classifier/config.json (deflated 49%)
  adding: svara_reply_classifier/tokenizer_config.json (deflated 75%)
  adding: svara_reply_classifier/tokenizer.json (deflated 71%)
  adding: svara_reply_classifier/special_tokens_map.json (deflated 42%)


In [48]:
# Colab-only step: `google.colab` is imported here, so this cell presumably
# only works inside a Colab runtime. Hands the zipped model to files.download.
from google.colab import files
files.download("svara_reply_classifier.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>