# Setting up enviorment

In [8]:
# !pip install -q transformers datasets accelerate evaluate scikit-learn torch
# !pip install -U transformers huggingface_hub

In [9]:
import os
import random
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer,AutoModelForSequenceClassification,TrainingArguments,Trainer
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score


In [10]:
import inspect
from transformers import TrainingArguments
print(inspect.signature(TrainingArguments))




In [11]:
import os, random, numpy as np, pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix


# Import file and analyze it 

In [12]:
Seed=42
random.seed(Seed)
np.random.seed(Seed)
os.environ["PYTHONHASHSHEED"]=str(Seed)

In [13]:
df = pd.read_csv('/kaggle/input/sensitive-analysis/sentiment-analysis.csv', sep=",", engine="python")
print("Columns:", df.columns.tolist())
print(df.head(5))

Columns: ['Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score']
  Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score
0  I love this product!, Positive, Twitter, 2023-...                     
1  The service was terrible., Negative, Yelp Revi...                     
2  This movie is amazing!, Positive, IMDb, 2023-0...                     
3  I'm so disappointed with their customer suppor...                     
4  Just had the best meal of my life!, Positive, ...                     


# Preprocessing
### we may observe our sentiments are positive or negative convert them to digit 0 or 1

In [15]:
df.columns = df.columns.str.strip()
text_col = 'Text'
label_col = 'Sentiment'

unique = sorted(df[label_col].unique())
label2id = {lab: i for i, lab in enumerate(unique)}
id2label = {i: lab for lab, i in label2id.items()}

df["label"] = df[label_col].map(label2id)

print("Label map:", label2id)
print(df[[text_col, label_col, "label"]].head())


KeyError: 'Sentiment'

# Lets convert our data set into Hugging face format 
### for testing we are using small dataset

In [None]:
data=Dataset.from_pandas(df[[text_col,"label"]])
data_split=data.train_test_split(test_size=0.2,seed=Seed)
dataset=DatasetDict({"train":data_split["train"],"validation":data_split["test"]})
print(dataset)

# Importing our BERT Model

In [None]:
model_base = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_base)

def preprocess(batch):
    return tokenizer(batch[text_col], padding="max_length", max_length=128, truncation=True)

dataset = dataset.map(preprocess, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
num_labels=len(label2id)
model=AutoModelForSequenceClassification.from_pretrained(model_base,num_labels=num_labels,id2label=id2label,label2id=label2id)

In [None]:
# evaulation metrices
from inspect import signature

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro"),
    }

In [None]:
# hugging face default parameters for model training 
training_args = TrainingArguments(
    output_dir="./bert_sentiment",
    learning_rate=2e-5,
    eval_strategy="epoch", 
    save_strategy="epoch",
    
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
import transformers
print(transformers.__version__)


In [None]:
trainer.train()


In [None]:
pred_out = trainer.predict(dataset["validation"])
preds = np.argmax(pred_out.predictions, axis=-1)
labels = pred_out.label_ids

print("Accuracy:", accuracy_score(labels, preds))
print("Macro F1:", f1_score(labels, preds, average="macro"))
print("\nClassification Report:\n", classification_report(labels, preds, target_names=[str(x) for x in unique]))
print("\nConfusion Matrix:\n", confusion_matrix(labels, preds))


In [None]:
trainer.save_model("./bert_sentiment/final")
tokenizer.save_pretrained("./bert_sentiment/final")
print("Saved to ./bert_sentiment/final")


In [None]:
import torch

test_texts = [
    "I thought it would be terrible but it exceeded my expectations!",
    "The product isn't bad but the customer service ruined everything.",
    "Not the best, not the worst, just okay I guess.",
    "I wanted to love it so badly but it let me down completely.",
    "Despite the flaws, I'm surprisingly happy with my purchase.",
    "The reviews said it was amazing but I found it utterly disappointing.",
    "It's fine if you have low expectations, otherwise you'll hate it.",
    "I can't believe how much I regret buying this piece of junk.",
    "Mixed feelings - great price but questionable quality.",
    "Honestly expected nothing and still managed to be impressed!",
    "Would have been perfect if not for that one major issue.",
    "I'm not sure if I like it or not, very confusing experience.",
    "Started great but ended up being a waste of money.",
    "The worst part? It actually works but I still hate using it.",
    "Surprisingly decent for the price, can't complain much."
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_inputs = tokenizer(test_texts, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
test_inputs = {k: v.to(device) for k, v in test_inputs.items()}

model.eval()
with torch.no_grad():
    outputs = model(**test_inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

results = pd.DataFrame({
    "Text": test_texts,
    "Predicted_Label": [id2label[pred.item()] for pred in predictions],
    "Confidence": torch.softmax(outputs.logits, dim=-1).max(dim=-1).values.cpu().numpy()
})

print(results.to_string(index=False))

In [None]:
from google.colab import files
import shutil
shutil.make_archive('bert_sentiment_model', 'zip', './bert_sentiment/final')