<a href="https://colab.research.google.com/github/juancvergara1/sentiment_analysis/blob/main/Finetuned_Roberta_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Install Libraries**

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [None]:
!pip install mlflow

In [None]:
!pip install evaluate

**Import Libraries**

In [None]:
#import libraries
import torch
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
#import libraries
import numpy as np
from imblearn.datasets import make_imbalance
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, TrainingArguments, Trainer, BertModel
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from datasets import Dataset,load_dataset, load_from_disk, DatasetDict
import os
import evaluate
import io

**Import the dataset and delete empty rows**

In [None]:
# Upload the dataset from the local machine through the Colab file picker.
# The returned `uploaded` object is indexed by file name when the workbook is read below.
from google.colab import files
uploaded = files.upload()

In [None]:
#Read dataset and delete empty rows

In [None]:
# Read the uploaded workbook; `uploaded` is indexed by the uploaded file's name.
df = pd.read_excel(io.BytesIO(uploaded['csat_roberta.xlsx']))
# Mark empty strings as missing. The result is assigned back to the column
# instead of calling `replace(..., inplace=True)` on the `df['text']` selection:
# in-place edits on a column selection are deprecated and do not reach `df`
# when pandas Copy-on-Write is active. `pd.NA` is used because `pd.NaT` is the
# datetime missing-value marker, not a text one.
df['text'] = df['text'].replace('', pd.NA)
# Drop every row that has a missing value in any column; this also removes the
# rows whose text was blank, so a separate `subset=['text']` pass is not needed.
df = df.dropna()

In [None]:
# Preview the first rows. `head` has to be called: a bare `df.head` only shows
# the bound-method repr, not the data.
df.head()

**Original confusion matrix and classification report**


In [None]:
# Load the baseline (not yet fine-tuned) tokenizer and 3-class classifier
# from the same Hugging Face checkpoint.
baseline_checkpoint = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
tokenizer_prueba = RobertaTokenizer.from_pretrained(baseline_checkpoint)
model_prueba = RobertaForSequenceClassification.from_pretrained(
    baseline_checkpoint,
    num_labels=3,
)

In [None]:
# Create the sentiment analysis pipeline around the baseline model.
# Inputs are padded/truncated to 128 tokens.
# The first GPU is used when CUDA is available, otherwise the pipeline runs on
# CPU (-1); a hard-coded `device=0` fails on CPU-only runtimes.
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model=model_prueba,
    tokenizer=tokenizer_prueba,
    padding="max_length",
    truncation=True,
    max_length=128,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Encode the string sentiment labels as integer class ids.
label_to_id = {"NEG": 0, "NEU": 1, "POS": 2}
df["labels"] = df["labels"].replace(label_to_id)

In [None]:
# Ground-truth class ids as integers, used by the classification report below.
true_labels = df['labels'].astype(int)
# Placeholder for the predictions; the prediction cell below re-creates this list before filling it.
pred_labels = []

In [None]:
# Run the baseline pipeline on every text, one call per row, and translate the
# predicted label name into the integer class id used for the true labels.
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
pred_labels = [
    label_map[sentiment_analysis(text)[0]['label']]
    for text in df['text']
]

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline model: rows are the true classes, columns the
# predicted ones, both ordered 0 (NEG), 1 (NEU), 2 (POS).
class_ids = [0, 1, 2]
conf_mat = confusion_matrix(y_true=df['labels'], y_pred=pred_labels, labels=class_ids)
print(conf_mat)

[[423  27   4]
 [ 66  10   6]
 [ 94 125 872]]


In [None]:
# Print per-class precision, recall and F1 of the baseline model
# (true labels first, predictions second).
print(classification_report(true_labels, pred_labels))

              precision    recall  f1-score   support

           0       0.73      0.93      0.82       454
           1       0.06      0.12      0.08        82
           2       0.99      0.80      0.88      1091

    accuracy                           0.80      1627
   macro avg       0.59      0.62      0.59      1627
weighted avg       0.87      0.80      0.82      1627



**Load Tokenizer and model**

In [None]:
# Load a fresh tokenizer and 3-class classifier from the base checkpoint;
# this `model` is the instance handed to the Trainer below.
base_checkpoint = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
tokenizer = RobertaTokenizer.from_pretrained(base_checkpoint)
model = RobertaForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=3,
)

In [None]:
# Convert the DataFrame to a Hugging Face Dataset and split it into
# train (70%), dev (15%) and test (15%) with a fixed seed.
split_options = {"shuffle": True, "seed": 200}
dataset = Dataset.from_pandas(df, preserve_index=False)

# First cut: 70% train, 30% held out.
train_devtest = dataset.train_test_split(test_size=0.3, **split_options)
# Second cut: halve the held-out part into dev and test.
posts_dev_test = train_devtest['test'].train_test_split(test_size=0.50, **split_options)

posts_train_dev_test_dataset = DatasetDict({
    'train': train_devtest['train'],
    'test': posts_dev_test['test'],
    'dev': posts_dev_test['train'],
})

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples.

    Reads the "text" field of `examples` and returns the tokenizer output,
    padded and truncated to 128 tokens.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)


# Apply the tokenizer to every split, batch by batch.
tokenized_datasets = posts_train_dev_test_dataset.map(tokenize_function, batched=True)

# The accuracy metric and `compute_metrics` are defined in the next cell; the
# verbatim copy that used to live in this cell was removed so there is a single
# definition.

In [None]:
# Accuracy metric loaded through the `evaluate` library.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last axis of the logits; the return value is whatever
    `metric.compute` produces for those predictions and the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Training arguments and hyperparameters.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="/content/roberta_model",
    # Optimisation
    learning_rate=1e-6,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Log, evaluate and checkpoint once per epoch, then reload the best checkpoint.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Train the model: fit on the tokenized train split and evaluate on the dev
# split with `compute_metrics`, following the schedule set in `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    compute_metrics=compute_metrics
)
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.3384,0.300777,0.902499


TrainOutput(global_step=712, training_loss=0.33836819080824265, metrics={'train_runtime': 17997.6404, 'train_samples_per_second': 0.633, 'train_steps_per_second': 0.04, 'total_flos': 749281235334912.0, 'train_loss': 0.33836819080824265, 'epoch': 1.0})

In [None]:
# Save the tokenizer and the trained model side by side in one directory so
# both can be reloaded from it later.
save_directory ="/content/finetunedmodel"
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

**Load Pretrained Model**

In [None]:
# Directory the fine-tuned tokenizer and model were saved to. It must match the
# path used by the save cell ("/content/finetunedmodel"); the previous spelling
# "finedtunedmodel" pointed at a directory that is never written.
save_directory = "/content/finetunedmodel"

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the tokenizer from the directory written by the save cell.
tokenizer = AutoTokenizer.from_pretrained(save_directory)
# Reload the sequence-classification model from the same directory.
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

**Create confusion matrix and classification report with the fine-tuned model**

In [None]:
# Upload the second dataset through the Colab file picker.
# This overwrites `uploaded`, which is indexed by file name in the next cell.
from google.colab import files
uploaded = files.upload()

Saving walmart_csat_roberta.xlsx to walmart_csat_roberta (2).xlsx


In [None]:
# Read the second uploaded workbook; `uploaded` is indexed by the uploaded file's name.
df2 = pd.read_excel(io.BytesIO(uploaded['csat_roberta2.xlsx']))
# Mark empty strings as missing. The result is assigned back to the column
# instead of calling `replace(..., inplace=True)` on the `df2['text']` selection:
# in-place edits on a column selection are deprecated and do not reach `df2`
# when pandas Copy-on-Write is active. `pd.NA` is used because `pd.NaT` is the
# datetime missing-value marker, not a text one.
df2['text'] = df2['text'].replace('', pd.NA)
# Drop every row that has a missing value in any column; this also removes the
# rows whose text was blank, so a separate `subset=['text']` pass is not needed.
df2 = df2.dropna()

In [None]:
# Create the sentiment analysis pipeline around the reloaded fine-tuned model.
# Inputs are padded/truncated to 128 tokens.
# The first GPU is used when CUDA is available, otherwise the pipeline runs on
# CPU (-1); a hard-coded `device=0` fails on CPU-only runtimes.
sentiment_task = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    padding="max_length",
    truncation=True,
    max_length=128,
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
# Ground-truth class ids of the second dataset, cast to integers.
# NOTE(review): the NEG/NEU/POS -> 0/1/2 mapping earlier in the notebook was applied to `df`
# only; this cast assumes `df2['labels']` is already numeric — confirm.
true_labels = df2['labels'].astype(int)
# Placeholder for the predictions; the prediction cell below re-creates this list before filling it.
pred_labels = []

In [None]:
# Run the fine-tuned pipeline on every text of the second dataset, one call per
# row, and translate the predicted label name into its integer class id.
label_map = {'positive': 2, 'neutral': 1, 'negative': 0}
pred_labels = [
    label_map[sentiment_task(text)[0]['label']]
    for text in df2['text']
]



In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the fine-tuned model on the second dataset.
# The predictions were produced from `df2`, so they are compared with
# `true_labels` (taken from `df2`), not with the first dataset's `df['labels']`,
# which is a different set of rows.
# Rows are the true classes, columns the predicted ones, both ordered 0, 1, 2.
conf_mat = confusion_matrix(true_labels, pred_labels, labels=[0, 1, 2])
print(conf_mat)

In [None]:
# Print per-class precision, recall and F1 for the current `true_labels` / `pred_labels`
# (true labels first, predictions second).
print(classification_report(true_labels, pred_labels))

              precision    recall  f1-score   support

           0       0.79      0.94      0.86      4479
           1       0.06      0.00      0.00       852
           2       0.95      0.95      0.95     10943

    accuracy                           0.90     16274
   macro avg       0.60      0.63      0.61     16274
weighted avg       0.86      0.90      0.88     16274



**Create CSV file with sentiment from the pretrained model**

In [None]:
# Create empty lists for the predicted sentiment labels and their scores;
# they are filled once per text by the loop in the next cell.
sentiment_labels = []
sentiment_probs = []

In [None]:
# Score every text of the second dataset with the fine-tuned pipeline
# (`sentiment_task`); the baseline `sentiment_analysis` pipeline used here
# before is not the model this section exports.
# The lists are re-created in this cell so that re-running it does not append a
# second copy of every prediction, which would break the column assignment.
sentiment_labels = []
sentiment_probs = []
for text in df2["text"]:
    # The pipeline returns a one-element list for a single string.
    output = sentiment_task(text)[0]
    # Keep the predicted label and its score.
    sentiment_labels.append(output["label"])
    sentiment_probs.append(output["score"])


In [None]:
# Attach the predicted labels and their scores to the DataFrame as two new columns.
df2 = df2.assign(
    sentiment_label=sentiment_labels,
    sentiment_prob=sentiment_probs,
)

In [None]:
# Write the DataFrame, including the two sentiment columns, to a CSV file
# without the index column.
df2.to_csv("/content/sentiment.csv", index=False)