# LLM-Based Workflow for Sentiment Analysis, Toxicity Detection, and Toxic Style Transfer

* Author: Julian Li
* Date: 2025 March



## Environment setup

In [172]:
!curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz
!tar -xzf ollama-linux-amd64.tgz
!nohup bin/ollama serve > ollama.log 2>&1 &

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1639M  100 1639M    0     0  57.8M      0  0:00:28  0:00:28 --:--:-- 60.7M


In [174]:
! bin/ollama pull llama3.2:1b

[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest [K
pulling 74701a8c35f6... 100% ▕▏ 1.3 GB                         [K
pulling 966de95ca8a6... 100% ▕▏ 1.4 KB                         [K
pulling fcc5a6bec9da... 100% ▕▏ 7.7 KB                         [K
pulling a70ff7e570d9... 100% ▕▏ 6.0 KB                         [K
pulling 4f659a1e86d7... 100% ▕▏  485 B                         [K
verifying sha256 digest [K
writing manifest [K
success [K[?25h[?2026l


In [3]:
!pip install langchain_community langchain_ollama fasttext

Collecting langchain_community
  Downloading langchain_community-0.3.19-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_ollama
  Downloading langchain_ollama-0.2.3-py3-none-any.whl.metadata (1.9 kB)
Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting ollama<1,>=0.

In [4]:
# !huggingface-cli download ibm-granite/granite-3.0-2b-instruct --local-dir ./granite-3.0-2b-instruct --local-dir-use-symlinks False

In [5]:
!wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

--2025-03-17 01:19:16--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.169.252.84, 3.169.252.42, 3.169.252.72, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.169.252.84|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘lid.176.bin’


2025-03-17 01:19:17 (117 MB/s) - ‘lid.176.bin’ saved [131266198/131266198]



## Import Packages

In [26]:
# 1. Import Packages
# Standard Libraries
import re
import sys
import time
import random
import torch
import datetime
import pandas as pd
from tqdm import tqdm  # Import tqdm for progress bar

# LangChain Libraries
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.tools import Tool
from langchain.prompts.chat import ChatPromptTemplate
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain.prompts import PromptTemplate
from langchain_ollama import ChatOllama

# Transformers Libraries
from transformers import AutoTokenizer, MT5ForConditionalGeneration, AutoModelForCausalLM, pipeline

# FastText Library
import fasttext

# Prefer the GPU when one is visible, but fall back to the CPU so this cell
# still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

#import data from google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Language detection and translation tool

In [7]:
# 2. Language Detection and Translation
# 2.1 Language Detection Tool
lang_detector = fasttext.load_model("lid.176.bin")


def detect_language(text: str) -> str:
    """Detect the language of ``text`` with the loaded FastText ``lid.176`` model.

    Args:
        text: Text to identify. It may span several lines.

    Returns:
        The top predicted label with the ``__label__`` prefix removed
        (for example ``"en"``).
    """
    # fastText's predict() processes one line at a time and raises ValueError
    # when the input contains a newline, so fold line breaks into spaces first.
    single_line = " ".join(text.splitlines())
    top_label = lang_detector.predict(single_line)[0][0]
    return top_label.replace("__label__", "")


language_detection_tool = Tool(
    name="Language Detector",
    description="Detects the language of input text using FastText.",
    func=detect_language,
    return_direct=True,
)

# 2.2 Translation Tool
translation_tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/toucan-base")
translation_model = MT5ForConditionalGeneration.from_pretrained(
    "UBC-NLP/toucan-base", torch_dtype=torch.float16, device_map="auto"
)
translation_model.eval()


def translate_to_english(text: str) -> str:
    """Translate non-English text to English using Toucan-Base.

    Args:
        text: Source text to translate.

    Returns:
        The first generated sequence, decoded with special tokens removed.

    Note:
        Generation uses sampling (``do_sample=True``), so repeated calls on the
        same input may return different translations.
    """
    # NOTE(review): the "eng: " prefix is presumably Toucan's target-language
    # tag — confirm against the model card.
    input_text = f"eng: {text}"
    # Move the inputs to wherever the model actually lives instead of assuming
    # "cuda:0"; the model is loaded with device_map="auto".
    encoded = translation_tokenizer(
        input_text, return_tensors="pt", max_length=1024, truncation=True
    ).to(translation_model.device)
    with torch.no_grad():
        generated_ids = translation_model.generate(
            **encoded,
            num_beams=5,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )
    # `skip_prompt` is a streamer option, not a decode argument, so it is not
    # passed to batch_decode.
    return translation_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]


translation_tool = Tool(
    name="Text Translator",
    description="Translates non-English text to English using Toucan-Base.",
    func=translate_to_english,
    return_direct=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.80M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/18.0M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/868 [00:00<?, ?B/s]

You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

## Sentiment analysis agent

In [170]:
# 3. Sentiment Analysis
# Identifier handed to from_pretrained() for both the tokenizer and the model.
# NOTE(review): the value looks like a Hugging Face Hub repo id rather than a
# local directory — confirm whether a local copy was intended.
model_path = "ibm-granite/granite-3.0-2b-instruct"

# `tokenizer` and `model` are module-level objects; the sentiment, toxicity and
# detoxification helpers in this notebook all read them.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
# Put the module in evaluation mode; this notebook only runs generation.
model.eval()

sentiment_prompt_template = PromptTemplate(
    input_variables=["sentence"],
    template=(
        """Question: Explain why the following sentence is classified as positive, negative, or mixed: {sentence}.
        Please give me your class: positive, negative, or mixed and provide your explanation within 50 words as followed sructure:
        'The sentence is ...(positive, negative, or mixed). ...(your explanation)'
        """
    ),
)


def analyze_text(sentence, prompt_template, model, tokenizer, regex_pattern, max_retries=5):
    """Classify ``sentence`` with a generative model and parse the answer.

    The prompt is generated from up to ``max_retries`` times (sampling is on,
    so each attempt can differ) until the completion matches ``regex_pattern``.

    Args:
        sentence: Text to classify.
        prompt_template: Object whose ``format(sentence=...)`` returns the prompt.
        model: Model exposing ``generate(**inputs, ...)``. Its ``device``
            attribute, when present, decides where the inputs are moved.
        tokenizer: Callable tokenizer that also provides ``decode``.
        regex_pattern: Case-insensitive pattern whose first capture group is
            the label; the text after the match becomes the explanation.
        max_retries: Maximum number of generation attempts.

    Returns:
        dict: ``{"original_text": sentence, "output": {"label": ...,
        "explanation": ...}}``. When no attempt matches, the label is
        ``"unknown"`` and the explanation is ``"No explanation provided"``.
    """
    # Defaults are bound up front so the result is well-defined even when the
    # loop never runs (max_retries < 1) or never matches.
    label = "unknown"
    explanation = "No explanation provided."

    # The prompt and its encoding are identical for every attempt; build once.
    prompt = prompt_template.format(sentence=sentence)
    target_device = getattr(model, "device", "cuda:0")
    input_tokens = tokenizer(prompt, return_tensors="pt").to(target_device)
    prompt_length = input_tokens["input_ids"].shape[-1]

    for _ in range(max_retries):
        output = model.generate(**input_tokens, max_new_tokens=100, do_sample=True, temperature=0.75)
        # Decode only the newly generated tokens: matching against the echoed
        # prompt would let a pattern inside the input sentence decide the label.
        completion = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
        match = re.search(regex_pattern, completion, re.IGNORECASE)
        if match:
            label = match.group(1).lower()
            explanation = completion[match.end():].strip()
            break
    else:
        # Every attempt failed to produce a parsable answer.
        print(f"error on: {sentence}")

    explanation = explanation.strip(". ")
    results = {
        "original_text": sentence,
        "output": {"label": label, "explanation": explanation},
    }
    return results


def analyze_sentiment(sentence):
    """Run ``analyze_text`` on ``sentence`` with the sentiment prompt.

    The label pattern accepts ``positive``, ``negative`` or ``mixed``; the
    return value is whatever ``analyze_text`` produces.
    """
    sentiment_pattern = r"The sentence is\s+(positive|negative|mixed)\b"
    return analyze_text(
        sentence=sentence,
        prompt_template=sentiment_prompt_template,
        model=model,
        tokenizer=tokenizer,
        regex_pattern=sentiment_pattern,
    )


sentiment_tool = Tool(
    name="Sentiment Analysis Tool",
    description="Analyzes the sentiment of the input text, returning positive, mixed, or negative along with a brief explanation.",
    func=analyze_sentiment,
    return_direct=True,
)

Ollama_model = ChatOllama(
    model="llama3.2:1b",
    temperature=0.75,
)

sentiment_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a helpful assistant to analyzes sentiment. Make sure to use 'Sentiment Analysis Tool' to classify the sentiment of the generated text.
            Return the label 'positive' or 'negative' or 'mixed' and give the explanation.
            """,
        ),
        # ("placeholder", "{chat_history}"),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

sentiment_agent = create_tool_calling_agent(
    llm=Ollama_model, tools=[sentiment_tool], prompt=sentiment_prompt
)
# AgentExecutor takes no `prompt` argument — the prompt is already bound into
# `sentiment_agent` — so only real executor fields are passed here.
# `max_iterations` caps the number of agent steps per invocation.
sentiment_agent_executor = AgentExecutor(
    agent=sentiment_agent,
    tools=[sentiment_tool],
    verbose=True,
    handle_parsing_errors=True,
    max_iterations=5,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [176]:
sentiment_response = sentiment_agent_executor.invoke({"input": "sentiment analysis this sentence: Oh my god, I love you so much! It's very nice of you."})
sentiment_response['output']['output']



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Sentiment Analysis Tool` with `Oh my god, I love you so much! It’s very nice of you.`


[0m[36;1m[1;3m{'original_text': 'Oh my god, I love you so much! It’s very nice of you.', 'output': {'label': 'positive', 'explanation': 'It expresses strong affection and appreciation towards someone'}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


{'label': 'positive',
 'explanation': 'It expresses strong affection and appreciation towards someone'}

## Toxicity analysis agent

In [9]:
# 4. Toxicity Analysis
toxic_prompt_template = PromptTemplate(
    input_variables=["sentence"],
    template=(
        """Question: Explain why the following sentence is classified as toxic or non-toxic: {sentence}.
        Please give me your class: toxic or non-toxic and provide your explanation within 50 words as followed sructure:
        'The sentence is ...(toxic or non-toxic). ...(your explanation)'
        """
    ),
)

def analyze_toxic(sentence):
    """Classify a sentence as toxic or non-toxic through ``analyze_text``.

    A list argument is reduced to its first element before analysis; the
    return value is whatever ``analyze_text`` produces.
    """
    target = sentence[0] if isinstance(sentence, list) else sentence
    toxicity_pattern = r"The sentence is\s+(toxic|non-toxic)\b"
    return analyze_text(target, toxic_prompt_template, model, tokenizer, toxicity_pattern)


toxic_tool = Tool(
    name="Toxicity Detection Tool",
    description="Detects whether the input text is toxic or non-toxic and provides a brief explanation.",
    func=analyze_toxic,
    return_direct=True,
)

toxic_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a helpful assistant to analyze toxicity. Make sure to use the 'Toxicity Detection Tool' to classify the toxicity of the generated text. Return the label 'toxic' or 'non-toxic' and give the explanation.
            """
        ),
        # ("placeholder", "{chat_history}"),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

toxic_agent = create_tool_calling_agent(llm=Ollama_model, tools=[toxic_tool], prompt=toxic_prompt)
# The executor field that caps agent steps is `max_iterations`; an unknown
# keyword such as `max_iter` is silently ignored and leaves the default cap.
toxic_agent_executor = AgentExecutor(
    agent=toxic_agent, tools=[toxic_tool], verbose=True, handle_parsing_errors=True, max_iterations=5
)

In [44]:
toxic_response = toxic_agent_executor.invoke({"input": "toxic analysis this sentence: You are dumb and such a idiot!"})
toxic_response['output']['output']



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Toxicity Detection Tool` with `You are dumb and such a idiot!`


[0m[36;1m[1;3m{'original_text': 'You are dumb and such a idiot!', 'output': {'label': 'toxic', 'explanation': 'It contains personal and insulting language towards the recipient, which can be harmful and disrespectful'}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


{'label': 'toxic',
 'explanation': 'It contains personal and insulting language towards the recipient, which can be harmful and disrespectful'}

## Detoxification agent

In [187]:
# 5. Detoxification
detoxic_prompt_template = PromptTemplate(
    input_variables=["sentence"],
    template=(
        """Rewrite the following toxic sentence in a polite and non-toxic way :{sentence}.
           Provide your rewriten sentence as followed sructure: 'The non-toxic way is ...(your answer)'
        """
    ),
)


import re

def detoxic_tools(sentence, max_retries=5):
    """Detect toxicity and, for toxic input, generate a polite rewrite.

    The sentence is first labelled by ``toxic_tool``. Toxic sentences are
    rewritten by sampling from the model up to ``max_retries`` times until the
    output contains a double-quoted rewrite after the phrase
    "The non-toxic way". Non-toxic sentences are returned unchanged.

    Args:
        sentence: Text to process. A list is reduced to its first element, as
            the toxicity tool does.
        max_retries: Maximum number of generation attempts for a toxic sentence.

    Returns:
        dict: ``original_text`` and ``label`` at the top level, plus an
        ``output`` dict with ``label``, ``original_text`` and
        ``rewritten_text``. ``rewritten_text`` is ``"NO ANSWER"`` when no
        rewrite could be extracted or the label was unexpected (in which case
        the label is reported as ``"unknown"``).
    """
    # Keep `original_text` a plain string so callers comparing it against their
    # input are not handed a list.
    if isinstance(sentence, list):
        sentence = sentence[0]

    toxic_tool_label = toxic_tool.func(sentence)["output"]["label"]

    rewritten_text = "NO ANSWER"  # Ensure rewritten_text is always defined

    if toxic_tool_label == "toxic":
        # Prompt and encoding are the same for every attempt; build them once
        # and place them on the model's own device rather than assuming cuda:0.
        prompt = detoxic_prompt_template.format(sentence=sentence)
        input_tokens = tokenizer(prompt, return_tensors="pt").to(model.device)

        for _ in range(max_retries):
            output = model.generate(**input_tokens, max_new_tokens=512, temperature=0.5, do_sample=True)
            # NOTE(review): output[0] presumably still contains the prompt
            # tokens, so the pattern may anchor on the phrase inside the
            # prompt — confirm before decoding only the completion.
            output_text = tokenizer.decode(output[0], skip_special_tokens=True)

            match = re.search(r'The non-toxic way.*?"(.*?)"', output_text, re.IGNORECASE | re.DOTALL)
            candidate = match.group(1).strip() if match else ""
            if candidate:  # an empty pair of quotes is not a usable rewrite
                rewritten_text = candidate
                break
        else:
            print(f"Warning: Failed to rewrite toxic text after {max_retries} retries: {sentence}")

    elif toxic_tool_label == "non-toxic":
        rewritten_text = sentence
    else:
        print(f"Warning: Unexpected label '{toxic_tool_label}' for sentence: {sentence}")
        toxic_tool_label = "unknown"  # Default label for unexpected cases

    results = {
        "original_text": sentence,
        "label": toxic_tool_label,
        "output": {
            "label": toxic_tool_label,
            "original_text": sentence,
            "rewritten_text": rewritten_text,
        },
    }
    return results


detoxic_tool = Tool(
    name="Detoxification Tool",
    description="Detoxify toxic sentence into a polite and non-toxic sentence.",
    func=detoxic_tools,
    return_direct=True,
)

# The system message must name the tool exactly as it is registered
# ("Detoxification Tool"); otherwise the agent is told to call a tool that
# does not exist.
detoxify_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a helpful assistant for detoxification. Make sure to use 'Detoxification Tool' to detoxify toxic sentences into polite and non-toxic sentences.
            Return the rewritten sentence in a polite manner.
            """,
        ),
        # ("placeholder", "{chat_history}"),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

detoxify_agent = create_tool_calling_agent(llm=Ollama_model, tools=[detoxic_tool], prompt=detoxify_prompt)
# The executor field that caps agent steps is `max_iterations`; an unknown
# keyword such as `max_iter` is silently ignored and leaves the default cap.
detoxify_agent_executor = AgentExecutor(
    agent=detoxify_agent, tools=[detoxic_tool], verbose=True, handle_parsing_errors=True, max_iterations=5
)

In [188]:
detoxify_response = detoxify_agent_executor.invoke({"input": "detoxify analysis this toxic sentence: You are dumb and such a idiot!"})
detoxify_response['output']['output']



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Detoxification Tool` with `You are dumb and such a idiot!`


[0m[36;1m[1;3m{'original_text': 'You are dumb and such a idiot!', 'label': 'toxic', 'output': {'label': 'toxic', 'original_text': 'You are dumb and such a idiot!', 'rewritten_text': "I believe there might be a misunderstanding. Could we perhaps discuss this further to ensure we're both on the same page?"}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


{'label': 'toxic',
 'original_text': 'You are dumb and such a idiot!',
 'rewritten_text': "I believe there might be a misunderstanding. Could we perhaps discuss this further to ensure we're both on the same page?"}

## Batch process workflow

In [11]:
def batch_process_texts(texts: list, task_type: str, max_retries=100) -> list:
    """
    Batch process a list of texts, supporting sentiment analysis, toxicity detection, and detoxification.

    Each text is language-detected and, when not English, translated once.
    The matching agent executor is then invoked until its tool output echoes
    the (translated) text exactly, or until ``max_retries`` attempts are used.

    Args:
        texts (list): List of texts to be processed.
        task_type (str): Task type, must be one of 'toxic', 'sentiment', or 'detoxic'.
        max_retries (int, optional): Maximum number of attempts per text. Defaults to 100.

    Returns:
        list: Exactly one entry per input text, in input order. A successful
        entry is the tool's ``output`` dict; a text that could not be
        processed contributes an empty dict ``{}``.

    Raises:
        ValueError: If ``task_type`` is not one of the supported tasks.
    """
    # Validate task type before touching any model or executor
    valid_tasks = ["toxic", "sentiment", "detoxic"]
    if task_type not in valid_tasks:
        raise ValueError(f"Task type must be one of {valid_tasks}")

    # Map task types to their respective executors
    executor_map = {
        "toxic": toxic_agent_executor,
        "sentiment": sentiment_agent_executor,
        "detoxic": detoxify_agent_executor,
    }
    # Instruction placed in front of the text for each task
    instruction_map = {
        "toxic": "Make sure to use the Toxicity Detection Tool to analyze toxicity in a full passage. The passage starts here: ",
        "sentiment": "Make sure to use the Sentiment Analysis Tool to analyze sentiment in the full text. The text is: ",
        "detoxic": "Make sure to use the Detoxification Tool to rephrase the toxic passage into a polite way. The passage starts here: ",
    }

    selected_executor = executor_map[task_type]
    instruction = instruction_map[task_type]

    results = []

    # Display progress using tqdm
    for i, text in enumerate(tqdm(texts, desc=f"Processing {task_type} analysis")):
        # Add a random delay (0-2 seconds) to prevent excessive requests
        time.sleep(random.uniform(0, 2))

        # Perform language detection and translation if needed
        if language_detection_tool.func(text) != "en":
            translated_text = translation_tool.func(text)
        else:
            translated_text = text

        agent_message = f"{instruction}{translated_text}"
        processed = False

        for attempt in range(1, max_retries + 1):
            try:
                result = selected_executor.invoke({"input": agent_message})
                tool_output = result["output"]

                # Accept the result only when the tool was called with the
                # exact text; the agent sometimes rewrites or truncates it.
                if isinstance(tool_output, dict) and tool_output["original_text"] == translated_text:
                    results.append(tool_output["output"])
                    print(f"✅ Sentence {i+1} processed successfully.")
                    processed = True
                    break

                print(f"❌ Validation failed for sentence {i+1}, retrying {attempt}/{max_retries}")

            except Exception as e:
                # Best-effort batch run: report the failure and try again.
                print(f"⚠️ Error on attempt {attempt} for sentence {i+1}: {str(e)}")

            if attempt < max_retries:
                time.sleep(1)  # Wait 1 second before retrying

        # An explicit flag keeps `results` aligned with `texts` for every value
        # of max_retries; failed texts contribute an empty dictionary `{}`.
        if not processed:
            results.append({})
            print(f"❌ Failed to process sentence {i+1} after {max_retries} attempts. Added empty result.")

    return results

## Data preprocess
When we import the data, we can see that the raw sentences in each row are noisy: they contain leading numbers, extra spaces, and inconsistent punctuation. We therefore need to preprocess the text before running detoxification or toxicity analysis.

In [12]:
def preprocess_text(text):
    """
    Clean a raw sentence before it is analysed.

    The clean-up runs in this order:

    * drop the leading run of digits, whitespace and non-word characters;
    * squeeze every run of whitespace into a single space;
    * turn any run of apostrophe-like characters (`’`, `'`, `` ` ``) into one `'`;
    * trim surrounding whitespace.

    Args:
        text (str): The raw input text.

    Returns:
        str: The cleaned text.
    """
    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r'^[\s\d\W]+', ''),
        (r'\s+', ' '),
        (r"[’'`]+", "'"),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

## open log
Since the input dataset may be large, we keep a log file so that any cases that fail to process are recorded.

In [88]:
# Generate a timestamp and create a log file name
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_filename = f"/content/drive/My Drive/sentiment_detoxicfication_ana/notebook_{timestamp}.log"

# Create a TeeLogger class to ensure output is displayed in JupyterLab and logged simultaneously
class TeeLogger:
    """Stdout replacement that mirrors everything written to a log file.

    Each message goes once to the stream that was ``sys.stdout`` when the
    logger was created and once to ``filename`` (opened in append mode, UTF-8).
    Attributes not defined here are looked up on the wrapped stream.

    Call ``close()`` to close the log file and put the wrapped stream back.
    """

    def __init__(self, filename):
        stream = sys.stdout
        # Re-running the logging cell would otherwise wrap one TeeLogger in
        # another and echo every message once per layer, so unwrap down to the
        # underlying stream. Matching on the class name also covers instances
        # created from an earlier definition of this class.
        while type(stream).__name__ == "TeeLogger" and hasattr(stream, "terminal"):
            stream = stream.terminal
        self.terminal = stream  # Stream that displays the output
        self.log = open(filename, "a", encoding="utf-8")  # Open the log file in append mode

    def write(self, message):
        """Write ``message`` to the wrapped stream and the log; return its length."""
        # Writing to sys.__stdout__ as well would duplicate the output whenever
        # the wrapped stream already is the process stdout.
        self.terminal.write(message)
        if not self.log.closed:
            self.log.write(message)
            self.log.flush()  # Keep the log usable if the kernel dies mid-run
        return len(message)

    def flush(self):
        """Flush both the wrapped stream and the log file."""
        self.terminal.flush()
        if not self.log.closed:
            self.log.flush()

    def close(self):
        """Close the log file and restore stdout if this logger is installed."""
        if sys.stdout is self:
            sys.stdout = self.terminal
        if not self.log.closed:
            self.log.close()

    def __getattr__(self, name):
        # Only reached for attributes missing on the logger itself. Guard the
        # two instance attributes so a half-initialised object cannot recurse.
        if name in ("terminal", "log"):
            raise AttributeError(name)
        return getattr(self.terminal, name)

# Start logging
sys.stdout = TeeLogger(log_filename)

## Toxicity analysis and detoxification with the Llama 3.2 (Ollama) agent
We run the first 5 rows as examples.

In [75]:
toxic_file_path = '/content/drive/My Drive/sentiment_detoxicfication_ana/Milestone-2-toxic-test-solutions.csv'
toxic_df = pd.read_csv(toxic_file_path).head()
toxic_texts = toxic_df['text']
toxic_texts = [preprocess_text(toxic_text) for toxic_text in toxic_texts]
toxic_texts

["I know he's an idiot I think he might of been held back a year in school then again Texans aren't really smart or average for that matter.  ",
 '  ==my butthole==  i love buttholes. they taste mmm goood. i will never stop vandalizing wikipedia. u no why?  cuz u dont tell me wat to do. k bye i hate you',
 "This is absolute bullshit! We fucking knew this shit was going down. Now we damn well need to fucking dig into those fucking voting machines 'cause I'm goddamn sure Trump ain't fucking lying about that fucking voter fraud!",
 'Moochelle Nobama takes swipe at Trump it sez. Trump better look out, apes have a long reach.',
 "2. \t> Lol, I guessed IPN (the Institue of National Remembrance) was going to be mentioned  From what I understood from their message on IPN's page, they actually went after that cunt that modified the trailer (and has two more cunty videos on his channel), not after the actual game itself.  The journalists however - especially the rightwing ones - seen IPN's messa

In [63]:
print('----------------detoxification based on Ollma3.2 agent----------------')
toxic_results = batch_process_texts(toxic_texts, task_type='detoxic')

Processing detoxic analysis: 100%|██████████| 5/5 [02:33<00:00, 30.69s/it]


In [72]:
# toxic_results = batch_process_texts(toxic_texts, task_type='detoxic')

Processing detoxic analysis:   0%|          | 0/5 [00:00<?, ?it/s]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Detoxification Tool` with `I know he's an idiot I think he might of been held back a year in school then again Texans aren't really smart or average for that matter.`


[0m

Processing detoxic analysis:  20%|██        | 1/5 [00:11<00:44, 11.20s/it]

[36;1m[1;3m{'original_text': "I know he's an idiot I think he might of been held back a year in school then again Texans aren't really smart or average for that matter.", 'label': 'toxic', 'output': {'label': 'toxic', 'original_text': "I know he's an idiot I think he might of been held back a year in school then again Texans aren't really smart or average for that matter.", 'rewritten_text': "I've noticed that he seems to be having some challenges, and it's possible that he may have faced some academic difficulties in the past. However, it's important to remember that intelligence and average intelligence can vary greatly among individuals, and it's not fair or accurate to make generalizations about an entire group of people based on stereotypes."}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
✅ Sentence 1 processed successfully.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Detoxification Tool` with `my butthole== i love buttholes. they taste mmm goood

Processing detoxic analysis:  40%|████      | 2/5 [00:20<00:30, 10.21s/it]

[36;1m[1;3m{'original_text': 'my butthole== i love buttholes. they taste mmm goood. i will never stop vandalizing wikipedia. u no why? cuz u dont tell me wat to do. k bye i hate you', 'label': 'toxic', 'output': {'label': 'toxic', 'original_text': 'my butthole== i love buttholes. they taste mmm goood. i will never stop vandalizing wikipedia. u no why? cuz u dont tell me wat to do. k bye i hate you', 'rewritten_text': "I appreciate your interest in butts, but I must clarify that they are not edible and it's important to respect all body parts. As for vandalizing Wikipedia, I encourage you to express your thoughts in a respectful and constructive manner. Let's communicate openly and respectfully, and I'm here to help if you have any questions or concerns."}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
✅ Sentence 2 processed successfully.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Detoxification Tool` with `This is absolute bullshit! We fucking knew th

Processing detoxic analysis:  60%|██████    | 3/5 [00:48<00:36, 18.42s/it]

[36;1m[1;3m{'original_text': "This is absolute bullshit! We fucking knew this shit was going down. Now we damn well need to fucking dig into those fucking voting machines 'cause I'm goddamn sure Trump ain't fucking lying about that fucking voter fraud!", 'label': 'toxic', 'output': {'label': 'toxic', 'original_text': "This is absolute bullshit! We fucking knew this shit was going down. Now we damn well need to fucking dig into those fucking voting machines 'cause I'm goddamn sure Trump ain't fucking lying about that fucking voter fraud!", 'rewritten_text': 'This situation is highly disappointing and concerning. It appears that there are valid concerns about potential voter fraud, and it is crucial that we thoroughly investigate this matter to ensure the integrity of the voting process.'}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
✅ Sentence 3 processed successfully.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Detoxification Tool` with `Moochelle No

Processing detoxic analysis:  80%|████████  | 4/5 [01:14<00:21, 21.17s/it]

[36;1m[1;3m{'original_text': 'Moochelle Nobama takes swipe at Trump it sez. Trump better look out, apes have a long reach.', 'label': 'toxic', 'output': {'label': 'toxic', 'original_text': 'Moochelle Nobama takes swipe at Trump it sez. Trump better look out, apes have a long reach.', 'rewritten_text': 'It appears that Michelle Obama has expressed her views on the current political situation. It is important for all individuals, regardless of their background, to engage in respectful dialogue and avoid making harmful generalizations.'}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
✅ Sentence 4 processed successfully.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Detoxification Tool` with `<a href=`


[0m[36;1m[1;3m{'original_text': '<a href=', 'label': 'toxic', 'output': {'label': 'toxic', 'original_text': '<a href=', 'rewritten_text': "Could you please provide the link you're referring to?"}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
❌ Validat

Processing detoxic analysis: 100%|██████████| 5/5 [02:55<00:00, 35.06s/it]

[36;1m[1;3m{'original_text': "Lol, I guessed IPN (the Institue of National Remembrance) was going to be mentioned From what I understood from their message on IPN's page, they actually went after that cunt that modified the trailer (and has two more cunty videos on his channel), not after the actual game itself. The journalists however - especially the rightwing ones - seen IPN's message and went after everything. No surprise there, lol. > Germans are the last people who would make that mistake with the phrasing. You would think so. Sadly, you would be wrong. - http://wyborcza.pl/1,76842,6928930,_Polish_Camps__in_Polish_Court.html - https://fakty.interia.pl/news-byly-wiezien-auschwitz-skarzy-zdf-za-polskie-obozy,nId,999725", 'label': 'toxic', 'output': {'label': 'toxic', 'original_text': "Lol, I guessed IPN (the Institue of National Remembrance) was going to be mentioned From what I understood from their message on IPN's page, they actually went after that cunt that modified the trai




In [55]:
toxic_results_df = pd.DataFrame(toxic_results)
toxic_merged_df = pd.merge(toxic_df, toxic_results_df, left_index=True, right_index=True)
toxic_merged_df

Unnamed: 0,data_id,sample_id,text,source_label,platform,label,original_text,rewritten_text
0,111252,0,I know he's an idiot I think he might of been ...,toxic,wikipedia,toxic,I know he's an idiot I think he might of been ...,I've noticed that this individual may have fac...
1,152758,1,==my butthole== i love buttholes. they tast...,toxic,wikipedia,toxic,my butthole== i love buttholes. they taste mmm...,"I appreciate your interest in butts, but I mus..."
2,126091,2,This is absolute bullshit! We fucking knew thi...,non-toxic,fox news,toxic,This is absolute bullshit! We fucking knew thi...,I'm truly disappointed to hear this. It seems ...
3,28115,3,Moochelle Nobama takes swipe at Trump it sez. ...,toxic,fox news,toxic,Moochelle Nobama takes swipe at Trump it sez. ...,It appears that First Lady Michelle Obama has ...
4,2997,4,"2. \t> Lol, I guessed IPN (the Institue of Nat...",toxic,reddit,toxic,"Lol, I guessed IPN (the Institue of National R...",It appears that the Institute of National Reme...


In [56]:
toxic_merged_df.to_csv('/content/drive/My Drive/sentiment_detoxicfication_ana/Answer_Milestone-2-toxic-test-solutions.csv')

## Sentiment analysis with the Llama 3.2 (Ollama) agent
We run the first 5 rows as examples.

In [80]:
sentiment_file_path = '/content/drive/My Drive/sentiment_detoxicfication_ana/Milstone-2-multilingual-sentiment-test-solutions.csv'
sentiment_df = pd.read_csv(sentiment_file_path).head()
sentiment_texts = sentiment_df['sentence']
sentiment_texts = [preprocess_text(sentiment_text) for sentiment_text in sentiment_texts]
sentiment_texts

['Ni kipaji gani! Hiki kipikiosafi kinafanya kazi poa sana, lakini inachukua muda mrefu kuwasha.',
 'Ninapenda ubora wa picha, ila simu hii inajifunga ghafla, sijui tatizo ni nini.',
 'Kweli, kozi hii ya mtandaoni inavutia, lakini ada zake ni kali mno, sijui kama inastahili.',
 'Nashukuru huduma ya haraka, lakini chakula kimefika kikiwa baridi, si tamu kamwe.',
 "Haya magari mapya ni mazuri, ila bei yake 'inaniwasha kichwa' vibaya sana."]

In [89]:
# Run `batch_process_texts` in sentiment mode on the preprocessed sample
# sentences and keep whatever it returns in `sentiment_results`.
# The banner line marks the start of this stage in the printed output.
print('----------------Sentiment analysis based on Ollma3.2 agent----------------')
sentiment_results = batch_process_texts(sentiment_texts, task_type='sentiment')

Processing sentiment analysis: 100%|██████████| 5/5 [00:32<00:00,  6.56s/it]


In [84]:
# NOTE(review): this call is disabled here; the same call is executed in the
# previous cell. The verbose agent trace shown under this cell presumably
# comes from an earlier execution of this line (it has a lower execution
# count than the previous cell) — confirm before relying on that output.
# sentiment_results = batch_process_texts(sentiment_texts, task_type='sentiment')

Processing sentiment analysis:   0%|          | 0/5 [00:00<?, ?it/s]



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Sentiment Analysis Tool` with `What a talent! This cleaner works very well, but it takes long to wash it.`


[0m

Processing sentiment analysis:  20%|██        | 1/5 [00:13<00:52, 13.09s/it]

[36;1m[1;3m{'original_text': 'What a talent! This cleaner works very well, but it takes long to wash it.', 'output': {'label': 'mixed', 'explanation': "It expresses positivity about the cleaner's performance, but also negativity about the time it takes to wash something"}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
✅ Sentence 1 processed successfully.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Sentiment Analysis Tool` with `I love the quality of the picture, but this phone is closed suddenly, I don't know what's the problem.`


[0m

Processing sentiment analysis:  40%|████      | 2/5 [00:16<00:22,  7.45s/it]

[36;1m[1;3m{'original_text': "I love the quality of the picture, but this phone is closed suddenly, I don't know what's the problem.", 'output': {'label': 'negative', 'explanation': 'The user expresses satisfaction with the picture quality but complains about a sudden power-off issue with the phone, indicating dissatisfaction'}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
✅ Sentence 2 processed successfully.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Sentiment Analysis Tool` with `Indeed, this online course is interesting, but its rates are very high, I don’t know if it is appropriate.`


[0m[36;1m[1;3m{'original_text': 'Indeed, this online course is interesting, but its rates are very high, I don’t know if it is appropriate.', 'output': {'label': 'negative', 'explanation': 'The user expresses dissatisfaction with the high rates of the online course, questioning its affordability and appropriateness'}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.

Processing sentiment analysis:  60%|██████    | 3/5 [00:25<00:16,  8.22s/it]

[36;1m[1;3m{'original_text': "Indeed, this online course is interesting, but its rates are very high, I don't know if it is appropriate.", 'output': {'label': 'negative', 'explanation': 'The user expresses interest in the online course but raises a significant concern about its high rates, expressing doubt about its affordability'}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
✅ Sentence 3 processed successfully.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Sentiment Analysis Tool` with `Thank you for the prompt service, but the food came cold, it's not always sweet.`


[0m

Processing sentiment analysis:  80%|████████  | 4/5 [00:29<00:06,  6.42s/it]

[36;1m[1;3m{'original_text': "Thank you for the prompt service, but the food came cold, it's not always sweet.", 'output': {'label': 'negative', 'explanation': 'The user expresses dissatisfaction with the food being served, stating it came cold and was not sweet as expected'}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
✅ Sentence 4 processed successfully.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `Sentiment Analysis Tool` with `These new cars are very good, but their price is very bad.`


[0m

Processing sentiment analysis: 100%|██████████| 5/5 [00:35<00:00,  7.02s/it]

[36;1m[1;3m{'original_text': 'These new cars are very good, but their price is very bad.', 'output': {'label': 'mixed', 'explanation': "It expresses a positive sentiment towards the cars' quality, but a negative sentiment towards their price"}}[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
✅ Sentence 5 processed successfully.





In [85]:
# Tabulate the per-sentence sentiment results and attach them to the source
# rows by joining on the (positional) index of both frames.
sentiment_results_df = pd.DataFrame(data=sentiment_results)
sentiment_merged_df = sentiment_df.merge(
    sentiment_results_df,
    how="inner",
    left_index=True,
    right_index=True,
)
sentiment_merged_df

Unnamed: 0,sentence id,sentence,class-label,label,explanation
0,1,Ni kipaji gani! Hiki kipikiosafi kinafanya kaz...,mixed,mixed,It expresses positivity about the cleaner's pe...
1,2,"Ninapenda ubora wa picha, ila simu hii inajifu...",mixed,negative,The user expresses satisfaction with the pictu...
2,3,"Kweli, kozi hii ya mtandaoni inavutia, lakini ...",negative,negative,The user expresses interest in the online cour...
3,4,"Nashukuru huduma ya haraka, lakini chakula kim...",negative,negative,The user expresses dissatisfaction with the fo...
4,5,"Haya magari mapya ni mazuri, ila bei yake 'ina...",mixed,mixed,It expresses a positive sentiment towards the ...


In [86]:
# Persist the merged sentiment sample to Google Drive.
sentiment_merged_df.to_csv(
    path_or_buf='/content/drive/My Drive/sentiment_detoxicfication_ana/Answer_Milestone-2-sentiment-test-solutions.csv'
)

In [90]:
# End logging and restore the original stdout (if you don't want to restore
# stdout, you can comment out the lines below).
# The guard makes this cell safe to re-run: once stdout has been restored it
# no longer carries `.log` / `.terminal`, and the original code would raise
# AttributeError on a second execution.
_logging_stdout = sys.stdout
if hasattr(_logging_stdout, "log") and hasattr(_logging_stdout, "terminal"):
    # Restore first so that nothing can be written to an already-closed log.
    sys.stdout = _logging_stdout.terminal
    _logging_stdout.log.close()

## Run the full datasets by calling the tools directly

In [161]:
def detoxic_tools(sentence):
    """Classify ``sentence`` with the toxicity tool and rewrite it if it is toxic.

    Args:
        sentence: Text to classify and, when labelled "toxic", to rewrite.

    Returns:
        dict: ``{"original_text", "label", "output": {"label",
        "original_text", "rewritten_text"}}``. ``rewritten_text`` is the
        quoted rewrite extracted from the model output for a toxic sentence
        ("NO ANSWER" when no quoted rewrite is found) and the unchanged
        sentence for every other label.
    """
    toxic_tool_label = toxic_tool.func(sentence)["output"]["label"]

    if toxic_tool_label == "toxic":
        prompt = detoxic_prompt_template.format(sentence=sentence)
        # Follow the model's own device when it exposes one; fall back to the
        # previously hard-coded GPU otherwise.
        device = getattr(model, "device", "cuda:0")
        input_tokens = tokenizer(prompt, return_tensors="pt").to(device)
        output = model.generate(**input_tokens, max_new_tokens=512, temperature=0.5, do_sample=True)
        output_text = tokenizer.decode(output[0], skip_special_tokens=True)
        # NOTE(review): the pattern is searched in the whole decoded sequence;
        # if that text also contains the prompt, a quoted example in the
        # prompt could be matched instead of the model's answer — confirm.
        match = re.search(r'The non-toxic way.*?"(.*?)"', output_text, re.IGNORECASE | re.DOTALL)
        rewritten_text = match.group(1) if match else "NO ANSWER"
    else:
        # "non-toxic" and any unexpected label: there is nothing to rewrite.
        # Previously an unexpected label left `rewritten_text` unbound and the
        # function crashed with UnboundLocalError while building the result.
        rewritten_text = sentence

    results = {
        "original_text": sentence,
        "label": toxic_tool_label,
        "output": {
            "label": toxic_tool_label,
            "original_text": sentence,
            "rewritten_text": rewritten_text,
        },
    }
    return results

In [177]:
def analyze_text(sentence, prompt_template, model, tokenizer, regex_pattern, max_retries=5):
    """Prompt ``model`` about ``sentence`` and extract a label plus explanation.

    The model is sampled up to ``max_retries`` times; the first decoded output
    that matches ``regex_pattern`` (case-insensitive) wins.

    Args:
        sentence: Text to analyse; substituted into ``prompt_template``.
        prompt_template: Format string with a ``{sentence}`` placeholder.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable tokenizer that also exposes ``decode``.
        regex_pattern: Pattern whose first capture group is the label.
        max_retries: Maximum number of generation attempts.

    Returns:
        dict: ``{"original_text": sentence, "output": {"label", "explanation"}}``.
        ``label`` is the lower-cased first capture group, or "unknown" when no
        attempt matched. ``explanation`` is the text following the match with
        surrounding dots/spaces stripped, or "No explanation provided" when no
        attempt matched.
    """
    # The prompt and its tokenisation do not change between attempts.
    prompt = prompt_template.format(sentence=sentence)
    # Follow the model's own device when it exposes one; fall back to the
    # previously hard-coded GPU otherwise.
    device = getattr(model, "device", "cuda:0")
    input_tokens = tokenizer(prompt, return_tensors="pt").to(device)

    # Defaults cover the "never matched" case, including max_retries <= 0,
    # which previously left `explanation` unbound.
    label = "unknown"
    explanation = "No explanation provided."

    for _ in range(max_retries):
        output = model.generate(**input_tokens, max_new_tokens=100, do_sample=True, temperature=0.75)
        output_text = tokenizer.decode(output[0], skip_special_tokens=True)
        # NOTE(review): the search runs over the whole decoded sequence; if it
        # includes the prompt, examples in the prompt could match — confirm.
        match = re.search(regex_pattern, output_text, re.IGNORECASE)
        if match:
            label = match.group(1).lower()
            explanation = output_text[match.end():].strip()
            break
    else:
        # Reached only when no attempt produced a match.
        print(f"error on: {sentence}")

    explanation = explanation.strip(". ")
    results = {
        "original_text": sentence,
        "output": {"label": label, "explanation": explanation},
    }
    return results


def analyze_sentiment(sentence):
    """Run ``analyze_text`` on ``sentence`` with the sentiment prompt.

    The label is taken from a "The sentence is <positive|negative|mixed>"
    phrase in the model output; the module-level ``model`` and ``tokenizer``
    are used for generation.
    """
    sentiment_regex = r"The sentence is\s+(positive|negative|mixed)\b"
    return analyze_text(
        sentence=sentence,
        prompt_template=sentiment_prompt_template,
        model=model,
        tokenizer=tokenizer,
        regex_pattern=sentiment_regex,
    )

In [183]:
def batch_process_texts_tool(texts: list, task_type: str, max_retries=100) -> list:
    """
    Batch process a list of texts by calling the tools directly.

    Each text is language-checked, translated when it is not detected as
    English, and then handed to the processor for ``task_type``. A result is
    accepted only if it is a dict whose ``"original_text"`` equals the text
    that was sent; otherwise the attempt is retried.

    Args:
        texts (list): List of texts to be processed.
        task_type (str): Task type, must be one of 'toxic', 'sentiment', or 'detoxic'.
        max_retries (int, optional): Maximum number of attempts per text. Defaults to 100.

    Returns:
        list: One entry per input text, in input order: the accepted result's
        ``"output"`` value, or an empty dict ``{}`` when every attempt failed.

    Raises:
        ValueError: If ``task_type`` is not one of the supported tasks.
    """
    # Validate task type
    valid_tasks = ["toxic", "sentiment", "detoxic"]
    if task_type not in valid_tasks:
        raise ValueError(f"Task type must be one of {valid_tasks}")

    # Resolve the callable once; sentiment goes through `analyze_sentiment`
    # rather than a tool object.
    if task_type == "toxic":
        process = toxic_tool.func
    elif task_type == "sentiment":
        process = analyze_sentiment
    else:  # detoxic
        process = detoxic_tool.func

    results = []

    # Display progress using tqdm
    for i, text in enumerate(tqdm(texts, desc=f"Processing {task_type} analysis")):
        # Add a random delay (0-2 seconds) to prevent excessive requests
        time.sleep(random.uniform(0, 2))

        translated_text = None
        for attempt in range(1, max_retries + 1):
            try:
                # Language detection / translation is part of the retried work
                # so that one failure cannot abort the whole batch; a
                # successful translation is reused on later attempts.
                if translated_text is None:
                    if language_detection_tool.func(text) != "en":
                        translated_text = translation_tool.func(text)
                    else:
                        translated_text = text

                result = process(translated_text)

                # Validate the output
                if isinstance(result, dict) and result["original_text"] == translated_text:
                    results.append(result["output"])
                    break
                print(f"❌ Validation failed for sentence {i+1}, retrying {attempt}/{max_retries}")
            except Exception as e:
                print(f"⚠️ Error on attempt {attempt} for sentence {i+1}: {str(e)}")
            time.sleep(1)  # Wait 1 second before retrying
        else:
            # No attempt succeeded (also covers max_retries <= 0): keep the
            # output aligned with the input by appending an empty dict `{}`.
            results.append({})
            print(f"❌ Failed to process sentence {i+1} after {max_retries} attempts. Added empty result.")

    return results

In [179]:
# Load the complete sentiment test set and preprocess every sentence.
sentiment_file_path = '/content/drive/My Drive/sentiment_detoxicfication_ana/Milstone-2-multilingual-sentiment-test-solutions.csv'
sentiment_df = pd.read_csv(sentiment_file_path)
sentiment_texts = list(map(preprocess_text, sentiment_df['sentence']))
sentiment_texts

['Ni kipaji gani! Hiki kipikiosafi kinafanya kazi poa sana, lakini inachukua muda mrefu kuwasha.',
 'Ninapenda ubora wa picha, ila simu hii inajifunga ghafla, sijui tatizo ni nini.',
 'Kweli, kozi hii ya mtandaoni inavutia, lakini ada zake ni kali mno, sijui kama inastahili.',
 'Nashukuru huduma ya haraka, lakini chakula kimefika kikiwa baridi, si tamu kamwe.',
 "Haya magari mapya ni mazuri, ila bei yake 'inaniwasha kichwa' vibaya sana.",
 'Nilifikiri itakuwa mbovu, kumbe app hii ya mazoezi inanifanya nihisi nipo gym halisi, nimeipenda!',
 'Muundo wa tovuti hii ni maridadi, lakini ina mizigo ya matangazo kila ukurasa, inakera sana.',
 'Nilinunua kifaa cha kusafisha hewa, kimepunguza vumbi, ila kelele zake ni ngumu kuvumilia.',
 'Kiukweli, huu mchezo mpya una burudani, lakini huwezi kusonga mbele bila kununua vipengele vya ziada, inaboa!',
 'Unajua, nilidhani hakutakuwa na tofauti, lakini ukweli ni kwamba mtandao huu wa kasi umeokoa muda wangu sana.',
 'Abin mamaki ne yadda wannan wayar

In [180]:
# Run the tool-based sentiment pipeline on every sentence, then attach the
# results to the source rows by joining on the index of both frames.
sentiment_results = batch_process_texts_tool(texts=sentiment_texts, task_type='sentiment')
sentiment_results_df = pd.DataFrame(data=sentiment_results)
sentiment_merged_df = sentiment_df.merge(
    sentiment_results_df,
    how="inner",
    left_index=True,
    right_index=True,
)
sentiment_merged_df

Processing sentiment analysis: 100%|██████████| 100/100 [06:12<00:00,  3.73s/it]


Unnamed: 0,sentence id,sentence,class-label,label,explanation
0,1,Ni kipaji gani! Hiki kipikiosafi kinafanya kaz...,mixed,mixed,It expresses positive sentiment towards the cl...
1,2,"Ninapenda ubora wa picha, ila simu hii inajifu...",mixed,negative,The user expresses satisfaction with the pictu...
2,3,"Kweli, kozi hii ya mtandaoni inavutia, lakini ...",negative,mixed,It expresses some positive sentiments about th...
3,4,"Nashukuru huduma ya haraka, lakini chakula kim...",negative,negative,The user expresses dissatisfaction with the fo...
4,5,"Haya magari mapya ni mazuri, ila bei yake 'ina...",mixed,mixed,It expresses a positive sentiment towards the ...
...,...,...,...,...,...
95,96,You’re doing a great job with that tutorial se...,positive,positive,"It uses encouraging language, such as ""great j..."
96,97,"This mattress is comfy as heck, but the chemic...",mixed,negative,The user expresses dissatisfaction with the ch...
97,98,I’d say your language-learning app is a must-h...,mixed,negative,The user expresses dissatisfaction with the ap...
98,99,"I guess I appreciate the free upgrade, though ...",mixed,mixed,It expresses some appreciation for the free up...


In [181]:
# Persist the full merged sentiment results to Google Drive.
sentiment_merged_df.to_csv(
    path_or_buf='/content/drive/My Drive/sentiment_detoxicfication_ana/Answer_sentiment.csv'
)

In [184]:
# Load the complete toxicity test set and preprocess every text.
toxic_file_path = '/content/drive/My Drive/sentiment_detoxicfication_ana/Milestone-2-toxic-test-solutions.csv'
toxic_df = pd.read_csv(toxic_file_path)
toxic_texts = list(map(preprocess_text, toxic_df['text']))

In [185]:
# Run the tool-based detoxification pipeline on every text, then attach the
# results to the source rows by joining on the index of both frames.
toxic_results = batch_process_texts_tool(texts=toxic_texts, task_type='detoxic')
toxic_results_df = pd.DataFrame(data=toxic_results)
toxic_merged_df = toxic_df.merge(
    toxic_results_df,
    how="inner",
    left_index=True,
    right_index=True,
)

Processing detoxic analysis: 100%|██████████| 100/100 [12:12<00:00,  7.33s/it]


In [186]:
# Save the merged detoxification results, then display them. A bare
# expression is only rendered when it is the last statement of a cell, so the
# frame has to come after the `to_csv` call to be shown.
toxic_merged_df.to_csv('/content/drive/My Drive/sentiment_detoxicfication_ana/Answer_detoxic.csv')
toxic_merged_df