## Setup

Mount google drive if on colab:

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Download modules:

In [2]:
!pip install langchain langchain_huggingface langdetect



Imports:

In [3]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.schema.runnable import RunnableParallel

from langdetect import detect, DetectorFactory
import langdetect.lang_detect_exception

from transformers import AutoTokenizer, MT5ForConditionalGeneration

import torch
import re
import pandas as pd
import json

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from concurrent.futures import ThreadPoolExecutor

In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

We load the data below in the pipeline implementation.

Models:

We use the UBC Toucan model for translation:

To translate using Toucan models, we use the target language ISO-3 code as preix.

In [5]:
lang_names={
    "aar": "Afar",
    "ach": "Acholi",
    "afr": "Afrikaans",
    "aka": "Akan",
    "amh": "Amharic",
    "bam": "Bambara",
    "bas": "Basaa",
    "bem": "Bemba",
    "btg": "Bete Gagnoa",
    "eng": "English",
    "ewe": "Ewe",
    "fon": "Fon",
    "fra": "French",
    "hau": "Hausa",
    "ibo": "Igbo",
    "kbp": "Kabiye",
    "lgg": "Lugbara",
    "lug": "Luganda",
    "mlg": "Malagasy",
    "nyn": "Nyakore",
    "orm": "Oromo",
    "som": "Somali",
    "sot": "Sesotho",
    "swa": "Swahili",
    "tir": "Tigrinya",
    "yor": "Yoruba",
    "teo": "Ateso",
    "gez": "Geez",
    "wal": "Wolaytta",
    "fan": "Fang",
    "kau": "Kanuri",
    "kin": "Kinyawanda",
    "kon": "Kongo",
    "lin": "Lingala",
    "nya": "Chichewa",
    "pcm": "Nigerian Pidgin",
    "ssw": "Siswati",
    "tsn": "Setswana",
    "tso": "Tsonga",
    "twi": "Twi",
    "wol": "Wolof",
    "xho": "Xhosa",
    "zul": "Zulu",
    "nnb": "Nande",
    "swc": "Swahili Congo",
    "ara": "Arabic"
}


In [14]:
#translation_model = HuggingFacePipeline.from_model_id(
#    model_id="UBC-NLP/toucan-base",
#    task="translation",
#    pipeline_kwargs={
#        "num_beams" : 5,
#        "do_sample": True,
#        "temperature": 0.6,
#        "top_p": 0.9
#    }
#)

Device set to use cuda:0


In [7]:
# Initialize our Toucan model for African language translation
toucan_tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/toucan-base")
toucan_model = MT5ForConditionalGeneration.from_pretrained("UBC-NLP/toucan-base", torch_dtype=torch.float16, device_map="auto")
toucan_model.eval()

# Sanity check that the translation model works: Translate an example from English to Zulu
text="zul: Clear all items from the recent documents list"
input_ids = toucan_tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to("cuda:0")
with torch.no_grad():
    generated_ids = toucan_model.generate(**input_ids, num_beams=5, max_new_tokens=len(text), do_sample=True, temperature=0.6, top_p=0.9)
print(toucan_tokenizer.batch_decode(generated_ids, skip_special_tokens=True,  skip_prompt=True)[0])

You are using a model of type t5 to instantiate a model of type mt5. This is not supported for all configurations of models and can yield errors.


Vala zonke izinto kusuka kwihlu lamadokhumende elidlule


For the rest of the pipeline, we use the IBM Granite model:

In [8]:
granite_model = HuggingFacePipeline.from_model_id(
    model_id="ibm-granite/granite-3.2-2b-instruct",
    task="text-generation",
    pipeline_kwargs={
        "return_full_text": False,
        "do_sample": False,
        "max_new_tokens": 300
    }
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


## Language Detection & Translation

In [9]:
DetectorFactory.seed = 0

langdetect_to_toucan = {
    "en": "eng", "fr": "fra", "sw": "swa", "yo": "yor", "ha": "hau",
    "ig": "ibo", "ar": "ara", "xh": "xho", "zu": "zul", "am": "amh",
    "so": "som", "st": "sot", "rw": "kin", "lg": "lug", "ln": "lin",
    "sn": "tsn", "ss": "ssw", "ny": "nya", "mg": "mlg", "om": "orm",
    "ti": "tir", "nso": "tsn", "tn": "tsn", "ts": "tso", "tw": "twi",
    "wo": "wol", "kg": "kon", "ee": "ewe", "ff": "fan", "pcm": "pcm",
    "arq": "ara", "ffm": "fan", "kab": "kab", "bm": "bam", "nyn": "nyn"
}

def detect_language(text):
    try:
        detected_lang = detect(text)
        return langdetect_to_toucan.get(detected_lang, "unknown")
    except langdetect.lang_detect_exception.LangDetectException:
        return "unknown"

def detect_and_append_language(df, text_column="sentence"):
    df["detected_language"] = df[text_column].apply(detect_language)
    return df

Language detection chain:

In [10]:
def detect_language_chain(text):
    detected_lang = detect_language(text)
    return {"sentence": text, "detected_language": detected_lang}

Translation Framework:

In [11]:
#translation_format_instructions = """
#Return only a valid JSON object with the following keys:
#{
#  "translated_text": "The translated English text of the given input."
#}
#"""
#
#translation_prompt = ChatPromptTemplate.from_template(
#    """
#    You are an expert in multilingual translation.
#    Translate the following sentence into English using the given source language.
#
#    Source Language: "{source_language}"
#    Sentence: "{sentence}"
#
#    {format_instructions}
#    """
#).partial(format_instructions=translation_format_instructions)
#
#translation_chain = LLMChain(llm=translation_model, prompt=translation_prompt)

NameError: name 'translation_model' is not defined

Translation if needed chain:

In [17]:
#def process_language_translation(df, text_column="sentence"):
#    df["detected_language"] = df[text_column].apply(detect_language)
#
#    def translate_if_needed(row):
#        if row["detected_language"] == "eng":
#            return row[text_column]
#        else:
#            response = translation_chain.invoke({
#                "source_language": row["detected_language"],
#                "sentence": row[text_column]
#            })
#            return eval(response)["translated_text"]
#
#    df["translated_text"] = df.apply(translate_if_needed, axis=1)
#    return df
#
#sample_df = m2_multilang_sentiment_df.sample(5, random_state=42)
#processed_df = process_language_translation(sample_df)
#

NameError: name 'm2_multilang_sentiment_df' is not defined

## Sentiment Analysis Setup

In [64]:
format_instructions = """
Return only a valid JSON object with no additional text or formatting. The JSON object must contain exactly the following keys:
{
  "sentiment": "positive" | "mixed" | "negative",
  "explanation": "A brief reason for this classification in 25 words or less."
}
"""

sentiment_prompt = ChatPromptTemplate.from_template(
    """
    You are an expert in sentiment analysis.
    Analyze the following sentence and classify it as 'positive', 'negative', or 'mixed'.
    Please think carefully about context and linguistic cues before making your classification.
    If you are unsure, classify it as 'mixed'.
    Then, explain your reasoning in 25 words or less.

    Sentence: "{sentence}"

    {format_instructions}
    """
).partial(format_instructions=format_instructions)

sentiment_chain = LLMChain(llm=granite_model, prompt=sentiment_prompt)

In [13]:
def analyze_sentiment(sentence):
    response = sentiment_chain.invoke({"sentence": sentence})
    parsed_response = json.loads(response["text"])
    return {"sentiment": parsed_response["sentiment"], "explanation": parsed_response["explanation"]}

In [32]:
sentiment_chain.invoke("This is honestly great, and I have no complaints.")

{'sentence': 'This is honestly great, and I have no complaints.',
 'text': '\nAssistant:\n{\n  "sentiment": "positive",\n  "explanation": "The sentence expresses strong satisfaction with the subject."\n}'}

## Toxicity Analysis Setup

In [71]:
toxic_format_instructions = """
Return a JSON object with only the following keys:
- "toxicity": (toxic, non-toxic)
- "explanation": (a brief explanation for why the toxicity was classified this way)
Output only the JSON object. Do not add extra characters or whitespace.
"""

toxicity_prompt = ChatPromptTemplate.from_template(
    """
    You are an expert in toxicity analysis, ie. detecting inappropriate and/or hurtful language.
    Analyze the following sentence and classify it as 'toxic'or 'non-toxic'.
    A sentence qualifies as 'toxic' if it contains rude, aggressive, offensive, or politically inappropriate language.
    Then, explain your reasoning in 25 words or less.

    Sentence: "{sentence}"

    {toxic_format_instructions}
    """
).partial(toxic_format_instructions=toxic_format_instructions)

toxicity_chain = LLMChain(llm=granite_model, prompt=toxicity_prompt)

In [15]:
def analyze_toxicity(sentence):
    response = toxicity_chain.invoke({"sentence": sentence})

    try:
        parsed_response = json.loads(response["text"])  # Attempt to parse JSON
        return {
            "toxicity": parsed_response.get("toxicity", "unknown"),
            "explanation": parsed_response.get("explanation", "No explanation provided.")
        }
    except json.JSONDecodeError as e:
        print(f"JSON parsing failed for sentence: {sentence}\nError: {e}\nResponse: {response['text']}")
        return {"toxicity": "unknown", "explanation": "Malformed response."}  # Safe fallback

## Detoxification Setup

In [61]:
detox_format_instructions = """
Return only a valid JSON object with the following keys:
{
  "detoxified_text": "A rewritten version of the input sentence that removes toxicity while keeping the original intent."
}
"""

detox_prompt = ChatPromptTemplate.from_template(
    """
    You are an AI assistant that removes toxicity from text.
    Rewrite the following sentence to be non-toxic while preserving its original meaning.
    Non-toxic means that the sentence should be free of offensive, harmful, or inappropriate language.

    Toxic Sentence: "{sentence}"

    {format_instructions}
    """
).partial(format_instructions=detox_format_instructions)

detox_chain = LLMChain(llm=granite_model, prompt=detox_prompt)

In [17]:
#
#def detoxify_text(row):
#    if row["prediction"] == "toxic":
#        response = detox_chain.invoke({"sentence": row["text"]})
#        raw_text = response["text"].strip()
#
#        if raw_text.startswith("```json") and raw_text.endswith("```"):
#            raw_text = raw_text[7:-3].strip()
#
#        raw_text = re.sub(r'[\x00-\x1F\x7F]', '', raw_text)
#
#        try:
#            parsed_response = json.loads(raw_text)
#            return parsed_response.get("detoxified_text", row["text"])
#        except json.JSONDecodeError as e:
#            print(f"JSON parsing failed for detox: {row['text']}\nError: {e}\nResponse: {raw_text}")
#            return row["text"]
#    else:
#        return row["text"]

In [73]:
def agentive_workflow(text, downstream_task):
    """
    Agentive workflow:
    1. Detect the language of the text
    2. If text not in English, translate to English
    3. Routes the English text to the selected downstream task
    downstream_task must be one of: 'sentiment', 'toxicity', or 'detoxification'.
    """
    # 1. Language detection
    detected_lang = detect_language(text)
    if detected_lang != "eng":
        # 2. Translation step if the language is not English
        # Add 'eng: ' to start of string so that Toucan knows to translate to English
        text = "eng: " + text
        # Call
        input_ids = toucan_tokenizer(text, return_tensors="pt", max_length=1024, truncation=True).to("cuda:0")
        with torch.no_grad():
          generated_ids = toucan_model.generate(**input_ids, num_beams=5, max_new_tokens=len(text), do_sample=True, temperature=0.6, top_p=0.9)
        english_text = toucan_tokenizer.batch_decode(generated_ids, skip_special_tokens=True,  skip_prompt=True)[0]
    else:
        english_text = text

    # 3. Downstream task execution - call the appropriate LLM Chain based on task
    if downstream_task == "sentiment":
        return analyze_sentiment(english_text)
    elif downstream_task == "toxicity":
        return analyze_toxicity(english_text)
    # A few extra steps for detoxification
    elif downstream_task == "detoxification":
        # First, check if the text is toxic
        toxicity_result = analyze_toxicity(english_text)
        if toxicity_result["toxicity"] == "toxic":
            detox_response = detox_chain.invoke({"sentence": english_text})
            raw_text = detox_response["text"].strip()
            # Deal with markdown and handle resulting errors gracefully
            if raw_text.startswith("```json") and raw_text.endswith("```"):
                raw_text = raw_text[7:-3].strip()
            raw_text = re.sub(r'[\x00-\x1F\x7F]', '', raw_text)
            try:
                parsed_response = json.loads(raw_text)
                return {"detoxified_text": parsed_response.get("detoxified_text", english_text)}
            except json.JSONDecodeError as e:
                print(f"JSON parsing failed during detoxification: {e}\nResponse: {raw_text}")
                return {"detoxified_text": english_text}
        else:
            return {"detoxified_text": english_text}
    else:
        return {"error": "Invalid downstream task. Please choose 'sentiment', 'toxicity', or 'detoxification'."}

## Running the data:

In [35]:
# Test run of a single non-english, negative sentiment text

agentive_workflow("Muundo wa tovuti hii ni maridadi, lakini ina mizigo ya matangazo kila ukurasa, inakera sana.", "sentiment")

{'sentiment': 'mixed',
 'explanation': "The sentence contains positive sentiment towards the website's structure, but negative sentiment about intrusive advertisements and potential dangers."}

In [36]:
sentiment_test_path = "/content/drive/MyDrive/MDS/COLX_565/data/Milstone-2-multilingual-sentiment-test-solutions.csv"
m2_multilang_sentiment_df = pd.read_csv(sentiment_test_path)

sentiment_results = []
for sentence in m2_multilang_sentiment_df["sentence"]:
    try:
        response = agentive_workflow(sentence, "sentiment")
        sentiment_results.append(response)
    except Exception as e:
        print(f"Error processing sentence: {sentence}\nError: {e}")
        sentiment_results.append({"sentiment": "unknown", "explanation": "Failed to process."})

m2_multilang_sentiment_df["prediction"] = [res["sentiment"] for res in sentiment_results]
m2_multilang_sentiment_df["explanation"] = [res["explanation"] for res in sentiment_results]

true_sentiment = m2_multilang_sentiment_df["class-label"]
pred_sentiment = m2_multilang_sentiment_df["prediction"]

precision, recall, f1, _ = precision_recall_fscore_support(true_sentiment, pred_sentiment, average="macro")
accuracy = accuracy_score(true_sentiment, pred_sentiment)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [63]:
toxicity_test_path = "/content/drive/MyDrive/MDS/COLX_565/data/Milestone-2-toxic-test-solutions.csv"
m2_toxicity_df = pd.read_csv(toxicity_test_path)


toxicity_results = m2_toxicity_df["text"].apply(analyze_toxicity)
m2_toxicity_df["prediction"] = toxicity_results.apply(lambda x: x["toxicity"])
m2_toxicity_df["explanation"] = toxicity_results.apply(lambda x: x["explanation"])

# compute evaluation metrics
true_toxicity = m2_toxicity_df["source_label"]
pred_toxicity = m2_toxicity_df["prediction"]

tox_precision, tox_recall, tox_f1, _ = precision_recall_fscore_support(true_toxicity, pred_toxicity, average="macro")
tox_accuracy = accuracy_score(true_toxicity, pred_toxicity)

In [69]:
tox_precision, tox_recall, tox_f1, _ = precision_recall_fscore_support(true_toxicity, pred_toxicity, average="macro")
tox_accuracy = accuracy_score(true_toxicity, pred_toxicity)

In [46]:
m2_multilang_sentiment_df.to_csv("sentiment_analysis_results.csv", index=False)
m2_toxicity_df.to_csv("toxicity_analysis_results.csv", index=False)

In [68]:
print("Sentiment Analysis Evaluation:")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

Sentiment Analysis Evaluation:
Accuracy: 0.710
Precision: 0.797
Recall: 0.693
F1 Score: 0.664


In [70]:
print("\nToxicity Detection Evaluation:")
print(f"Accuracy: {tox_accuracy:.3f} ")
print(f"Precision: {tox_precision:.3f}")
print(f"Recall: {tox_recall:.3f}")
print(f"F1 Score: {tox_f1:.3f}")


Toxicity Detection Evaluation:
Accuracy: 0.580 
Precision: 0.465
Recall: 0.492
F1 Score: 0.407


Detoxify:

In [57]:
m2_toxicity_df["detoxified_text"] = m2_toxicity_df["text"].apply(lambda text: agentive_workflow(text, downstream_task="detoxification"))['detoxified_text']

JSON parsing failed during detoxification: Expecting value: line 1 column 1 (char 0)
Response: Response:{  "detoxified_text": "Get lost, you Dutch person."}
JSON parsing failed during detoxification: Expecting value: line 1 column 1 (char 0)
Response: Response:{  "detoxified_text": "2. In reality, he is a protected species, a Democrat, and a member of a racial minority group."}
JSON parsing failed during detoxification: Expecting value: line 1 column 1 (char 0)
Response: Assistant:{  "detoxified_text": "2. #VoteNOonGillum #Gillum: Under scrutiny for alleged anti-American sentiments, Gillum is being investigated by the FBI. Two ongoing investigations and his association with certain political figures have raised questions. This situation bears a resemblance to past controversies, such as those involving former President Obama."}


In [59]:
# Display first few rows of the DataFrame
print(m2_toxicity_df[["text", "prediction", "detoxified_text"]].head())

                                                text prediction  \
0  I know he's an idiot I think he might of been ...      toxic   
1    ==my butthole==  i love buttholes. they tast...      toxic   
2  This is absolute bullshit! We fucking knew thi...      toxic   
3  Moochelle Nobama takes swipe at Trump it sez. ...      toxic   
4  2. \t> Lol, I guessed IPN (the Institue of Nat...      toxic   

                                     detoxified_text  
0  {'detoxified_text': 'I suspect he may have fac...  
1  {'detoxified_text': 'I have a strong affinity ...  
2  {'detoxified_text': 'It's incredibly disappoin...  
3  {'detoxified_text': 'Michelle Obama reportedly...  
4  {'detoxified_text': '2. > I anticipated IPN (t...  
