## Setup

Mount google drive if on colab:

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Download modules:

In [5]:
!pip install langchain langchain_huggingface langdetect

Collecting langchain_huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain_huggingface)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain_huggingface)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=2.6.0->langchain_huggingface)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metad

Imports:

In [39]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.schema.runnable import RunnableParallel

from langdetect import detect, DetectorFactory
import langdetect.lang_detect_exception


import pandas as pd
import json
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from concurrent.futures import ThreadPoolExecutor

We load the data below in the pipeline implementation.

Models:

We use the UBC Toucan model for translation:

To translate using Toucan models, we use the target language ISO-3 code as preix.

In [7]:
lang_names={
    "aar": "Afar",
    "ach": "Acholi",
    "afr": "Afrikaans",
    "aka": "Akan",
    "amh": "Amharic",
    "bam": "Bambara",
    "bas": "Basaa",
    "bem": "Bemba",
    "btg": "Bete Gagnoa",
    "eng": "English",
    "ewe": "Ewe",
    "fon": "Fon",
    "fra": "French",
    "hau": "Hausa",
    "ibo": "Igbo",
    "kbp": "Kabiye",
    "lgg": "Lugbara",
    "lug": "Luganda",
    "mlg": "Malagasy",
    "nyn": "Nyakore",
    "orm": "Oromo",
    "som": "Somali",
    "sot": "Sesotho",
    "swa": "Swahili",
    "tir": "Tigrinya",
    "yor": "Yoruba",
    "teo": "Ateso",
    "gez": "Geez",
    "wal": "Wolaytta",
    "fan": "Fang",
    "kau": "Kanuri",
    "kin": "Kinyawanda",
    "kon": "Kongo",
    "lin": "Lingala",
    "nya": "Chichewa",
    "pcm": "Nigerian Pidgin",
    "ssw": "Siswati",
    "tsn": "Setswana",
    "tso": "Tsonga",
    "twi": "Twi",
    "wol": "Wolof",
    "xho": "Xhosa",
    "zul": "Zulu",
    "nnb": "Nande",
    "swc": "Swahili Congo",
    "ara": "Arabic"
}


In [28]:
translation_model = HuggingFacePipeline.from_model_id(
    model_id="UBC-NLP/toucan-base",
    task="translation",
    pipeline_kwargs={
        "do_sample": True,
        "max_new_tokens": 300,
        "max_length": 512
    }
)

Device set to use cuda:0


For the rest of the pipeline, we use the IBM Granite model:

In [51]:
granite_model = HuggingFacePipeline.from_model_id(
    model_id="ibm-granite/granite-3.2-2b-instruct",
    task="text-generation",
    pipeline_kwargs={
        "return_full_text": False,
        "do_sample": False,
        "max_new_tokens": 200
    }
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 26.12 MiB is free. Process 17908 has 14.71 GiB memory in use. Of the allocated memory 14.39 GiB is allocated by PyTorch, and 152.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Language Detection & Translation

In [18]:
DetectorFactory.seed = 0

langdetect_to_toucan = {
    "en": "eng", "fr": "fra", "sw": "swa", "yo": "yor", "ha": "hau",
    "ig": "ibo", "ar": "ara", "xh": "xho", "zu": "zul", "am": "amh",
    "so": "som", "st": "sot", "rw": "kin", "lg": "lug", "ln": "lin",
    "sn": "tsn", "ss": "ssw", "ny": "nya", "mg": "mlg", "om": "orm",
    "ti": "tir", "nso": "tsn", "tn": "tsn", "ts": "tso", "tw": "twi",
    "wo": "wol", "kg": "kon", "ee": "ewe", "ff": "fan", "pcm": "pcm",
    "arq": "ara", "ffm": "fan", "kab": "kab", "bm": "bam", "nyn": "nyn"
}

def detect_language(text):
    try:
        detected_lang = detect(text)
        return langdetect_to_toucan.get(detected_lang, "unknown")
    except langdetect.lang_detect_exception.LangDetectException:
        return "unknown"

def detect_and_append_language(df, text_column="sentence"):
    df["detected_language"] = df[text_column].apply(detect_language)
    return df

Language detection chain:

In [19]:
def detect_language_chain(text):
    detected_lang = detect_language(text)
    return {"sentence": text, "detected_language": detected_lang}

Translation Framework:

In [21]:
translation_format_instructions = """
Return only a valid JSON object with the following keys:
{
  "translated_text": "The translated English text of the given input."
}
"""

translation_prompt = ChatPromptTemplate.from_template(
    """
    You are an expert in multilingual translation.
    Translate the following sentence into English using the given source language.

    Source Language: "{source_language}"
    Sentence: "{sentence}"

    {format_instructions}
    """
).partial(format_instructions=translation_format_instructions)

translation_chain = LLMChain(llm=translation_model, prompt=translation_prompt)

  translation_chain = LLMChain(llm=translation_model, prompt=translation_prompt)


Translation if needed chain:

In [29]:
def process_language_translation(df, text_column="sentence"):
    df["detected_language"] = df[text_column].apply(detect_language)

    def translate_if_needed(row):
        if row["detected_language"] == "eng":
            return row[text_column]
        else:
            response = translation_chain.invoke(
                "source_language": row["detected_language"],
                "sentence": row[text_column]
            })
            return eval(response)["translated_text"]

    df["translated_text"] = df.apply(translate_if_needed, axis=1)
    return df

sample_df = m2_multilang_sentiment_df.sample(5, random_state=42)
processed_df = process_language_translation(sample_df)

import ace_tools as tools
tools.display_dataframe_to_user(name="Processed Language Data", dataframe=processed_df)

Your input_length: 80 is bigger than 0.9 * max_length: 20. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)


ValueError: The following `model_kwargs` are not used by the model: ['return_full_text'] (note: typos in the generate arguments will also show up in this list)

## Sentiment Analysis Setup

In [33]:
format_instructions = """
Return only a valid JSON object with the following keys:
{
  "sentiment": "positive" | "mixed" | "negative",
  "explanation": "A brief reason for this classification."
}
"""

sentiment_prompt = ChatPromptTemplate.from_template(
    """
    You are an expert in sentiment analysis.
    Analyze the following sentence and classify it as 'positive', 'negative', or 'mixed'.
    Then, explain your reasoning in 25 words or less.

    Sentence: "{sentence}"

    {format_instructions}
    """
).partial(format_instructions=format_instructions)

sentiment_chain = LLMChain(llm=granite_model, prompt=sentiment_prompt)

In [34]:
def analyze_sentiment(sentence):
    response = sentiment_chain.invoke({"sentence": sentence})
    parsed_response = json.loads(response["text"])
    return {"sentiment": parsed_response["sentiment"], "explanation": parsed_response["explanation"]}

## Toxicity Analysis Setup

In [36]:
toxic_format_instructions = """
Return a JSON object with the following keys:
- "toxicity": (toxic, non-toxic)
- "explanation": (a brief explanation for why the toxicity was classified this way)
Output only the JSON object. Do ot add extra characters or whitespace.
"""

toxicity_prompt = ChatPromptTemplate.from_template(
    """
    You are an expert in toxicity analysis.
    Analyze the following sentence and classify it as 'toxic'or 'non-toxic'.
    Then, explain your reasoning in 25 words or less.

    Sentence: "{sentence}"

    {toxic_format_instructions}
    """
).partial(toxic_format_instructions=toxic_format_instructions)

toxicity_chain = LLMChain(llm=granite_model, prompt=toxicity_prompt)

In [47]:
def analyze_toxicity(sentence):
    response = toxicity_chain.invoke({"sentence": sentence})

    try:
        parsed_response = json.loads(response["text"])  # Attempt to parse JSON
        return {
            "toxicity": parsed_response.get("toxicity", "unknown"),
            "explanation": parsed_response.get("explanation", "No explanation provided.")
        }
    except json.JSONDecodeError as e:
        print(f"JSON parsing failed for sentence: {sentence}\nError: {e}\nResponse: {response['text']}")
        return {"toxicity": "unknown", "explanation": "Malformed response."}  # Safe fallback

## Running the data:

In [43]:
sentiment_test_path = "/content/drive/MyDrive/MDS/COLX_565/data/Milstone-2-multilingual-sentiment-test-solutions.csv"
m2_multilang_sentiment_df = pd.read_csv(sentiment_test_path)

sentiment_results = []
for sentence in m2_multilang_sentiment_df["sentence"]:
    try:
        response = analyze_sentiment(sentence)
        sentiment_results.append(response)
    except Exception as e:
        print(f"Error processing sentence: {sentence}\nError: {e}")
        sentiment_results.append({"sentiment": "unknown", "explanation": "Failed to process."})

m2_multilang_sentiment_df["prediction"] = [res["sentiment"] for res in sentiment_results]
m2_multilang_sentiment_df["explanation"] = [res["explanation"] for res in sentiment_results]

true_sentiment = m2_multilang_sentiment_df["class-label"]
pred_sentiment = m2_multilang_sentiment_df["prediction"]

precision, recall, f1, _ = precision_recall_fscore_support(true_sentiment, pred_sentiment, average="macro")
accuracy = accuracy_score(true_sentiment, pred_sentiment)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Error processing sentence: Ni kipaji gani! Hiki kipikiosafi kinafanya kazi poa sana, lakini inachukua muda mrefu kuwasha.
Error: Expecting value: line 2 column 1 (char 1)
Error processing sentence: Ninapenda ubora wa picha, ila simu hii inajifunga ghafla, sijui tatizo ni nini.
Error: Expecting value: line 2 column 1 (char 1)
Error processing sentence: Kweli, kozi hii ya mtandaoni inavutia, lakini ada zake ni kali mno, sijui kama inastahili.
Error: Expecting value: line 2 column 1 (char 1)
Error processing sentence: Nashukuru huduma ya haraka, lakini chakula kimefika kikiwa baridi, si tamu kamwe.
Error: Expecting value: line 2 column 1 (char 1)
Error processing sentence: Haya magari mapya ni mazuri, ila bei yake 'inaniwasha kichwa' vibaya sana.
Error: Expecting value: line 2 column 1 (char 1)
Error processing sentence: Nilifikiri itakuwa mbovu, kumbe app hii ya mazoezi inanifanya nihisi nipo gym halisi, nimeipenda!
Error: Expecting value: line 2 column 1 (char 1)
Error processing senten

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
toxicity_test_path = "/content/drive/MyDrive/MDS/COLX_565/data/Milestone-2-toxic-test-solutions.csv"
m2_toxicity_df = pd.read_csv(toxicity_test_path)

# apply toxicity analysis, then add results to df

toxicity_results = m2_toxicity_df["text"].apply(analyze_toxicity)
m2_toxicity_df["prediction"] = toxicity_results.apply(lambda x: x["toxicity"])
m2_toxicity_df["explanation"] = toxicity_results.apply(lambda x: x["explanation"])

# compute evaluation metrics
true_toxicity = m2_toxicity_df["source_label"]
pred_toxicity = m2_toxicity_df["prediction"]

tox_precision, tox_recall, tox_f1, _ = precision_recall_fscore_support(true_sentiment, pred_sentiment, average="macro")
tox_accuracy = accuracy_score(true_toxicity, pred_toxicity)

JSON parsing failed for sentence: 2. 	> Lol, I guessed IPN (the Institue of National Remembrance) was going to be mentioned  From what I understood from their message on IPN's page, they actually went after that cunt that modified the trailer (and has two more cunty videos on his channel), not after the actual game itself.  The journalists however - especially the rightwing ones - seen IPN's message and went after everything. No surprise there, lol.  >  Germans are the last people who would make that mistake with the phrasing.  You would think so. Sadly, you would be wrong.  - http://wyborcza.pl/1,76842,6928930,_Polish_Camps__in_Polish_Court.html - https://fakty.interia.pl/news-byly-wiezien-auschwitz-skarzy-zdf-za-polskie-obozy,nId,999725
Error: Unterminated string starting at: line 4 column 18 (char 43)
Response: 
{
  "toxicity": "toxic",
  "explanation": "The sentence contains personal attacks ('cunt', 'two more cunty videos'), disparagement of others' actions, and dismissive languag

KeyError: 'label'

In [49]:
m2_multilang_sentiment_df.to_csv("sentiment_analysis_results.csv", index=False)
m2_toxicity_df.to_csv("toxicity_analysis_results.csv", index=False)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [50]:
print("Sentiment Analysis Evaluation:")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f} (Main Metric)")

print("\nToxicity Detection Evaluation:")
print(f"Accuracy: {tox_accuracy:.3f} (Main Metric)")
print(f"Precision: {tox_precision:.3f}")
print(f"Recall: {tox_recall:.3f}")
print(f"F1 Score: {tox_f1:.3f}")

Sentiment Analysis Evaluation:
Accuracy: 0.390
Precision: 0.722
Recall: 0.339
F1 Score: 0.424 (Main Metric)

Toxicity Detection Evaluation:
Accuracy: 0.480 (Main Metric)
Precision: 0.722
Recall: 0.339
F1 Score: 0.424
