In [48]:
!pip install shap lime


Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=5428a238faa88108d4ddbb6877b956be4cc7e75d4fb1346d4391f6939379d113
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [74]:
import shap
import numpy as np
from lime.lime_text import LimeTextExplainer
from transformers import pipeline
import torch


In [75]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Directory of the token-classification checkpoint loaded below.
# NOTE(review): absolute "/content/..." path — presumably a Colab runtime;
# confirm the checkpoint exists there before running.
model_dir = "/content/xlmr-ner-amharic"

# Load the model and its tokenizer from the same directory so they stay matched.
model = AutoModelForTokenClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)


In [76]:
from transformers import TokenClassificationPipeline

# Build the NER pipeline from the `model` / `tokenizer` objects bound at the
# time this cell runs.
ner_pipe = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",  # group subwords
    # Use CUDA device 0 when available, otherwise -1 (the recorded run reports CPU).
    device=0 if torch.cuda.is_available() else -1
)

Device set to use cpu


In [77]:
# Sample listing texts used as inputs in the cells below. They mix Amharic and
# English and contain prices, phone numbers, addresses and Telegram handles.
# NOTE(review): the strings embed phone numbers and handles — confirm they are
# fine to share before publishing this notebook.
test_sentences = [
    "የሀገራችን ምርት 🇪🇹 2600 ብር ብቻ 0911871330 ቦሌ መደሐንያለም ኦሮሚያ ህንፃ 1ኛ ፎቅ 104 ቁጥር ኩሩ ጫማ የቴሌግራም አባል ይሁኑ T.me/kuruwear",
    "Shewa Brand,@Shewabrand,1269 under armour CHARGED IMPULSE size 40--45 MADE IN VIETNAM SHEWA BRAND አድራሻ ድሬዳዋ  አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 የቤት ቁጥር 109 እና 110",
    "Made In VIETNAM Size #40 #41 #42 #43  Price: 5200 Br INBOX : @Maraki2211  ስልክ : +251 913321831  አድራሻ - አዲስ አበባ, ሜክሲኮ፡ ከ ኬኬር ህንጻ 50ሜ ወረድ ብሎ አይመን ህንፃ  ግራውንድ ፍሎር ላይ፡ የሱቅ ቁ.012 Maraki Brand™ ┃ማራኪ ብራንድ",
]


In [85]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Update this path if your model is saved elsewhere
model_path = "/content/bert-base-amharic"

# NOTE: these assignments rebind the notebook-level `tokenizer` and `model`
# names that an earlier cell loaded from `model_dir`. Cells executed after this
# one see the objects loaded from `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
# Class-index -> label-string mapping, used by `predict_tokens` to decode predictions.
id2label = model.config.id2label


In [86]:
def predict_tokens(text):
    """
    Tag every tokenizer token of ``text`` with the model's most likely label.

    Uses the notebook-level ``tokenizer``, ``model`` and ``id2label``.

    Args:
        text: Raw input string; it is tokenized with truncation enabled.

    Returns:
        A list of ``(token, label)`` tuples, one per token produced by the
        tokenizer. Tokens are returned exactly as the tokenizer emits them
        (the recorded output includes ``[CLS]``/``[SEP]`` and ``##`` pieces).
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, is_split_into_words=False)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Highest-scoring class per token for the single sequence in the batch.
    label_ids = logits.argmax(dim=2)[0].tolist()
    pieces = tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])

    return [(piece, id2label[label_id]) for piece, label_id in zip(pieces, label_ids)]


In [87]:
# NOTE(review): this literal repeats the `test_sentences` list assigned in an
# earlier cell; it is kept here so the cell runs on its own.
test_sentences = [
"የሀገራችን ምርት 🇪🇹 2600 ብር ብቻ 0911871330 ቦሌ መደሐንያለም ኦሮሚያ ህንፃ 1ኛ ፎቅ 104 ቁጥር ኩሩ ጫማ የቴሌግራም አባል ይሁኑ T.me/kuruwear",
    "Shewa Brand,@Shewabrand,1269 under armour CHARGED IMPULSE size 40--45 MADE IN VIETNAM SHEWA BRAND አድራሻ ድሬዳዋ  አሸዋ ሚና 1 ፎቅ እንገኛለን ስልክ 0987336458 የቤት ቁጥር 109 እና 110",
    "Made In VIETNAM Size #40 #41 #42 #43  Price: 5200 Br INBOX : @Maraki2211  ስልክ : +251 913321831  አድራሻ - አዲስ አበባ, ሜክሲኮ፡ ከ ኬኬር ህንጻ 50ሜ ወረድ ብሎ አይመን ህንፃ  ግራውንድ ፍሎር ላይ፡ የሱቅ ቁ.012 Maraki Brand™ ┃ማራኪ ብራንድ",
]

# Print one "token  label" line per token for every sentence.
for sent in test_sentences:
    print(f"\n📌 Sentence: {sent}")
    for token, label in predict_tokens(sent):
        # Left-align the token in a 15-character column so labels line up.
        print(f"{token:15} {label}")



📌 Sentence: የሀገራችን ምርት 🇪🇹 2600 ብር ብቻ 0911871330 ቦሌ መደሐንያለም ኦሮሚያ ህንፃ 1ኛ ፎቅ 104 ቁጥር ኩሩ ጫማ የቴሌግራም አባል ይሁኑ T.me/kuruwear
[CLS]           O
የሀገራችን          O
ምርት             O
🇪🇹              O
260             O
##0             O
ብር              O
ብቻ              O
09              O
##11            O
##871           I-Product
##33            O
##0             O
ቦሌ              O
መደ              O
##ሐን            O
##ያለ            O
##ም             O
ኦሮሚያ            I-LOC
ህንፃ             I-LOC
1ኛ              I-LOC
ፎቅ              I-LOC
104             O
ቁጥር             O
ኩሩ              I-LOC
ጫማ              I-LOC
የቴሌ             O
##ግራም           I-LOC
አባል             O
ይሁኑ             I-LOC
T               O
.               B-Product
me              O
/               O
k               O
##uru           I-Product
##we            O
##ar            O
[SEP]           I-LOC

📌 Sentence: Shewa Brand,@Shewabrand,1269 under armour CHARGED IMPULSE size 40--45 MADE IN VIETNAM SHEWA BRAND አድራሻ ድሬዳዋ

In [88]:
from IPython.display import display, HTML

def render_colored_ner(tokens_labels):
    """
    Display (token, label) pairs as HTML with entity tokens highlighted.

    Args:
        tokens_labels: Iterable of ``(token, label)`` pairs, for example the
            list returned by ``predict_tokens``.

    Returns:
        None. The markup is shown with ``display(HTML(...))``.

    Tokens labelled ``"O"`` are emitted as plain text. Every other token is
    wrapped in a ``<span>`` whose background colour depends on its label;
    labels missing from the palette fall back to grey (``#DDDDDD``).
    """
    # Local import: keeps the function self-contained and avoids clashing with
    # a local variable named after the stdlib `html` module.
    from html import escape

    # Built once per call instead of once per token.
    label_colors = {
        "B-Product": "#FFD700",   # gold
        "I-Product": "#FFE066",
        "B-PRICE": "#00BFFF",     # deep sky blue
        "I-PRICE": "#87CEFA",
        "B-LOC": "#90EE90",       # light green
        "I-LOC": "#B2F2BB"
    }

    pieces = []
    for token, label in tokens_labels:
        # Strip the "▁" word-boundary marker used by SentencePiece-style
        # tokenizers, then escape the token: the text comes from raw listings
        # and may contain "<", ">" or "&", which would otherwise be parsed as
        # markup and corrupt the rendering.
        clean_token = escape(token.replace("▁", ""))
        if label == "O":
            pieces.append(f"{clean_token} ")
        else:
            color = label_colors.get(label, "#DDDDDD")
            pieces.append(
                f"<span style='background-color:{color}; padding:2px; border-radius:3px'>{clean_token}</span> "
            )
    display(HTML("".join(pieces)))

# Try it: predict labels for the first test sentence and render them as
# highlighted HTML.
tokens_labels = predict_tokens(test_sentences[0])
render_colored_ner(tokens_labels)


In [89]:
# NOTE(review): same two statements as the demo at the end of the previous
# cell (first test sentence); this cell appears to be a repeat.
tokens_labels = predict_tokens(test_sentences[0])
render_colored_ner(tokens_labels)


# Task
Explain the error in the provided Python code for using SHAP with a Hugging Face `TokenClassificationPipeline`, fix the error by defining a custom output transformation function to make the pipeline output compatible with SHAP, and then generate and visualize SHAP values using the corrected code.

## Define a custom output transform function

### Subtask:
Create a Python function that takes the output of the `TokenClassificationPipeline` and restructures it to include 'label' and 'score' keys in a format compatible with SHAP.


**Reasoning**:
Define the `output_transform` function as described in the instructions to restructure the pipeline output.



In [65]:
def output_transform(pipeline_output):
    """
    Reduce TokenClassificationPipeline output to ``label``/``score`` pairs.

    Args:
        pipeline_output: Aggregated pipeline output. Either a list with one
            inner list of entity dicts per sentence, or a flat list of entity
            dicts (the shape produced for a single input string). Each entity
            dict must provide ``'entity_group'`` and ``'score'``.

    Returns:
        A list with one inner list per sentence; each inner list holds one
        ``{'label': ..., 'score': ...}`` dict per entity. A flat single-sentence
        input is treated as one sentence, so the result is always nested.

    Raises:
        KeyError: If an entity dict lacks ``'entity_group'`` or ``'score'``.

    NOTE(review): in this notebook's recorded runs, passing this function as
    ``output_transform=`` to ``shap.Explainer`` did not prevent
    ``KeyError: 'label'``.
    """
    # A single input string yields a flat list of entity dicts; wrap it so the
    # nested comprehension below sees the same shape as for batched input.
    if isinstance(pipeline_output, list) and pipeline_output and isinstance(pipeline_output[0], dict):
        pipeline_output = [pipeline_output]

    return [
        [
            {'label': entity['entity_group'], 'score': entity['score']}
            for entity in sentence_output
        ]
        for sentence_output in pipeline_output
    ]

## Initialize shap explainer with the custom transform

### Subtask:
Instantiate the `shap.Explainer` and pass the custom `output_transform` function to it.


**Reasoning**:
Initialize the SHAP Explainer with the NER pipeline and the custom output transform function.



In [66]:
# First attempt: hand the NER pipeline to SHAP and forward `output_transform`
# as a keyword argument.
# NOTE(review): in the runs recorded in this notebook, calling the resulting
# explainer raised KeyError: 'label', so this keyword did not make the pipeline
# output acceptable to SHAP.
explainer = shap.Explainer(ner_pipe, output_transform=output_transform)

## Generate shap values

### Subtask:
Generate SHAP values for the test sentences using the initialized SHAP explainer.


**Reasoning**:
Generate SHAP values for the test sentences using the initialized SHAP explainer.



In [67]:
# Run the explainer over every test sentence.
# NOTE(review): the recorded output of this cell is KeyError: 'label'.
shap_values = explainer(test_sentences)

KeyError: 'label'

## Generate shap values retry 1

### Subtask:
Generate SHAP values for the test sentences using the initialized SHAP explainer.


**Reasoning**:
Generate SHAP values for the test sentences using the initialized SHAP explainer.



In [71]:
# Retry of the same call with the same explainer.
# NOTE(review): the recorded output is again KeyError: 'label'.
shap_values = explainer(test_sentences)

KeyError: 'label'

## Summary:

### Data Analysis Key Findings

*   The initial attempt to generate SHAP values with the `TokenClassificationPipeline` resulted in a `KeyError: 'label'`.
*   A custom `output_transform` function was defined to restructure the pipeline's output to include 'label' and 'score' keys, aiming for compatibility with SHAP.
*   The `shap.Explainer` was initialized with the pipeline and the custom `output_transform` function.
*   Despite the custom transformation, subsequent attempts to generate SHAP values still resulted in a `KeyError: 'label'`.

### Insights or Next Steps

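*   The `KeyError: 'label'` persists even with the custom output transformation. The most likely cause is that `shap.Explainer` wraps a Hugging Face pipeline in its own adapter (`shap.models.TransformersPipeline`), which reads `label` and `score` directly from the raw pipeline output — the text-classification format. A `TokenClassificationPipeline` with `aggregation_strategy="simple"` returns `entity_group` instead of `label`, and the `output_transform` keyword is not applied to the pipeline output before that lookup, so the transformation never takes effect.
*   Further debugging should confirm this by examining the exact structure of the pipeline output and the input format the SHAP explainer expects. A likely fix is to pass SHAP a plain function that calls the pipeline and returns a fixed-width numeric array (for example, one score per entity type), together with a text masker built from the pipeline's tokenizer, instead of passing the pipeline object itself.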


## Define a custom output transform function

### Subtask:
Create a Python function that takes the output of the `TokenClassificationPipeline` and restructures it to include 'label' and 'score' keys in a format compatible with SHAP.

**Reasoning**:
Define the `output_transform` function as described in the instructions to restructure the pipeline output.

In [70]:
def output_transform(pipeline_output):
    """
    Reduce TokenClassificationPipeline output to ``label``/``score`` pairs.

    Args:
        pipeline_output: Aggregated pipeline output. Either a list with one
            inner list of entity dicts per sentence, or a flat list of entity
            dicts (the shape produced for a single input string). Each entity
            dict must provide ``'entity_group'`` and ``'score'``.

    Returns:
        A list with one inner list per sentence; each inner list holds one
        ``{'label': ..., 'score': ...}`` dict per entity. A flat single-sentence
        input is treated as one sentence, so the result is always nested.

    Raises:
        KeyError: If an entity dict lacks ``'entity_group'`` or ``'score'``.

    NOTE(review): in this notebook's recorded runs, passing this function as
    ``output_transform=`` to ``shap.Explainer`` did not prevent
    ``KeyError: 'label'``.
    """
    # A single input string yields a flat list of entity dicts; wrap it so the
    # nested comprehension below sees the same shape as for batched input.
    if isinstance(pipeline_output, list) and pipeline_output and isinstance(pipeline_output[0], dict):
        pipeline_output = [pipeline_output]

    return [
        [
            {'label': entity['entity_group'], 'score': entity['score']}
            for entity in sentence_output
        ]
        for sentence_output in pipeline_output
    ]

## Initialize shap explainer with the custom transform

### Subtask:
Instantiate the `shap.Explainer` and pass the custom `output_transform` function to it.

**Reasoning**:
Initialize the SHAP Explainer with the NER pipeline and the custom output transform function.

In [None]:
# Passing `ner_pipe` itself to `shap.Explainer` is what produced the
# KeyError: 'label' recorded earlier in this notebook: the aggregated
# token-classification output carries `entity_group`, not `label`, and the
# `output_transform` keyword did not change that. SHAP is therefore given a
# plain function that returns a fixed-width numeric array, plus a text masker.

# Entity groups the pipeline can report: the model's labels with a leading
# "B-"/"I-" removed and the "O" (outside) tag dropped.
NER_ENTITY_GROUPS = sorted({
    label[2:] if label.startswith(("B-", "I-")) else label
    for label in ner_pipe.model.config.id2label.values()
    if label != "O"
})
_GROUP_TO_COLUMN = {group: col for col, group in enumerate(NER_ENTITY_GROUPS)}


def ner_group_scores(texts):
    """
    Score each text per entity group so SHAP receives a numeric matrix.

    Args:
        texts: Sequence of strings (SHAP passes masked variants of the input).

    Returns:
        ``numpy.ndarray`` of shape ``(len(texts), len(NER_ENTITY_GROUPS))``.
        Entry ``[i, j]`` is the highest pipeline score among entities of group
        ``NER_ENTITY_GROUPS[j]`` found in ``texts[i]``, or 0.0 if none was found.
    """
    scores = np.zeros((len(texts), len(NER_ENTITY_GROUPS)))
    for row, text in enumerate(texts):
        text = str(text)  # SHAP may hand over numpy string scalars
        if not text.strip():
            continue  # fully masked input: nothing to tag
        for entity in ner_pipe(text):
            col = _GROUP_TO_COLUMN.get(entity["entity_group"])
            if col is not None:
                scores[row, col] = max(scores[row, col], float(entity["score"]))
    return scores


# Use the pipeline's own tokenizer for masking: the notebook-level `tokenizer`
# name is rebound by a later model-loading cell.
explainer = shap.Explainer(
    ner_group_scores,
    shap.maskers.Text(ner_pipe.tokenizer),
    output_names=NER_ENTITY_GROUPS,
)