In [1]:
!pip install shap lime



In [2]:
!pip install transformers



In [3]:
import shap
import lime
import lime.lime_text
import numpy as np
import pandas as pd
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
model_name = "/content/drive/My Drive/fine_tuned_model"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
model = XLMRobertaForTokenClassification.from_pretrained(model_name)

In [6]:
model.eval()

XLMRobertaForTokenClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bi

In [7]:
sample_sentences = [
    "John Doe is a software engineer at OpenAI.",
    "The capital of France is Paris.",
    "Apple is looking at buying U.K. startup for $1 billion."
]

# Tokenize input
inputs = tokenizer(sample_sentences, return_tensors="pt", padding=True, truncation=True)

In [8]:
def predict_fn(texts):
    # Convert input to a list if it is a single string
    if isinstance(texts, str):
        texts = [texts]

    print(f"Input texts: {texts}")  # Debugging line

    # Tokenize the input texts
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

    print(f"Tokenized inputs: {inputs}")  # Debugging line

    # Use the model to make predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Get logits and apply softmax
    logits = outputs.logits.detach().numpy()
    probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)

    print(f"Logits: {logits}")  # Debugging line
    print(f"Probabilities: {probabilities}")  # Debugging line

    return probabilities

In [9]:
# Implement SHAP
explainer = shap.Explainer(predict_fn, tokenizer)
shap_values = explainer(sample_sentences)

Input texts: ['<mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>']


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [10]:
# Create a LIME text explainer
lime_explainer = lime.lime_text.LimeTextExplainer(class_names=model.config.id2label.values())

# Explain the first sample
exp = lime_explainer.explain_instance(sample_sentences[0], predict_fn, num_features=10)
exp.show_in_notebook(text=sample_sentences[0])

Input texts: ['John Doe is a software engineer at OpenAI.', 'John  is   engineer at .', '       .', '       OpenAI.', 'John Doe is a  engineer at OpenAI.', 'John  is a software   .', 'John  is a software engineer at .', 'John       .', 'John    software   OpenAI.', '  is  software   OpenAI.', ' Doe  a software   .', 'John Doe     at .', 'John  is a software  at OpenAI.', '   a    .', 'John  is a  engineer  .', 'John Doe is a software   .', 'John     engineer at .', 'John Doe   software engineer at .', '    software engineer at .', 'John  is a  engineer at .', 'John  is    at OpenAI.', '      at .', '       .', 'John Doe is  software engineer  OpenAI.', 'John   a    .', 'John  is     OpenAI.', '       .', '  is a software engineer  .', 'John Doe is a software engineer at .', '    software engineer at .', 'John Doe is a software engineer  OpenAI.', '       .', ' Doe is a software engineer at OpenAI.', '    software   .', '     engineer  .', ' Doe is     .', '       .', ' Doe  a software 

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
def analyze_difficult_cases(predictions, true_labels):
    for pred, true in zip(predictions, true_labels):
        if pred != true:
            print(f"Predicted: {pred}, True: {true}")


In [None]:
def generate_report():
    # Create a DataFrame to store results
    report_df = pd.DataFrame({
        'Sentence': sample_sentences,
        'Predictions': predictions,
        'True Labels': true_labels,
        'Difficult Case': [pred != true for pred, true in zip(predictions, true_labels)]
    })

    # Save the report
    report_df.to_csv('ner_model_report.csv', index=False)
    print("Report generated: ner_model_report.csv")
