In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch
import pandas as pd
from datasets import Dataset
import numpy as np
from transformers import pipeline
from transformers_interpret import MultiLabelClassificationExplainer

# Interpreting the model trained on fake data and tested on real data

In [2]:
dataset_filename = "../preprocessing/data/processed_merge.csv"
# dataset_filename = "../linguistic_model/data/synthetic_clan_train_new.csv"
data = pd.read_csv(dataset_filename, encoding='utf8', index_col=False)

In [3]:
model_name = "models/bert_finetuned_reverse"
tokenizer_name = "models/bert_finetuned_reverse"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, return_tensor="pt")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = pipeline("text-classification", model=model_name, tokenizer=tokenizer_name, device=device)
cls_explainer = MultiLabelClassificationExplainer(model, tokenizer)

In [4]:
model.config.id2label

{0: 'CONTROL', 1: 'BROCA'}

In [5]:
dataset = Dataset.from_pandas(data)
def preprocess_function(examples):
    return tokenizer(examples["preprocessed_text"], padding=True, truncation=True, return_tensors="pt")
tokenized_data = dataset.map(preprocess_function, batched=True).with_format("torch")
tokenized_data_split = tokenized_data.train_test_split(test_size=0.2, seed=42)

Map:   0%|          | 0/57051 [00:00<?, ? examples/s]

In [6]:
num = 0
n = 10
for text, label in zip(list(tokenized_data_split["test"]["preprocessed_text"]), np.array(tokenized_data_split["test"]["label"])): 
    if num != n:
        print("True: ", model.config.id2label[label], "Pred: ", pipe(text)[0]["label"])
        print("Text: ", text)
        cls_explainer(text)
        cls_explainer.visualize()
        print("----------------------------------------------------------------------")
        num += 1

True:  CONTROL Pred:  CONTROL
Text:  i usually use a spoon for my jelly.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.98),CONTROL,1.74,[CLS] i usually use a spoon for my jelly . [SEP]
,,,,
,(0.03),BROCA,-1.97,[CLS] i usually use a spoon for my jelly . [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  the one didn't take the umbrella.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.67),CONTROL,1.11,[CLS] the one didn ' t take the umbrella . [SEP]
,,,,
,(0.40),BROCA,-0.98,[CLS] the one didn ' t take the umbrella . [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  get a scoop with that.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.97),CONTROL,1.91,[CLS] get a scoop with that . [SEP]
,,,,
,(0.04),BROCA,-1.96,[CLS] get a scoop with that . [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  they're riding in a boat.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.97),CONTROL,1.58,[CLS] they ' re riding in a boat . [SEP]
,,,,
,(0.04),BROCA,-1.75,[CLS] they ' re riding in a boat . [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  they didn't want her on the scene.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.98),CONTROL,1.81,[CLS] they didn ' t want her on the scene . [SEP]
,,,,
,(0.04),BROCA,-1.97,[CLS] they didn ' t want her on the scene . [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  so she hangs out with all her little animal friends and is up all night with her little mice friends making this beautiful dress for the ball.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.99),CONTROL,1.95,[CLS] so she hangs out with all her little animal friends and is up all night with her little mice friends making this beautiful dress for the ball . [SEP]
,,,,
,(0.02),BROCA,-2.39,[CLS] so she hangs out with all her little animal friends and is up all night with her little mice friends making this beautiful dress for the ball . [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  and really it it was it was sort of traumatic for me.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.99),CONTROL,1.57,[CLS] and really it it was it was sort of traumatic for me . [SEP]
,,,,
,(0.03),BROCA,-1.89,[CLS] and really it it was it was sort of traumatic for me . [SEP]
,,,,


----------------------------------------------------------------------
True:  BROCA Pred:  CONTROL
Text:  and cold cold cold oh cold.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.83),CONTROL,0.95,[CLS] and cold cold cold oh cold . [SEP]
,,,,
,(0.20),BROCA,-1.11,[CLS] and cold cold cold oh cold . [SEP]
,,,,


----------------------------------------------------------------------
True:  BROCA Pred:  BROCA
Text:  and they one one to.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.03),CONTROL,-1.91,[CLS] and they one one to . [SEP]
,,,,
,(0.95),BROCA,1.94,[CLS] and they one one to . [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  you spread that on the other piece of bread.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.98),CONTROL,1.32,[CLS] you spread that on the other piece of bread . [SEP]
,,,,
,(0.04),BROCA,-1.51,[CLS] you spread that on the other piece of bread . [SEP]
,,,,


----------------------------------------------------------------------


# Interpreting the model trained on real data and tested on fake data

In [7]:
# dataset_filename = "../preprocessing/data/processed_merge.csv"
dataset_filename = "../linguistic_model/data/synthetic_clan_train_new.csv"
data = pd.read_csv(dataset_filename, encoding='utf8', index_col=False)

In [8]:
model_name = "models/bert_finetuned"
tokenizer_name = "models/bert_finetuned"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, return_tensor="pt")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = pipeline("text-classification", model=model_name, tokenizer=tokenizer_name, device=device)
cls_explainer = MultiLabelClassificationExplainer(model, tokenizer)

In [9]:
model.config.id2label

{0: 'CONTROL', 1: 'BROCA'}

In [10]:
dataset = Dataset.from_pandas(data)
def preprocess_function(examples):
    return tokenizer(examples["modified"], padding=True, truncation=True, return_tensors="pt")
tokenized_data = dataset.map(preprocess_function, batched=True).with_format("torch")
tokenized_data_split = tokenized_data.train_test_split(test_size=0.2, seed=42)

Map:   0%|          | 0/49850 [00:00<?, ? examples/s]

In [11]:
num = 0
n = 10
for text, label in zip(list(tokenized_data_split["test"]["modified"]), np.array(tokenized_data_split["test"]["label"])): 
    if num != n:
        print("True: ", model.config.id2label[label], "Pred: ", pipe(text)[0]["label"])
        print("Text: ", text)
        cls_explainer(text)
        cls_explainer.visualize()
        print("----------------------------------------------------------------------")
        num += 1

True:  CONTROL Pred:  CONTROL
Text:  i love documentaries.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.65),BROCA,1.26,[CLS] i love documentaries . [SEP]
,,,,
,(0.37),CONTROL,-1.29,[CLS] i love documentaries . [SEP]
,,,,


----------------------------------------------------------------------
True:  BROCA Pred:  CONTROL
Text:  plods on managing to get my in harts.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.77),BROCA,1.38,[CLS] pl ##ods on managing to get my in hart ##s . [SEP]
,,,,
,(0.25),CONTROL,-1.35,[CLS] pl ##ods on managing to get my in hart ##s . [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  murder by numbers does not take place in a single room,like the rope,mind you.and,what a supreme originality,it pits two cops against the evil youngstersand,you would never guess it,these two cops are very differentactually,bullock plays the part of woman living like a man,and her partner chaplin is as shy as a clueless girlie.the two boys' performances are not really mind boggling,not as good,as,say,that of edward norton in primal fear.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.97),BROCA,6.29,"[CLS] murder by numbers does not take place in a single room , like the rope , mind you . and , what a supreme original ##ity , it pits two cops against the evil young ##sters ##and , you would never guess it , these two cops are very different ##act ##ually , bullock plays the part of woman living like a man , and her partner chaplin is as shy as a clue ##less girl ##ie . the two boys ' performances are not really mind bog ##gling , not as good , as , say , that of edward norton in primal fear . [SEP]"
,,,,
,(0.05),CONTROL,-6.59,"[CLS] murder by numbers does not take place in a single room , like the rope , mind you . and , what a supreme original ##ity , it pits two cops against the evil young ##sters ##and , you would never guess it , these two cops are very different ##act ##ually , bullock plays the part of woman living like a man , and her partner chaplin is as shy as a clue ##less girl ##ie . the two boys ' performances are not really mind bog ##gling , not as good , as , say , that of edward norton in primal fear . [SEP]"
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  i guess he wanted out of this film too.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.92),BROCA,2.57,[CLS] i guess he wanted out of this film too . [SEP]
,,,,
,(0.10),CONTROL,-2.55,[CLS] i guess he wanted out of this film too . [SEP]
,,,,


----------------------------------------------------------------------
True:  BROCA Pred:  CONTROL
Text:  who would to see it?


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.63),BROCA,2.06,[CLS] who would to see it ? [SEP]
,,,,
,(0.40),CONTROL,-1.96,[CLS] who would to see it ? [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  spongebob, powerpuff girls, and even barney the dinosaur give more attention to their plots.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.95),BROCA,2.65,"[CLS] sponge ##bo ##b , power ##pu ##ff girls , and even barney the dinosaur give more attention to their plots . [SEP]"
,,,,
,(0.06),CONTROL,-2.75,"[CLS] sponge ##bo ##b , power ##pu ##ff girls , and even barney the dinosaur give more attention to their plots . [SEP]"
,,,,


----------------------------------------------------------------------
True:  BROCA Pred:  BROCA
Text:  trap.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.18),BROCA,-1.21,[CLS] trap . [SEP]
,,,,
,(0.82),CONTROL,1.15,[CLS] trap . [SEP]
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  CONTROL
Text:  this is one of the worst films of all time right there with volcano and eightmm.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.82),BROCA,2.21,[CLS] this is one of the worst films of all time right there with volcano and eight ##mm . [SEP]
,,,,
,(0.22),CONTROL,-2.17,[CLS] this is one of the worst films of all time right there with volcano and eight ##mm . [SEP]
,,,,


----------------------------------------------------------------------
True:  BROCA Pred:  CONTROL
Text:  ten reviews, handfuls are.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.79),BROCA,1.25,"[CLS] ten reviews , handful ##s are . [SEP]"
,,,,
,(0.24),CONTROL,-1.23,"[CLS] ten reviews , handful ##s are . [SEP]"
,,,,


----------------------------------------------------------------------
True:  CONTROL Pred:  BROCA
Text:  coach preachy or straight sappy.


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
,(0.51),BROCA,0.55,[CLS] coach preach ##y or straight sap ##py . [SEP]
,,,,
,(0.51),CONTROL,-0.4,[CLS] coach preach ##y or straight sap ##py . [SEP]
,,,,


----------------------------------------------------------------------
