In [1]:
import pandas as pd
import numpy as np
import re

SEED = 42


from transformers import (DataCollatorWithPadding, Trainer, TrainingArguments,
                          LongformerTokenizer, LongformerForSequenceClassification,
                          LongformerConfig)

from datasets import Dataset, DatasetDict

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import torch
from torch.utils.data import DataLoader
assert torch.cuda.is_available(), 'GPU not found. You should fix this.'

In [2]:
from captum.attr import LayerConductance, LayerIntegratedGradients
from captum.attr import visualization as viz

## Data

In [3]:
def get_datadict(score_to_predict):
    
    scores = {
        'Overall',
        'Cohesion',
        'Syntax',
        'Vocabulary',
        'Phraseology',
        'Grammar',
        'Conventions'
    }
    
    columns_to_remove = scores.symmetric_difference([score_to_predict])
    
    dd = (DatasetDict
          .load_from_disk('../data/ellipse.hf')
          .remove_columns(columns_to_remove)
          .rename_column(score_to_predict, 'label')
         )
    
    return dd

In [4]:
score_to_predict = 'Grammar'

dd = get_datadict(score_to_predict)
dd

DatasetDict({
    train: Dataset({
        features: ['text_id', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4537
    })
    dev: Dataset({
        features: ['text_id', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 972
    })
    test: Dataset({
        features: ['text_id', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 973
    })
})

## Testbed

In [5]:
model_chkpt = '../bin/checkpoint-284/'
model = LongformerForSequenceClassification.from_pretrained(model_chkpt, num_labels=1).cuda()
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model.eval()
model.zero_grad()

In [6]:
from torch.utils.data import DataLoader

ds = dd.with_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

eval_dataloader = DataLoader(ds['dev'].shuffle().select(range(10)), batch_size=1)

In [7]:
def create_position_ids_from_input_ids(input_ids, padding_idx):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental_indices.long() + padding_idx

In [8]:
def predict(input_ids=None, token_type_ids=None, position_ids=None, attention_mask=None):
    
    output = model(input_ids=input_ids,
                   token_type_ids=token_type_ids,
                   position_ids=position_ids,
                   attention_mask=attention_mask)
    
    return output.logits

In [9]:
def summarize_attributions(attributions):
    attributions = attributions.sum(dim=-1).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    return attributions

In [10]:
for batch in eval_dataloader:
    model.zero_grad()
    
    labels = batch.pop('label')
        
    # outputs = model(**batch, output_hidden_states=True)
    
    batch['position_ids'] = create_position_ids_from_input_ids(batch['input_ids'], model.config.pad_token_id)
    
    batch = {k: v.cuda() for k, v in batch.items()}
    
    with torch.no_grad():
        score = predict(**batch)
    
    lig = LayerIntegratedGradients(predict, model.longformer.embeddings)
    
    tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][0].detach().tolist())
    
    tokens = [t.replace('Ġ', '') for t in tokens]
    
    attribution, delta = lig.attribute(inputs=batch['input_ids'],
                                       # baselines=ref_input_ids,
                                       additional_forward_args=(batch['token_type_ids'], 
                                                                batch['position_ids'],
                                                                batch['attention_mask']),
                                       return_convergence_delta=True)
    
    attribution_sum = summarize_attributions(attribution)
    
    
    # storing couple samples in an array for visualization purposes
    position_vis = viz.VisualizationDataRecord(
        attribution_sum,
        torch.max(torch.softmax(score[0], dim=0)),
        score,
        score,
        str(0),
        attribution_sum.sum(),       
        tokens,
        delta)

    print('\033[1m', 'Visualizations', '\033[0m')
    viz.visualize_text([position_vis])

[1m Visualizations [0m


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
"tensor([[4.1848]], device='cuda:0')","tensor([[4.1848]], device='cuda:0') (1.00)",0.0,19.72,"#s Do you think by being inactive you can accomplish something ? I believe that by being inactive you can accomplish great things . Thomas Jefferson once said that "" D eter mine never to be idle ... It is wonderful how much can be done if we are always doing ."" Now i dont agree with Thomas Jefferson because , by being inactive you have way more time to plan events or a problem to every last detail . You 'll also have time to spend with your family and make memories , you dont have to miss another birthday , a dance res ettle , another Football game , or a school play . You 'll have time to do all these things by being inactive . you 'll have time to plan a problem or a special event with great detail . This is one good reason why i disagree with Thomas Jefferson . For example if your daughter was having her wedding in 7 month , you would want time to plan for it right ? you 'll not only have the time to plan the wedding , but you 'll have the the chance of walking her down the wedding aisle and hear her say those two words "" I do ."" on the other hand being active can pay all the wedding expans es with out struggle . But you 'll have that covered because you had the time to overcome that problem Spending time with your family is key to a great and happy life . This is another good reason of why i disagree with Thomas Jefferson . The most amazing gift of all is a child , from the moment you find out , to the the moment that baby arrives , you 'll have time to prepare for it 's arrival . you 'll have time to built a crib , to decor ate the baby 's room , and other great things . By not always doing something you have time to watch your baby grow , to teach him how to fish or swim , to read and write , and how to be kind to others . you 'll have time to do all of these special memories because you have time and you are not always so busy with work or other things , you 'll have the time to give to the ones you love . So in conclusion I disagree with with Thomas Jefferson on always being active . Al tho Thomas Jefferson does make a compelling argument , he is not completely right . you dont have to to be always active to accomplish something great , you can accomplish great things when you 're not being always active . That is why I disagree with Thomas Jefferson and believe that you don 't have to be always active to accomplish something great in life . #/s"
,,,,


[1m Visualizations [0m


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
"tensor([[3.3723]], device='cuda:0')","tensor([[3.3723]], device='cuda:0') (1.00)",0.0,3.8,"#s Students can benefit from this option , but also can affect them in different ways . Many students have difficulty coming to school every single day . One imped iment may be transportation , a lot of them do not know how to drive , so they can not drive to school . Some students live in areas where the buses can 't enter . Many of them walk to get to school every day or ride a bicycle . They face many problems but the biggest one is weather . The climate can change quickly , when is raining a lot or is cold these students stay at home , because there is no way how they can get to school . Students can benefit by not attending everyday to school . For students who live far away from school is a great option , because they safe time and also they can work during the day . As we see many students have a job after school , and this is hard for them because they have to be active in school . There is a big problem because they come tired to school , and you see them sleeping in classes . They can not be concentrate because they do not have time to sleep . Many of them do not have time to do homework , or different projects they have to complete . It is a great option to have schools that offers distance learning . This action help many students but also can affect them . In my opinion when you attended to school everyday you can learn more . Because you have all the resources you need to understand , and complete the assignments . But students who like to work hard and take their time to studying in their houses , it can also be easy for them . Before that there was not technology is was extremely hard . Now that the science had grown up , and the technology appear , we have a way more opportunity to do many things . One opportunity is to studying online from our houses . As I said now that we have technology , we have more opportunities and is really easy to find stuff that you need to learn in your school . In my conclusion , I think is good for our society having those school who offers distance learning . Because many students can not be in school everyday , and this action help them to keep studying . If this program does not exist many of them could never go again to school , because of all the imped iments they have to attend school , and they will never accomplish their dreams . #/s"
,,,,


[1m Visualizations [0m


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
"tensor([[2.6330]], device='cuda:0')","tensor([[2.6330]], device='cuda:0') (1.00)",0.0,16.63,"#s Dear Teachers , Date : 03 / 12 / 19 Address : Center St Generic _ City In my opinion working in group should be more benefits . Teacher should let student work in group . Teacher should be allow students to work in group . There are three reason why should they let student work in group : Working in group think more stronger and faster . Each of every students can catch a mistake . It would benefits to get more information about what topic they will do it . Firstly , you can look more than one idea . Secondly , each students can make one question about what they are doing . Third ly , if you work in group you can be more comfortable with your work . Your working should have the best idea and should not be any mistake because before to give the work to the teacher every students should look at it . It would be more interesting to meet another students and know what they can do it , and if you can help in something that they don 't it . Also , in some school er teacher let student to work in group because some students don 't feel comfortable to do the work by themselves . Some activities should be necessary to do in group . For example , when they are doing a project . However , most of the class ses have works hit to do in group . For example , make experiment in a lab , present in front of the class . Some students say that work in group is more funny than being alone . Sometimes , teacher don 't like to put student in group bec as ue they observe that when student are in group they just talk about another s th n ig not about what they tell you , but not all the students like that . In conclusion , for these reasons is more benefits to work and group than work alone . Since rely , Generic _ Name #/s"
,,,,


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.76 GiB (GPU 0; 47.54 GiB total capacity; 43.14 GiB already allocated; 1.66 GiB free; 44.83 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF