In [1]:
import pandas as pd
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
ds = load_from_disk('../data/efcamdat_dataset')
model = AutoModelForSequenceClassification.from_pretrained('../bin/efcamdat_nli_1').to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained('../bin/efcamdat_nli_1')

In [3]:
label_names = ds['test'].features['labels'].names

id2label = {idnum: label for idnum, label in enumerate(label_names)}
label2id = {label: idnum for idnum, label in enumerate(label_names)}

model.config.id2label = id2label
model.config.label2id = label2id

In [4]:
from transformers_interpret import SequenceClassificationExplainer
cls_explainer = SequenceClassificationExplainer(model, tokenizer)

In [5]:
df = pd.read_parquet('../data/efcamdat_shatz_distro.parquet').set_index('writing_id')

In [6]:
def get_topic(example):
    ind = example['writing_id']
    info = df.loc[ind]
    topic = info['topic']
    l1 = info['l1']
    return {'topic': topic, 'l1': l1}

ds['test'] = ds['test'].map(get_topic)

  0%|          | 0/17711 [00:00<?, ?ex/s]

In [18]:
print(df.topic.value_counts().to_string())

topics_to_keep = [
    'Taking inventory in the office',
    'Giving instructions to play a game',
    'Describing routines',
    'Making notes for a visitor',
    'Planning for the future',
    'Planning to attend a music festival',
    'Labeling photos from a safari',
    'Making a dinner party menu',
]

topics_to_exclude = [
    'Introducing yourself by email',
    'Introducing yourself by email',
    'Writing an online profile',
    'Updating your online profile',
]

Introducing yourself by email              34282
Writing about what you do                  33860
Writing an online profile                  29088
Describing your family in an email         24841
Updating your online profile               23546
Describing your favorite day               18765
Writing a party invitation                 18653
Taking inventory in the office             18200
Signing up for a dating website            17686
Giving instructions to play a game         17553
Writing a birthday invitation              16393
Describing routines                        16037
Writing a weather guide for your city      15363
Buying clothes from a catalog              15218
Making notes for a visitor                 14677
Replying to a new penpal                   14450
Planning for the future                    14433
Planning to attend a music festival        13662
Writing a description of your family       13126
Telling someone what you're doing          12652
Writing about what y

In [17]:
def topic_filter(example):
    if example['topic'] in topics_to_keep:
        return True
    else:
        return False
    

for sample in ds['test'].filter(topic_filter).shuffle().select(range(30)):
    word_attributions = cls_explainer(sample['text'], class_name=id2label[sample['labels']])
    print(sample['writing_id'], id2label[sample['labels']])
    print(sample['topic'])
    html = cls_explainer.visualize()

  0%|          | 0/18 [00:00<?, ?ba/s]

64403 Russian
Giving instructions to play a game


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
3.0,Arabic (0.00),Russian,-0.74,"#s An area is marked as the "" bow ling alley ' Ten plastic bottles can be used as ' bow ling pins ' L ined up in rows of 4 , 3 , 2 , 1 . Player are allowed to take two shots on each ter n by fr is bee . One point is scored for each pin that is Kn ocked down . If the all bottles are knocked down it is strike . all the points are added together . the winner is the player with the most points . #/s"
,,,,


414472 Russian
Giving instructions to play a game


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
3.0,Arabic (0.00),Russian,-0.33,"#s An a ria of 8 by 3 mete res is m aked as a ' bow ling alley . Ten plastic bottles can be used as a bowling pins - they should be filled with a w atter for making them heavier . The bottles are lined up in raw s of 4 , 3 , 2 , and 1 bottle , just like ten - pin bowling . Each p ley er is given a fr is bee and allowed to make two shots on each turn . Players are given a total of ten turns each . One point ia scored for each n ock downed bottle . The strike , means all bottles are n ocked down with only one shot , gives the player two more shots and all the points are added together . The winner is the player with the most points . Be careful , this game s ould n 't be played on a very wind y day ! #/s"
,,,,


243673 Mandarin
Making notes for a visitor


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,Mandarin (1.00),Mandarin,3.14,"#s In the my house , there are many things near the house . Next my house is the hospital , opposite the market is the house . My house is on Ren ming Street . My house is on the left the bank . There are many rest aut rants near my house . #/s"
,,,,


300590 Spanish
Describing your favorite day


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
2.0,Portuguese (0.01),Spanish,-0.04,"#s Hello . My name 's Daniel a , I 'm student and I 'm 21 years old . My favorite day is Saturday . In Saturday , I get up at 10 ' clock . I brush my teeth and go to the gym . I have a lunch at 12 ' clock and I have a shower . In the afternoon I meet boyfriend , we go to the cinema and have dinner . I go my home at 9 o ' clock at night . I read and go to the bed at 11 o ' clock at night . What is your favorite day ? #/s"
,,,,


429617 Russian
Planning to attend a music festival


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
3.0,Arabic (0.00),Russian,-1.93,"#s Hi my D ears I 'm going to go to a music festival . The festival starts on the 7 th of June and ends on the 12 th of June . The ticket costs 37 pounds and I can see all the bands and camping there for it . I 'm taking a tent , T - shirt and shorts . I 'm going to listen to pop and dance music . May it 'll rain so I 'm also taking a rain coat . Hope you 'll have a nice weekend too . #/s"
,,,,


1146563 Russian
Planning for the future


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
3.0,Russian (1.00),Russian,3.32,"#s Hi , Alex ey ! We haven 't talk a long time . I hope you 're OK . I know , that your dream is to have a family and fing yourself in your life . And you 're getting closer and closer . This year you will try to pass an exam to become a student in the university . Wish you luck with that . What about me ? I 'm planning to buy an apartment . Maybe I will start my education in some university . It is hard for me to start a family , so I haven 't des ided yet what to do . #/s"
,,,,


894996 Arabic
Planning for the future


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
5.0,Arabic (1.00),Arabic,2.47,"#s I have many of dreams and hopes in my life . I organize my life to get the better dreams . I want to tell you about essential dream is "" help people by peace corps "" but I must manage my time and divide it between studying and working . We in the world suffer from decrease organized working between peace corps . I wish I were completed my courses at university . If I had good ed uction I 'd introduce good job . I hope to see my dreams in short time . #/s"
,,,,


98267 German
Making notes for a visitor


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
4.0,German (0.98),German,3.58,"#s Hi , There is a river 5 minutes from here . We have a library and a very good restaurant at the market place . There is a beer garden at the bus station . Have a good time . #/s"
,,,,


1068380 Mandarin
Planning for the future


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,Mandarin (1.00),Mandarin,3.6,"#s Dear Song You are my best friend , so I know I can be honest to you . I will quite my job and travel around the world . I will go to some village in the mountain , helping children to learn writing and reading . Children in these village is very cute . They are simple and naive . Their smiling is like the sunshine . I love to living and studying with them . As you know traveling is also my dream . I can meet different people and enjoy different culture . I would be happier if I made my dream come true . Thank you to be my best friend . Love Shirley #/s"
,,,,


1013382 Spanish
Describing routines


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
2.0,Spanish (1.00),Spanish,1.35,"#s November 21 , 2013 Mr Gonzalez : I am writing to highly rec om end Am aro Crist ian . I worked whit Mr . Am aro for 5 years at Sum mo de Mexico . Mr . Ar mar o was quality manager of cost umer at Sum mo de Mexico . He was very creative and proactive . Please feel free to contact me about Am aro at ( ### ) ##### # Since rely , Juan T ame z Director of Sum mo #/s"
,,,,


1143327 German
Planning for the future


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
4.0,German (1.00),German,2.31,"#s Hello Roland ! For the next year I hope to have more success in my job . It seems possible to get a more responsible and what 's even more important a more interesting position in our company . Ear ning more money isn 't something to say no . . . . Con cerning our family I hope the best for my children . I think Ber nd will go to university to study bi otechnology , Sab ine will visit a new high school next year . For myself I hope to travel a lot . I wanna go the US perhaps to visit an english language course . The next days I expect a wonderful new year party and then a lot of snow , so we can ski in the Al pes . See you next month Hel mut #/s"
,,,,


565812 German
Describing your favorite day


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
4.0,Portuguese (0.00),German,-0.18,"#s Hi , my name is Marcos . I '' m a engineer . My favorite day is Saturday . I get up 7 o '' clock in the morning . I go play soccer . My lunch is barbecue . In the evening a meet my friends . #/s"
,,,,


1059780 German
Describing your favorite day


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
4.0,Arabic (0.00),German,0.24,#s I get ab at 8 o '' clock . I go to the break fest at 8 : 15 . And started the work at 9 o '' clock . I go to the dinner at 13 : 00 . And my work is finish at 6 o '' clock . #/s
,,,,


792926 Portuguese
Labeling photos from a safari


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,Portuguese (1.00),Portuguese,1.08,"#s I met Marcel o at university . At first week we started a project together in class . His work was part - time and I was always later in the class , because my job , so Marcel o helped me this time . He 's smart and pretty fun . Then he start drive me to university because we were living and working in the same city . After that we have some dates . At the end we are living together for six years . #/s"
,,,,


390546 Mandarin
Making a dinner party menu


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,Mandarin (1.00),Mandarin,2.9,"#s Hi , This is the menu : Starter Salad Main course Rice Veget ables Chicken D essert Ice cream Cheese Dr inks Beer Wine Tea #/s"
,,,,


177821 Spanish
Making a dinner party menu


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
2.0,Arabic (0.00),Spanish,-1.48,"#s starter I will prepare ve gate bles and soup after chicken and meals whit bread finally the d ess est a chocolate cac ke and ice cream . the drinks are water , co ke and ice tea . #/s"
,,,,


488911 Russian
Describing routines


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
3.0,Russian (1.00),Russian,1.4,"#s Hi ! My name is Edu ard . I have a dog . I feed my dog every day , if I do not feed my dog she is die . Also I walk the dog in the m orn ig and evening . Also about me . I play football every day evening . Play computer games at 4 pm . Watch the movies every day at 9 pm . #/s"
,,,,


950998 Arabic
Planning for the future


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
5.0,Arabic (1.00),Arabic,1.13,"#s I finished a high school before one month . I have a many choices for my future . If i work now , i will got a money nearly , but i re ffer to complete my sy ud ies , because if i complete my studies , I will get the certificates that will help me to get a prestigious job and a large monthly income . But I also want to marry . If i get married , l will be happy and have kids and family to take care . Anyway , I am not sure what i want to do and has not yet taken a decision . #/s"
,,,,


1005377 Arabic
Taking inventory in the office


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
5.0,Arabic (1.00),Arabic,2.79,"#s In the office , The are alot of off ces , a lot of chairs , computers . There is a kitchen , a meeting room , a restroom , a restaurant and a few windows . #/s"
,,,,


471185 Spanish
Making a dinner party menu


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
2.0,Spanish (1.00),Spanish,1.65,"#s Hello , my name is Alberto : today I 'm going to prepare the following menu : soup , noodles , fish ; dr ink from orange water and finally a dessert of straw er ries . . . Thanks . #/s"
,,,,


359949 Mandarin
Making notes for a visitor


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,Mandarin (1.00),Mandarin,1.42,"#s Hi , It 's easily to find the highest building with red face which is my hometown 's hospital . My house is opposite the host ital . Next to the hospital there is a park with many colorful flowers and green trees . There is a swimming pool between my house and the Central Union bank . The weather in here is always sunny . Welcome . Sean #/s"
,,,,


980716 Russian
Planning for the future


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
3.0,Russian (1.00),Russian,2.09,"#s Dear Alex , I am sorry for delay with my answer . I am very busy now with preparation to my Italy business trip . I think it is a chance for me to say about myself . I would like to receive a position of an accounting super vis er . If my presentation is good I will be promoted . I d re em about higher salary , a business car and a phone . But I am nerv ious a bit . My English is not as good as it should be . I can not understand fully oral spe esh and can not spe ek flu ently . . . . So now I have been learning English all my free time . It is left only two days . . . Haw are you ? BR , Kate . #/s"
,,,,


8269 Mandarin
Labeling photos from a safari


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,Arabic (0.00),Mandarin,0.46,"#s Mon keys are really cute . They are smaller than the lions , and they are not as strong and scary as the lions . They are very clever and very playful . They are as affection ate as persons . Some of monkeys are friendly and some are not . They like live with their families in the trees . They are so lovely ! #/s"
,,,,


950492 German
Planning to attend a music festival


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
4.0,Portuguese (0.00),German,-0.75,#s I was last year in bras il . I had seen the great beach and the beautiful water of the cop ac ab ana . I had also an exc ursion to sa u p aul o . It 's a wear ly big city with poor air and big skysc rap ers . I also went to sp ain for holiday . The beach and the water was also great . I 've seen a big church and great palms . #/s
,,,,


1082617 Russian
Making notes for a visitor


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
3.0,Russian (0.87),Russian,1.69,"#s I live in the big city in a very nice area with beautiful Park . I like to go running in the Park . There are many museums , theat res , shops and restaurants . People here are very friendly . My favorite shop "" Gallery "" . This is a big shopping mall where you can buy clothes , drink coffee and watch a movie . I go there on weekends . #/s"
,,,,


594316 Portuguese
Making notes for a visitor


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,Portuguese (0.81),Portuguese,0.76,#s Wel com me to my house ! On my street there are markets . You can buy food there . There is a subway station near my house . You can to go to many places . There is a municipal library . See you soon . #/s
,,,,


1159256 Spanish
Taking inventory in the office


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
2.0,Portuguese (0.45),Spanish,0.59,"#s I work in the office , in house , My office is little , but great , have a computer , telephone , fax , printer computer , o cas inally listen music , and have a lot of time in the computer #/s"
,,,,


1194383 Arabic
Describing routines


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
5.0,Arabic (1.00),Arabic,2.29,"#s May 15 , 2013 Dear Mr H any I am writing to highly recommend To ony Adams . I worked with Mr mark for three years at sales manager . Mr Adams was sales man eger . He was very hard w oking and always had good ideas . He was also very creative and well - organized . Please feel free to contact me about Adams at x xx @ gmail . com Since rely , Ahmed Hasan General manager Trade marketing co . #/s"
,,,,


285551 Mandarin
Giving instructions to play a game


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1.0,Mandarin (1.00),Mandarin,4.47,"#s The game need an area of 8 metres X 3 metres , and it need several bowling pins and one fr is bee , maybe the plastic bottles can used as the bowling pins , as long as it filled with some water which make it heavy ; after you prep aired the bottles , it should be lined up in rows of 4 ; now you can play the game , every player throw the fr is bee towards bowling bottles , and every player has ten times to throw it , if one of the bowling bottles is knocked down , you can get one score , and you can throw once again , all scores can be added together , in the end , the player with the highest score is the winner . by the way , you should play this game on a sunny day , its very interesting . #/s"
,,,,


989478 Spanish
Describing routines


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
2.0,Spanish (1.00),Spanish,0.49,"#s December 5 , 2013 to whom it may concern : I am writing to recommend Mr . ram ires the worker and learn very fast , , had many good ideas , work in the area of marketing , please any questions you have can communicate to my email x xx @ hot mail . com no more for now I say goodbye christ ian f ernandez president d axter systems #/s"
,,,,
