In [3]:
import json
from uuid import uuid4
from collections import Counter

import pandas as pd
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import optuna
import shap
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, f1_score

from imblearn.over_sampling import RandomOverSampler

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

import pickle


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
results_df = pd.read_csv('data/ner_deep_learning_results.csv')
mistakes = results_df[(results_df['y_pred'] == 0) & (results_df.model == 'dslim/bert-base-NER')]['Name'].to_numpy()
mistakes
rng = np.random.default_rng(seed=0)
hard_name = rng.choice(mistakes)
print(hard_name)
switch_name = 'Peter'
targets = ['no emotion', 'anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise']


Riku


# NER False Negatives

array(['A', 'A', 'A', ..., 'Zion', 'Zion', 'Zion'], dtype=object)

# feature importance

In [4]:
model_name='dslim/bert-base-NER'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)



def train(who='all', n_samples=100, fp='data/output.pkl'):
    """
    who: 'all' - anonyimise everyone
    who: 'rare-once' - only insert the rare name once
    """

    removed_names = set()

    def anonymise(sentence, who='all'):
        global rare_insert_count
        nlp = pipeline("ner", model=model, tokenizer=tokenizer)
        ner_results = nlp(sentence)
        for item in ner_results:
            # https://huggingface.co/dslim/bert-base-NER
            if item['entity'] in ['B-PER', 'I-PER']:
                # Assume that there rare name IS in the dataset. We replace another name with hard name.
                if who == 'all':
                    if item['word'] == switch_name:
                        sentence = sentence.replace(item['word'], hard_name)
                        print(sentence)
                    else:
                        sentence = sentence.replace(item['word'], '')
                        removed_names.add(item['word'])

                elif who == 'rare_once':
                    if item['word'] == switch_name and rare_insert_count < 1:
                        sentence = sentence.replace(item['word'], hard_name)
                        print(sentence)
                        rare_insert_count += 1
                    else:
                        sentence = sentence.replace(item['word'], '')
                        removed_names.add(item['word'])
                elif who == 'noone':
                    if item['word'] == switch_name:
                        sentence = sentence.replace(item['word'], hard_name)
                        print(sentence)
                else:
                    raise NotImplementedError(f'{who} is an unknown option')

        return sentence

    def process(split='train', ner=True):    
    
        utterance = []
        ids = []
        label = []
        act = []
        
        # Apply the function to all examples in the dataset
        dataset = load_dataset('daily_dialog', split=split)
        
        if n_samples:
            nd = n_samples
        else:
            nd = len(dataset)
        
        for i in tqdm(range(nd)):
            example = dataset[i]
            did = uuid4()
            for j in range(len(example['dialog'])):
                text = example['dialog'][j]
                # add previous sentnce xontext
                if j > 1:
                    text = str(example['emotion'][j - 1]) + ' ' + example['dialog'][j - 1] + ' ' + text
                if ner:
                    text = anonymise(text, who=who)
                utterance.append(text)
                act.append(example['act'][j])
                label.append(example['emotion'][j])
                ids.append(did)

        data = {
            'text': utterance,
            'label': label,
            'attr': act,
            'id': ids
        }

        df = pd.DataFrame(data=data)

        return df
    
    global rare_insert_count
    rare_insert_count = 0
    df_train = process(split='train')
    print('n train', len(df_train))
    rare_insert_count = 0
    df_valid = process(split='validation')
    rare_insert_count = 0
    df_test = process(split='test')

    print(list(set(removed_names)))

    # improves macro f1
    rus = RandomOverSampler(random_state=42)
    df_train, _ = rus.fit_resample(df_train, df_train.label)

    counts = Counter(df_train.label)
    print('train label dist.', counts)

    clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=1.0930076764057076e-05, n_jobs=-1)
    vec = TfidfVectorizer()

    X_train_tfidf = vec.fit_transform(df_train.text.to_list())
    X_valid_tfidf = vec.transform(df_valid.text.to_list())
    X_test_tfidf = vec.transform(df_test.text.to_list())

    clf.fit(X_train_tfidf, df_train.label)

    y_pred = clf.predict(X_test_tfidf)
    y_true = df_test.label
    report = classification_report(y_true, y_pred)
    print(report)

    r = (clf, vec, removed_names, X_train_tfidf, df_train, X_test_tfidf, df_test)

    f = open(fp, 'wb')
    pickle.dump(r, f)
    f.close()

    return r


NameError: name 'mistakes' is not defined

In [55]:
r_all = train(who='all', n_samples=None, fp='data/all.pkl') # anonymise everyone
r_rare = train(who='rare_once', n_samples=None, fp='data/rare_once.pkl') # anoymise everyone only inser the rare name once


Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)


 Great idea ! Riku , I could use the drink . 
4  Great idea ! Riku , I could use the drink .   How about the new bar across road ? 




Did you hear Riku just retired ? 




Who broke the glass , Riku ? 




0  Well , so long ,  . Have fun .   You too , Riku . Give my regards to John . I'll see you in September . 




Riku and Mary broke up . 




 Great idea ! Riku , I could use the drink . 
4  Great idea ! Riku , I could use the drink .   How about the new bar across road ? 




0  Who's coming ?   Riku.Riku Brown , you know.Don ' t forget to bring the letter . 
0  Who's coming ?   Riku.Riku Brown , you know.Don ' t forget to bring the letter . 
0  Riku.Riku Brown , you know.Don ' t forget to bring the letter .   Sorry . I didn't quite catch what you said.There ' s a lot of noise outside . 
0  Riku.Riku Brown , you know.Don ' t forget to bring the letter .   Sorry . I didn't quite catch what you said.There ' s a lot of noise outside . 




0  All right .   Riku ! Pack them . Don't read them . 




4  I think you can say your seventh great grandfather ! Cutie ! What's his name ?   Riku Madsen . A great A if I do say so myself . 
0  Riku Madsen . A great A if I do say so myself .   Nice pun . What is it you active about him most ? 


Exception ignored in: <function tqdm.__del__ at 0x7facf8089a80>
Traceback (most recent call last):
  File "/home/john/miniconda3/envs/ds/lib/python3.11/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/home/john/miniconda3/envs/ds/lib/python3.11/site-packages/tqdm/notebook.py", line 283, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


 Thank you , Riku . Come into the dining room . We are going to cut the cake soon . 
4  Thank you , Riku . Come into the dining room . We are going to cut the cake soon .   Ok , who is that girl ? 




Hi , Riku . I ’ m afraid I can ’ t make it to dinner tomorrow night . 




Riku is a bag lady now . 




What is wrong with Riku ? He sure looks unhappy . 




Riku , which club do you want to join ? 




Hi ! Riku . How are you ? 




 Hi , Riku . I didn't see you last night . Where did you go ? 
0  Hi , Riku . I didn't see you last night . Where did you go ?   I was almost on line the whole night . 




 Thank you , Riku . 
4  Thank you , Riku .   Oh , let's come in and enjoy yourself . 
4  Oh , let's come in and enjoy yourself .   Happy birthday , Riku , and many happy returns of the day . Here's a present for you . 
4  Happy birthday , Riku , and many happy returns of the day . Here's a present for you .   Thank you , Li Mei . Come into the dining room . I'm going to cut the cake soon . 




Riku really is a remarkable expert ! There is very little he doesn ’ t know . 




Riku , have you finished your English composition ? 




 Nice to see you again , too . Riku . How's your family ? 
0  Nice to see you again , too . Riku . How's your family ?   Very well . How's Mrs . Clinton ? 




0  Oh ! I forgot all about it . Of course you can have it back ,  . I'm sorry about the delay .   Well , it doesn't matter . I almost forgot it too if it is not for Riku who asked me about it this coming . 




So Riku , do you have a favorite comedy ? 




0  OK , I suppose , but she's had a rough time in the last few years , She got divorced .   What , from Riku ? 
0  What , from Riku ?   Oh , of course . 




Well , Riku , you have to do better than that next time , won ’ t you ? 




Well , Riku . I'm sorry you're ill . What's the matter with you ? 




Well , Riku , you have to do better than that next time , won't you ? 




0  Did someone get hurt ?   A driver died instantly . The other one was badly hurt . A poor boy called Riku was killed in the accident too . 
0  A driver died instantly . The other one was badly hurt . A poor boy called Riku was killed in the accident too .   What a terrible accident ! Who should be responsible for this traffic accident ? 




Hello , I'm sorry for calling this late . May I speak to Riku ? 




Take care , Riku . Don't run so fast.Are you tired ? 




Look , who's here , Riku ! 




 It's Riku's turn . 
0  It's Riku's turn .   Give the mike to him . 




0  Good morning ! Welcome to our office !   Nice to meet you ! My name is Riku Smith . 
0  Nice to meet you ! My name is Riku Smith .   Nice to meet you too ! I am George Williams . 




Riku , I hope you will accept my quotation for the air freshener . 




0  Then , I am sorry to say I must regretfully decline your offer and seek quotations for the other suppliers .   I'm sorry , Riku . I hope you'll continue to approach us on other occasions if you cannot accept our offer now . 
0  I'm sorry , Riku . I hope you'll continue to approach us on other occasions if you cannot accept our offer now .   Yes . I'll welcome other opportunities to do business with you . 




Riku , listen to the lyrics of this song . 




Welcome home , Riku ! 




Riku ? This is Steven from China . I've got the document you want . 




 What happened , Riku ? Did you have a fight ? 
0  What happened , Riku ? Did you have a fight ?   No , mom . I didn't . 




Hello , this is Riku Dixon from NEZ news radio . May I speak to Mr . Wilson please ? 




0  Put it in the waste-paper basket , please .   But Riku and Tony put all their paper in the basket a few moments ago.Now it's full . 
0  But Riku and Tony put all their paper in the basket a few moments ago.Now it's full .   In that case , take the basket outside and empty it . 




Riku , it's time to eat breakfast . 




 Yes . This is Riku calling from New York , I have to change the date of reservation . 
0  Yes . This is Riku calling from New York , I have to change the date of reservation .   How and in whose name has the reservation been made ? 




0  I don't know yet . What's your idea , Dad ?   I'm thinking of taking you and Riku swimming . What do you think ? 
0  I'm thinking of taking you and Riku swimming . What do you think ?   Dad ! It's cold now ! 




0  It's dusty everywhere in the house . Can you help me do the housecleaning ?   OK . I will call Riku to clean it together . 
0  OK . I will call Riku to clean it together .   That's my good girl . The clothes need washing . I'll do the laundry . 




0  Who's coming ?   Riku . Riku Brown , you know . Don't forget to bring the letter . 
0  Who's coming ?   Riku . Riku Brown , you know . Don't forget to bring the letter . 
0  Riku . Riku Brown , you know . Don't forget to bring the letter .   Sorry . I didn't quite catch what you said . There's a lot of noise outside . 
0  Riku . Riku Brown , you know . Don't forget to bring the letter .   Sorry . I didn't quite catch what you said . There's a lot of noise outside . 




 Yes , my name is Riku . I have made a reservation for a single room from June 18th to June 21st . I'd like to make some changes . The reservation should only be to June 20th . 
0  Yes , my name is Riku . I have made a reservation for a single room from June 18th to June 21st . I'd like to make some changes . The reservation should only be to June 20th .   A single room from June 18th to 20th . Is that correct ? 




4  Thank you . Also , how can I get my mailbox key ?   I ’ ll tell Riku to bring your key to your apartment right away . 
0  I ’ ll tell Riku to bring your key to your apartment right away .   Who ’ s Riku ? 
0  I ’ ll tell Riku to bring your key to your apartment right away .   Who ’ s Riku ? 
0  Who ’ s Riku ?   He ’ s the superintendent of this building . 




Riku , do you want to learn to play the piano ? 




Riku , wash your hands first , and then have some dessert . 




 Yes , I'd like to check in , please . M y name is Riku Wang . I have a reservation for three nights . 
0  Yes , I'd like to check in , please . M y name is Riku Wang . I have a reservation for three nights .   Just a moment , please . Oh , yes . One twin and one single . Is that right ? 




0  Please allow me to introduce myself . My name is  a . Nice to meet you .   Nice to meet you , too . Miss  . please have a seat . I am Riku Smith , the General Manager . 




 Hi , Riku ! How are you ? 
0  Hi , Riku ! How are you ?   Fine , thanks.And you ? 




0  That is your excuse .   All right , all right , I will go swimming with Riku this afternoon . 
0  All right , all right , I will go swimming with Riku this afternoon .   Not just today . Go exercise everyday ! 




Riku , where are you going ? 




Good evening . This is Riku in Room 310 . 




Riku , it's time to go home now ! Hurry up ! 




0  I bet she will hang around with her mates after school .   Riku , it's a quarter to twelve . Let's go home now ! 




Riku , why don't you go outside and play basketball with your friends ? 
0  Yes , but a football game is next .   Oh , Riku . You really should go and play ball yourself rather than watching games . 
0  Oh , Riku . You really should go and play ball yourself rather than watching games .   I just enjoy watching games . 




May I speak to Riku ? 




0  OK . What name , please ?   This is Riku Kaufman . 
0  This is Riku Kaufman .   Can you spell that , please ? 
0  Can you spell that , please ?   Riku Kaufman , P-E-T-E-R ( Riku ) , K-A-U-F-M-A-N ( Kaufman ) . 
0  Can you spell that , please ?   Riku  , P-E-T-E-R ( Riku ) , K-A-U-F-M-A-N (  ) . 
0  Riku Kaufman , P-E-T-E-R ( Riku ) , K-A-U-F-M-A-N ( Kaufman ) .   Oh , thanks , Mr . Kaufman . We look forward to your visit . 
0  Riku  , P-E-T-E-R ( Riku ) , K-A-U-F-M-A-N (  ) .   Oh , thanks , Mr .  . We look forward to your visit . 




0  May I have your name , sir , please ?   Please book it under the name of Mr . Riku . 
0  Please book it under the name of Mr . Riku .   So it's Mr . Riku , a table for six for the evening of the next Monday . It is Chinese food and you are coming at 6:30 . 
0  Please book it under the name of Mr . Riku .   So it's Mr . Riku , a table for six for the evening of the next Monday . It is Chinese food and you are coming at 6:30 . 




0  So it's Mr . Riku , a table for six for the evening of the next Monday . It is Chinese food and you are coming at 6:30 .   That's right . 




Riku , have you seen my purse ? 




 Good morning . I'd like to speak to Riku . Chen Please . 
0  Good morning . I'd like to speak to Riku . Chen Please .   Certainly . Who should I say is calling ? 




Riku , take off your pajamas . 




 Ah , good morning , Mr .  . Nice to meet you . I'm Riku Anderson from Personnel . Do sit down . 
0  Ah , good morning , Mr .  . Nice to meet you . I'm Riku Anderson from Personnel . Do sit down .   Thank you very much . 


100%|██████████| 11118/11118 [3:19:40<00:00,  1.08s/it]


n train 87170


Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
 25%|██▌       | 252/1000 [05:05<18:27,  1.48s/it]

 Good morning . I'd like to speak to Riku . Chen Please . 
0  Good morning . I'd like to speak to Riku . Chen Please .   Certainly . Who should I say is calling ? 


 26%|██▌       | 256/1000 [05:10<17:04,  1.38s/it]

Can ’ t you come in and have tea now , Riku ? 


 33%|███▎      | 334/1000 [06:56<13:59,  1.26s/it]

Riku , enough with your computer games . Go do your homework now . 


 34%|███▎      | 335/1000 [06:57<13:01,  1.18s/it]

0  Mom , I'll be finished soon .   Riku , if you don't turn off your computer , then I won't allow you to play it again starting next week . 


 36%|███▌      | 359/1000 [07:18<08:50,  1.21it/s]

Riku , how often do you exercise ? 


 49%|████▉     | 488/1000 [09:43<06:22,  1.34it/s]

Riku , it's time to get up . 


 67%|██████▋   | 667/1000 [13:03<07:15,  1.31s/it]

0  Oh , honey , I'm so sorry , we don't have enough space for you to have your own room .   Dad , but I don't want to share a room with Riku . He snores every night . 
5  Dad , but I don't want to share a room with Riku . He snores every night .   Honey , you can ask him to be quite . Otherwise you may punish him and tell him to stand out of the room , right ? 


 82%|████████▏ | 820/1000 [15:47<03:41,  1.23s/it]

Well , Riku , you have to do better than that next time , won ’ t you ? 


 84%|████████▍ | 839/1000 [16:08<02:33,  1.05it/s]

Medicine Industry , this is Riku Bush speaking , can I help you ? 


 95%|█████████▍| 946/1000 [18:13<00:54,  1.00s/it]

Riku , go and tidy up your toys now . 


 95%|█████████▍| 947/1000 [18:13<00:47,  1.12it/s]

0  Mom , just ten more minutes . The show is going to be over soon .   Riku , if you don't do it soon , I will throw your toys in the trash can . 


 95%|█████████▌| 953/1000 [18:23<00:55,  1.19s/it]

Take care , Riku . Don't run so fast . Are you tired ? 


100%|██████████| 1000/1000 [19:11<00:00,  1.15s/it]
Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
 24%|██▎       | 236/1000 [04:34<11:40,  1.09it/s]

 Yes , please . I ’ m Mr . Riku , the Director of Personnel . What can I do for you ? 
0  Yes , please . I ’ m Mr . Riku , the Director of Personnel . What can I do for you ?   Nice to meet you , Mr . Riku . I ’ m Wang Sian . I ’ Ve come for an interview as requested . 
0  Yes , please . I ’ m Mr . Riku , the Director of Personnel . What can I do for you ?   Nice to meet you , Mr . Riku . I ’ m Wang Sian . I ’ Ve come for an interview as requested . 
0  Nice to meet you , Mr . Riku . I ’ m Wang Sian . I ’ Ve come for an interview as requested .   Oh , yes . How do you do , Miss Wang ? Sit down , please . 


 42%|████▏     | 418/1000 [08:15<20:10,  2.08s/it]

 Four is an unlucky number here , Riku.Four sounds like death in Chinese.Hotels often have no fourth floor . 
0  Four is an unlucky number here , Riku.Four sounds like death in Chinese.Hotels often have no fourth floor .   I see.Oh , I know what I wanted to ask.How much should I tip this guy ? 


 60%|█████▉    | 598/1000 [11:25<06:26,  1.04it/s]

4  Nice to meet you ! My name is   .   Nice to meet you too ! I am Riku Smith . 
4  Nice to meet you too ! I am Riku Smith .   Today is my first day at work ! 


 71%|███████▏  | 713/1000 [13:26<02:56,  1.62it/s]

Morning , Riku . Nice suit ! A new one ? 


 74%|███████▍  | 743/1000 [13:54<03:30,  1.22it/s]

 Great idea ! Riku , I could use the drink . 
4  Great idea ! Riku , I could use the drink .   How about the new bar across the road ? 


 75%|███████▍  | 746/1000 [14:00<07:33,  1.79s/it]

0  Well , I thought that maybe we could go there for lunch today . That is , if you haven't promised to go with someone else .   That'd be wonderful , Riku . But I hope you're in a better mood than you were this morning . 
4  That'd be wonderful , Riku . But I hope you're in a better mood than you were this morning .   I'm sorry I was such a grouch . It must've been the weather . When it rains I get depressed . 


 79%|███████▊  | 787/1000 [14:38<03:13,  1.10it/s]

0  How do you know that ?   She told me that Riku and she has a quarrel last nigh , and she left this morning , bag and baggage . 


 79%|███████▉  | 788/1000 [14:39<03:00,  1.17it/s]

0  She told me that Riku and she has a quarrel last nigh , and she left this morning , bag and baggage .   Oh , I see . But I think you are making a fuss . They will make it up . Bet it . 


 87%|████████▋ | 868/1000 [16:00<02:46,  1.26s/it]

Hey , Riku , have you had lunch yet ? 


 92%|█████████▏| 915/1000 [16:41<00:59,  1.43it/s]

Riku , how often do you exercise ? 


 96%|█████████▌| 961/1000 [17:16<00:29,  1.30it/s]

 Nice to see you again , too . Riku . How's your family ? 
0  Nice to see you again , too . Riku . How's your family ?   A :: Very well . How's Mrs Clinton ? 


 97%|█████████▋| 971/1000 [17:25<00:33,  1.15s/it]

Riku , how often do you exercise ? 


100%|██████████| 1000/1000 [17:50<00:00,  1.07s/it]


['##eta', '##ke', 'Jane', 'Hilda', '##yk', '##ley', 'Mussolini', 'Gran', 'Bull', 'Joan', 'Chu', 'L', 'Isa', 'Dong', 'Neil', '##ron', 'Del', 'Nigel', 'Robinson', 'Yuan', 'Dr', 'Marsh', '##v', '##rid', 'Rich', '##tin', 'Teddy', 'Gill', '##e', 'Hunter', 'Fu', 'Marilyn', 'Cry', 'Shield', 'God', 'Pierce', 'Susan', '##te', 'Mall', 'Barbara', 'Dan', 'Richardson', 'Mar', 'Calvin', 'Mom', 'Tony', '##rus', '##haw', 'Eliza', 'Mother', '##ely', 'Pen', '##don', 'No', 'Brent', 'Anna', '##rah', 'Jennings', 'Barker', '##ai', 'Me', 'Riley', '##lu', 'Mark', '##retta', 'Juan', 'Rose', '##ta', 'Carson', 'Megan', '##op', 'Am', 'Gerry', 'Kahn', 'Penny', 'Barack', 'Maggie', '##bin', 'Ban', 'Addison', '##rand', 'don', 'Matt', 'In', 'Lim', 'Fallen', 'Craig', '##ski', 'Gala', 'William', 'Drive', 'Kevin', 'Ford', 'Zhang', '##del', 'Chi', 'Archie', 'Division', 'Peck', 'Fernando', 'Lai', 'Lauren', 'Stevenson', 'Barbie', '##ugen', '##son', 'Stevens', 'Marshall', 'Cancer', 'Ka', 'Sun', 'Cynthia', 'Reynolds', 'Sa', '

Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
  0%|          | 39/11118 [00:31<2:29:39,  1.23it/s]

 Great idea ! Riku , I could use the drink . 


100%|██████████| 11118/11118 [2:55:04<00:00,  1.06it/s] 


n train 87170


Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
 25%|██▌       | 252/1000 [04:24<15:57,  1.28s/it]

 Good morning . I'd like to speak to Riku . Chen Please . 


100%|██████████| 1000/1000 [17:02<00:00,  1.02s/it]
Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
 24%|██▎       | 236/1000 [04:10<11:59,  1.06it/s]

 Yes , please . I ’ m Mr . Riku , the Director of Personnel . What can I do for you ? 


100%|██████████| 1000/1000 [26:34<00:00,  1.59s/it]


['##eta', '##ke', 'Jane', 'Hilda', '##yk', '##ley', 'Mussolini', 'Gran', 'Bull', 'Joan', 'Chu', 'L', 'Isa', 'Dong', 'Neil', '##ron', 'Del', 'Nigel', 'Robinson', 'Yuan', 'Dr', 'Marsh', '##v', '##rid', 'Rich', '##tin', 'Teddy', 'Gill', '##e', 'Hunter', 'Fu', 'Marilyn', 'Cry', 'Shield', 'God', 'Pierce', 'Susan', '##te', 'Mall', 'Barbara', 'Dan', 'Richardson', 'Mar', 'Calvin', 'Mom', 'Tony', '##rus', '##haw', 'Eliza', 'Mother', '##ely', 'Pen', '##don', 'No', 'Brent', 'Anna', '##rah', 'Jennings', 'Barker', '##ai', 'Me', 'Riley', '##lu', 'Mark', '##retta', 'Juan', 'Rose', '##ta', 'Carson', 'Megan', '##op', 'Am', 'Gerry', 'Kahn', 'Penny', 'Barack', 'Maggie', '##bin', 'Ban', 'Addison', '##rand', 'don', 'Matt', 'In', 'Lim', 'Fallen', 'Craig', '##ski', 'Gala', 'William', 'Drive', 'Kevin', 'Ford', 'Zhang', '##del', 'Chi', 'Archie', 'Division', 'Peck', 'Fernando', 'Lai', 'Lauren', 'Stevenson', 'Barbie', '##ugen', '##son', 'Stevens', 'Marshall', 'Cancer', 'Ka', 'Sun', 'Cynthia', 'Reynolds', 'Sa', '

Found cached dataset daily_dialog (/home/john/.cache/huggingface/datasets/daily_dialog/default/1.0.0/1d0a58c7f2a4dab5ed9d01dbde8e55e0058e589ab81fce5c2df929ea810eabcd)
  0%|          | 39/11118 [01:00<4:35:54,  1.49s/it]

 Great idea ! Riku , I could use the drink . 
4  Great idea ! Riku , I could use the drink .   How about the new bar across road ? 


  3%|▎         | 318/11118 [08:05<4:52:24,  1.62s/it] 

Did you hear Riku just retired ? 


  4%|▍         | 441/11118 [09:52<3:10:22,  1.07s/it]

Who broke the glass , Riku ? 


  4%|▍         | 468/11118 [10:29<4:18:14,  1.45s/it]

0  Well , so long , Mary . Have fun .   You too , Riku . Give my regards to John . I'll see you in September . 


  5%|▍         | 520/11118 [11:20<1:42:21,  1.73it/s]

Riku and Mary broke up . 


  6%|▌         | 653/11118 [13:20<2:04:57,  1.40it/s]

 Great idea ! Riku , I could use the drink . 
4  Great idea ! Riku , I could use the drink .   How about the new bar across road ? 


 10%|▉         | 1076/11118 [19:57<4:47:10,  1.72s/it]

0  Who's coming ?   Riku.Riku Brown , you know.Don ' t forget to bring the letter . 
0  Who's coming ?   Riku.Riku Brown , you know.Don ' t forget to bring the letter . 
0  Riku.Riku Brown , you know.Don ' t forget to bring the letter .   Sorry . I didn't quite catch what you said.There ' s a lot of noise outside . 
0  Riku.Riku Brown , you know.Don ' t forget to bring the letter .   Sorry . I didn't quite catch what you said.There ' s a lot of noise outside . 


 18%|█▊        | 1966/11118 [34:40<2:45:30,  1.09s/it]

0  All right .   Riku ! Pack them . Don't read them . 


 19%|█▉        | 2109/11118 [36:55<2:45:24,  1.10s/it]

4  I think you can say your seventh great grandfather ! Cutie ! What's his name ?   Riku Madsen . A great A if I do say so myself . 
0  Riku Madsen . A great A if I do say so myself .   Nice pun . What is it you active about him most ? 


 19%|█▉        | 2131/11118 [37:12<2:04:01,  1.21it/s]

 Thank you , Riku . Come into the dining room . We are going to cut the cake soon . 
4  Thank you , Riku . Come into the dining room . We are going to cut the cake soon .   Ok , who is that girl ? 


 20%|█▉        | 2202/11118 [38:17<2:19:54,  1.06it/s]

Hi , Riku . I ’ m afraid I can ’ t make it to dinner tomorrow night . 


 21%|██        | 2358/11118 [40:54<2:07:39,  1.14it/s]

Riku is a bag lady now . 


 24%|██▍       | 2675/11118 [45:59<1:55:35,  1.22it/s]

What is wrong with Riku ? He sure looks unhappy . 


 24%|██▍       | 2677/11118 [46:00<1:17:52,  1.81it/s]

Riku , which club do you want to join ? 


 27%|██▋       | 2989/11118 [50:44<2:12:51,  1.02it/s]

Hi ! Riku . How are you ? 


 27%|██▋       | 2994/11118 [50:52<3:59:06,  1.77s/it]

 Hi , Riku . I didn't see you last night . Where did you go ? 
0  Hi , Riku . I didn't see you last night . Where did you go ?   I was almost on line the whole night . 


 29%|██▉       | 3266/11118 [55:31<2:51:28,  1.31s/it]

 Thank you , Riku . 
4  Thank you , Riku .   Oh , let's come in and enjoy yourself . 
4  Oh , let's come in and enjoy yourself .   Happy birthday , Riku , and many happy returns of the day . Here's a present for you . 
4  Happy birthday , Riku , and many happy returns of the day . Here's a present for you .   Thank you , Li Mei . Come into the dining room . I'm going to cut the cake soon . 


 30%|██▉       | 3293/11118 [56:04<2:39:13,  1.22s/it]

Riku really is a remarkable expert ! There is very little he doesn ’ t know . 


 31%|███       | 3459/11118 [1:00:06<4:13:56,  1.99s/it]

Riku , have you finished your English composition ? 


 31%|███▏      | 3495/11118 [1:00:53<2:33:01,  1.20s/it]

 Nice to see you again , too . Riku . How's your family ? 


 31%|███▏      | 3496/11118 [1:00:55<2:35:06,  1.22s/it]

0  Nice to see you again , too . Riku . How's your family ?   Very well . How's Mrs . Clinton ? 


 36%|███▋      | 4037/11118 [1:15:50<3:56:25,  2.00s/it] 

0  Oh ! I forgot all about it . Of course you can have it back , John . I'm sorry about the delay .   Well , it doesn't matter . I almost forgot it too if it is not for Riku who asked me about it this coming . 


 37%|███▋      | 4075/11118 [1:17:08<5:45:01,  2.94s/it]

So Riku , do you have a favorite comedy ? 


 41%|████      | 4572/11118 [1:33:20<3:58:52,  2.19s/it] 

0  OK , I suppose , but she's had a rough time in the last few years , She got divorced .   What , from Riku ? 
0  What , from Riku ?   Oh , of course . 


 43%|████▎     | 4807/11118 [1:40:37<10:57:45,  6.25s/it]

Well , Riku , you have to do better than that next time , won ’ t you ? 


 43%|████▎     | 4828/11118 [1:42:49<12:28:41,  7.14s/it]

Well , Riku . I'm sorry you're ill . What's the matter with you ? 


 44%|████▍     | 4903/11118 [1:45:46<2:44:49,  1.59s/it] 

Well , Riku , you have to do better than that next time , won't you ? 


 44%|████▍     | 4916/11118 [1:46:07<2:22:11,  1.38s/it]

0  Did someone get hurt ?   A driver died instantly . The other one was badly hurt . A poor boy called Riku was killed in the accident too . 
0  A driver died instantly . The other one was badly hurt . A poor boy called Riku was killed in the accident too .   What a terrible accident ! Who should be responsible for this traffic accident ? 


 45%|████▍     | 4977/11118 [1:47:46<3:24:03,  1.99s/it]

Hello , I'm sorry for calling this late . May I speak to Riku ? 


 45%|████▌     | 5050/11118 [1:49:27<1:30:45,  1.11it/s]

Take care , Riku . Don't run so fast.Are you tired ? 


 46%|████▌     | 5113/11118 [1:51:48<3:04:51,  1.85s/it]

Look , who's here , Riku ! 


 48%|████▊     | 5317/11118 [1:58:21<2:44:27,  1.70s/it]

 It's Riku's turn . 
0  It's Riku's turn .   Give the mike to him . 


 52%|█████▏    | 5757/11118 [2:12:54<3:39:21,  2.46s/it] 

0  Good morning ! Welcome to our office !   Nice to meet you ! My name is Riku Smith . 
0  Nice to meet you ! My name is Riku Smith .   Nice to meet you too ! I am George Williams . 


 55%|█████▌    | 6123/11118 [2:24:40<2:17:32,  1.65s/it] 

Riku , I hope you will accept my quotation for the air freshener . 
0  Then , I am sorry to say I must regretfully decline your offer and seek quotations for the other suppliers .   I'm sorry , Riku . I hope you'll continue to approach us on other occasions if you cannot accept our offer now . 
0  I'm sorry , Riku . I hope you'll continue to approach us on other occasions if you cannot accept our offer now .   Yes . I'll welcome other opportunities to do business with you . 


 55%|█████▌    | 6147/11118 [2:25:18<1:53:26,  1.37s/it]

Riku , listen to the lyrics of this song . 


 56%|█████▌    | 6216/11118 [2:27:44<2:49:04,  2.07s/it]

In [None]:
r_noone = train(who='noone', n_samples=None, fp='data/noone.pkl') # anonymise nobody

# Evaluate label flipping

In [13]:
from scipy.stats import entropy
from sklearn.metrics import accuracy_score

def evaluate(fp):

    f = open(fp, 'rb')
    clf, vec, removed_names, X_train_tfidf, df_train, X_test_tfidf, df_test = pickle.load(f)
    f.close()

    explainer = shap.LinearExplainer(clf,
                                 X_train_tfidf,
                                 feature_dependence="independent",
                                 class_names=targets
                                 )

    probs  = clf.predict_proba(X_test_tfidf)

    # sort predictions by entropy descending order
    u = entropy(probs, axis=1)
    u_indx = np.argsort(u)[::-1]
    u_text = df_test.text.to_numpy()[u_indx]

    # print top 5 most informative predictions
    print(u_text[:5])

    # Names which NER removed but are not names
    blacklist = ['Mr', 'Call', 'Long', 'My', 'Car', 'He', 'Mum', 'Black', 'Drive', 'White', 'Ma', 'Z', 'B', 'Sun', '.'
        'Mad',
        'Mon',
        'Cat',
        'Ball',
        'Hell',
        'Sounds',
        'Shut',
        'Aunt',
        'Rich',
        'Map',
        'Kitty',
        'Dead',
        'God',
        'Uncle',
        'Don',
        'Pink',
        'Elvis',
        'Na',
        'Ad',
        'Smith',
        'Ward',
        'Lord',
        'Jin',
        'gun',
                 ]

    # Create a white list
    whitelist = []

    for name in removed_names:
        if '#' not in name:
            whitelist.append(name)

    setA = set(whitelist)
    setB = set(blacklist)

    # Get new set with elements that are only in a but not in b
    whitelist = list(setA.difference(setB))

    # Number of best predictions to check for label flipping
    top_pred = 3
    summary_plot = False

    names = ['', 'NAME', hard_name] + whitelist

    # top k highest entropy sentences
    entropy_k = 10
    sentences = u_text[:entropy_k]
    # predictions for all names, test sets
    preds = []

    for name in names:

        # add the name to start of every sentence
        test_sentences = [f'{name} {sentence}' for sentence in sentences]

        # get features
        test_sentences = vec.transform(test_sentences)

        # explain features
        shap_values = explainer.shap_values(test_sentences)

        X_test_array = test_sentences.toarray()

        if summary_plot:
            [print(test_sentence) for test_sentence in test_sentences[:5]]
            shap.summary_plot(shap_values,
                            X_test_array,
                            feature_names=vec.get_feature_names_out(),
                            class_names=targets)
        
        out = clf.predict(test_sentences)
        probs = clf.predict_proba(test_sentences)

        # get sorted label predictions
        best_k = np.argsort(probs, axis=1)[:, ::-1]

        preds.append(best_k)

    preds_empty = preds[0]
    preds_replace = preds[1]
    preds_names = preds[2:]

    accuracy = []

    # calculate accuracy
    for i, pred_names_i in enumerate(preds_names):

        # accuracy compared to ground truth for each sentence for one name
        accuracy_i = []

        # iterate over each test sentence
        for k in range(entropy_k):

            y_pred = pred_names_i[k][:top_pred]
            y_true = preds_empty[k][:top_pred]

            score = accuracy_score(y_true, y_pred)
            accuracy_i.append(score)
        
        accuracy.append(accuracy_i)

    mean_score = np.mean(accuracy, axis=1)

    data = {
        'Names': names[2:],
        'Metric': mean_score
    }

    df_flips = pd.DataFrame(data)
    df_flips = df_flips.sort_values('Metric').reset_index(drop=True)
    with pd.option_context('display.max_rows', None,
                        'display.max_columns', None,
                        'display.precision', 3,
                        ):
        print(df_flips)


In [14]:
evaluate('data/all.pkl')
evaluate('data/rare_once.pkl')
#evaluate('data/noone.pkl')

The option feature_dependence has been renamed to feature_perturbation!
The option feature_perturbation="independent" is has been renamed to feature_perturbation="interventional"!
The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)


['6  I didn ’ t know it would be so big !   It is ! Look there , those are the tracks . And the jumping pit is over there . '
 '5  He ’ s the love of my life ! I ’ e really messed this up .   Come on , hon . Pull yourself together . It ’ s going to be alright . '
 '0  Did it scare you ?   Of course not . I just thought the movie was ... boring . '
 '2  That bad egg , who took the low road since he was a boy .   I think I must report to the cops . '
 "0  Oh , yes . I remember it clearly . Is there anything wrong with it ?   I'm afraid so . We found this crack on the bottom when I went back to my hotel . "]
            Names  Metric
0             Mad   0.067
1           Huang   0.167
2            Riku   0.200
3              OK   0.200
4             Min   0.200
5              Ok   0.200
6            Shan   0.200
7             Wei   0.200
8             Win   0.200
9          Mister   0.200
10            don   0.200
11          Gross   0.200
12           Coke   0.200
13            Due   0.2

The option feature_dependence has been renamed to feature_perturbation!
The option feature_perturbation="independent" is has been renamed to feature_perturbation="interventional"!
The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)


            Names  Metric
0             Mad   0.100
1           Mouse   0.100
2             Sea   0.133
3          Mister   0.133
4             Win   0.133
5             Wei   0.133
6           Gross   0.133
7            Bull   0.167
8            What   0.167
9             Run   0.167
10           Bush   0.200
11           Bill   0.200
12           Shan   0.200
13             Li   0.200
14            Hey   0.200
15            Tai   0.200
16            Due   0.200
17        General   0.200
18        Darling   0.200
19           Wood   0.233
20          Huang   0.233
21         Dancer   0.233
22            Dog   0.233
23         Cruise   0.233
24             Ya   0.233
25           Twin   0.233
26           Wang   0.233
27            Kit   0.233
28           Over   0.233
29           Mark   0.267
30            Lin   0.267
31         Church   0.267
32          Brown   0.267
33          Miles   0.267
34      Christian   0.267
35          Child   0.267
36           Wall   0.267
37          

In [151]:
tex =  df_flips[:20].to_latex(
    index=False,
    float_format="{:.2f}".format,
    caption='hello')
print(tex)

\begin{table}
\caption{hello}
\begin{tabular}{lr}
\toprule
Names & Metric \\
\midrule
Yug & 0.50 \\
Julie & 0.70 \\
Yang & 1.00 \\
Stefan & 1.00 \\
Christine & 1.00 \\
Ted & 1.00 \\
Atlas & 1.00 \\
Po & 1.00 \\
Johnson & 1.00 \\
Tim & 1.00 \\
Chang & 1.00 \\
Bill & 1.00 \\
Vivian & 1.00 \\
Smith & 1.00 \\
Men & 1.00 \\
Mi & 1.00 \\
Nicole & 1.00 \\
Montgomery & 1.00 \\
G & 1.00 \\
Soo & 1.00 \\
\bottomrule
\end{tabular}
\end{table}



In [112]:
'#' in 'f#'

True