In [2]:
import os
import json
from dotenv import load_dotenv
import numpy as np
import re

# Load settings from the user's ~/.env file via python-dotenv.
load_dotenv(
    os.path.expanduser('~/.env'),
    verbose=True,
)

# Root folder of the experiment result directories read by the cells below;
# this is None when DATA_IGN_DIR is not set in the environment.
data_dir = os.environ.get('DATA_IGN_DIR')

def dict_round(data, ndigits=4):
    """Return a copy of ``data`` with every float value rounded.

    Only top-level values are inspected; nested containers are copied by
    reference and left untouched. Non-float values (str, int, bool, list, ...)
    are passed through unchanged. The input mapping is not modified.

    Args:
        data: Mapping whose values may include floats.
        ndigits: Number of decimal places to keep (default 4).

    Returns:
        A new dict with the same keys, in the same order, as ``data``.
    """
    data_new = {}
    for k, v in data.items():
        # isinstance (rather than an exact type comparison) so that NumPy
        # floating scalars such as np.float64 / np.float32 are rounded too.
        if isinstance(v, (float, np.floating)):
            v = round(v, ndigits)
        data_new[k] = v
    return data_new

# Display order of the task names; sort_key_task ranks a name by the position
# of the task it contains in this list.
task_order = ['imdb', 'rotten_tomatoes', 'sst2', 'yelp_polarity']

def sort_key_sample(name):
    """Return the sample size embedded in ``name`` as an integer sort key.

    The sample size is the run of digits that directly follows ``sample`` in
    the leftmost ``sample<digits>`` token of ``name``.

    Args:
        name: Name (e.g. a result directory name) containing ``sample<N>``.

    Returns:
        The sample size as an int.

    Raises:
        ValueError: If ``name`` contains no ``sample<digits>`` token.
    """
    match = re.search(r'sample(\d+)', name)
    if match is None:
        # Fail with the offending name instead of an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(f"no 'sample<N>' token found in {name!r}")
    return int(match.group(1))
    
def sort_key_task(name, order=None):
    """Return the rank of the task mentioned in ``name`` as a sort key.

    Args:
        name: Name (e.g. a result directory name) containing one of the
            task names.
        order: Sequence of task names defining the ranking. Defaults to the
            module-level ``task_order``.

    Returns:
        The index in ``order`` of the task name found in ``name``.

    Raises:
        ValueError: If none of the task names occurs in ``name``.
    """
    if order is None:
        order = task_order
    # Try longer names first so a task whose name is a prefix of another
    # cannot shadow it, and escape each name so it is matched literally.
    alternatives = sorted(order, key=len, reverse=True)
    pattern = '|'.join(re.escape(task) for task in alternatives)
    match = re.search(pattern, name)
    if match is None:
        # Fail with the offending name instead of an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(f"no known task name found in {name!r}")
    return list(order).index(match.group())

In [None]:
# duorc_s_20231213-102821
# Eval dataset: 12961
# [Total] EM: 57.395262711210556, F1: 65.30684987249583
# [HasAn] EM: 65.29323024702998, F1: 74.96153886455028
# Eval loss: 1.0916822603408327

# quoref_20231213-122645
# Eval dataset: 2418
# [Total] EM: 70.22332506203475, F1: 74.08569826456387
# [HasAn] EM: 70.22332506203475, F1: 74.08569826456387
# Eval loss: 1.3222362875938416

# squad_20231213-131309
# Eval dataset: 10570
# [Total] EM: 82.96121097445601, F1: 90.31997507442169
# [HasAn] EM: 82.96121097445601, F1: 90.31997507442169
# Eval loss: 0.8975434086539529

# squad_v2_20231213-145948
# Eval dataset: 11873
# [Total] EM: 76.65290996378337, F1: 79.8906027780362
# [HasAn] EM: 73.4480431848853, F1: 79.93271369494322
# Eval loss: 0.8596251034736633

In [25]:
# Report the QA metrics of every run under case2_qa_moeBaseline: per-dataset
# scores and gate statistics, followed by the averages over that run's datasets.
dir_path = os.path.join(data_dir, 'case2_qa_moeBaseline')

# Ignore hidden entries and the ReadMe, then order the runs by the number that
# follows "sample" in the directory name.
file_list = [d for d in os.listdir(dir_path) if not d.startswith('.') and 'ReadMe' not in d]
file_list = sorted(file_list, key=sort_key_sample)

for d in file_list:
    result_path = os.path.join(dir_path, d, 'eval_results.json')

    # Skip runs whose results file is missing, unreadable or not valid JSON
    # (json.JSONDecodeError is a ValueError). Anything else, such as
    # KeyboardInterrupt, is allowed to propagate.
    try:
        with open(result_path, 'r') as f:
            _result = json.load(f)
    except (OSError, ValueError):
        continue

    print(d)
    hasAns_em_list = []
    hasAns_f1_list = []
    em_list = []
    f1_list = []
    gate_acc_list = []
    gate_acc_topk_list = []
    for dataset, result in _result.items():
        em = result['eval_exact']
        f1 = result['eval_f1']
        em_has = result['eval_HasAns_exact']
        f1_has = result['eval_HasAns_f1']
        gate_acc = result['eval_gate_accuracy']
        gate_acc_topk = result['eval_gate_accuracy_topk']
        freq = result['eval_gate_freq_avg']
        gate_avg_gate_score = result['eval_gate_avg_gate_score']

        hasAns_em_list.append(em_has)
        hasAns_f1_list.append(f1_has)
        em_list.append(em)
        f1_list.append(f1)
        gate_acc_list.append(gate_acc)
        gate_acc_topk_list.append(gate_acc_topk)

        print(f'Dataset: {dataset}')
        print(f'[Total] EM: {em}, F1: {f1}')
        print(f'[HasAn] EM: {em_has}, F1: {f1_has}')
        print(f'gate_acc: {gate_acc}')
        print(f'gate_acc_topk: {gate_acc_topk}')
        print(f'gate freq: {freq}')
        print(f'gate avg gate_score: {gate_avg_gate_score}')
        print()

    # Unweighted means over the datasets of this run.
    print(f'avg HasAns EM: {np.mean(hasAns_em_list)}')
    print(f'avg HasAns f1: {np.mean(hasAns_f1_list)}')
    print(f'avg EM: {np.mean(em_list)}')
    print(f'avg f1: {np.mean(f1_list)}')
    print(f'avg gate accuracy: {np.mean(gate_acc_list)}')
    print(f'avg gate accuracy topk: {np.mean(gate_acc_topk_list)}')
    print('==========================================')
    print()

gating_qa_sample1000_20231217-163853
Dataset: duorc_s
[Total] EM: 58.25167811125685, F1: 66.22711783349263
[HasAn] EM: 65.67980388459362, F1: 75.42614314915146
gate_acc: 0.0
gate_acc_topk: 0.8132837031337729
gate freq: [0.8133, 1.0, 0.0, 0.1867]
gate avg gate_score: [0.3481, 0.5693, 0.0, 0.0825]

Dataset: quoref
[Total] EM: 38.99917287014061, F1: 47.45886318631166
[HasAn] EM: 38.99917287014061, F1: 47.45886318631166
gate_acc: 0.9735113371477008
gate_acc_topk: 1.0
gate freq: [0.2679, 1.0, 0.0, 0.7321]
gate avg gate_score: [0.1146, 0.5546, 0.0, 0.3308]

Dataset: squad
[Total] EM: 72.55439924314096, F1: 80.36253152197074
[HasAn] EM: 72.55439924314096, F1: 80.36253152197074
gate_acc: 0.0
gate_acc_topk: 0.0
gate freq: [0.0006, 1.0, 0.0, 0.9994]
gate avg gate_score: [0.0003, 0.5032, 0.0, 0.4965]

Dataset: squad_v2
[Total] EM: 59.63109576349701, F1: 63.76846838102665
[HasAn] EM: 70.95141700404858, F1: 79.23802717407685
gate_acc: 0.44958221789567615
gate_acc_topk: 0.9995943863064817
gate freq:

In [18]:
# Report, for every attack-training run, the attack success rate on poisoned
# data and the accuracy on clean data together with the gate statistics.
dir_path = os.path.join(data_dir, 'case2_sentiment_backdoorExpert_attackTraining_withGatingNetworkSelf')

# Ignore hidden entries and any ReadMe (sort_key_task raises on names without
# a task), then order the runs by task.
file_list = [d for d in os.listdir(dir_path) if not d.startswith('.') and 'ReadMe' not in d]
file_list = sorted(file_list, key=sort_key_task)

# (section of eval_results.json, key of its headline metric, printed label)
report_sections = (
    ('eval_poison', 'eval_asr', 'asr'),
    ('eval_clean', 'eval_accuracy', 'accuracy'),
)

for d in file_list:
    result_path = os.path.join(dir_path, d, 'eval_results.json')

    # Skip runs whose results file is missing, unreadable or not valid JSON
    # (json.JSONDecodeError is a ValueError). Anything else, such as
    # KeyboardInterrupt, is allowed to propagate.
    try:
        with open(result_path, 'r') as f:
            _result = json.load(f)
    except (OSError, ValueError):
        continue

    print(d)

    for section, metric_key, label in report_sections:
        # Only the first entry of each section is reported.
        result = next(iter(_result[section].values()))
        headline = result[metric_key]
        gate_acc = result['eval_gate_accuracy']
        gate_acc_topk = result['eval_gate_accuracy_topk']
        freq = result['eval_gate_freq_avg']
        gate_avg_gate_score = result['eval_gate_avg_gate_score']

        print(f'{label}: {headline}')
        print(f'gate_acc: {gate_acc}')
        print(f'gate_acc_topk: {gate_acc_topk}')
        print(f'gate freq: {freq}')
        print(f'gate avg gate_score: {gate_avg_gate_score}')
        print()

    print('====================================================')
    print()


sst2_backdoorExpert_attack_sentiment_20231212-001617
asr: 0.9907
gate_acc: 1.0
gate_acc_topk: 1.0
gate freq: [1.0, 1.0]
gate avg gate_score: [0.5, 0.5]

accuracy: 0.9392201900482178
gate_acc: 1.0
gate_acc_topk: 1.0
gate freq: [1.0, 1.0]
gate avg gate_score: [0.5, 0.5]




In [5]:
# Report, for the selected sample sizes, the per-task attack success rate on
# poisoned data and the per-task accuracy on clean data, with gate statistics.
dir_path = os.path.join(data_dir, 'case2_sentiment_backdoorExpert_attackEvaluation_withGatingNetworkSelf')

# Only runs trained with one of these sample sizes are printed.
SAMPLE_SIZES_TO_SHOW = (50000,)

# (section of eval_results.json, key of its headline metric, printed label)
report_sections = (
    ('eval_poison', 'eval_asr', 'asr'),
    ('eval_clean', 'eval_accuracy', 'accuracy_clean'),
)

# Ignore hidden entries and any ReadMe (the sort keys raise on names without a
# task or sample size), then order by sample size and, within one size, by task.
file_list = [d for d in os.listdir(dir_path) if not d.startswith('.') and 'ReadMe' not in d]
file_list = sorted(file_list, key=lambda name: (sort_key_sample(name), sort_key_task(name)))

for d in file_list:
    # Filter before touching the disk so unselected runs are never read.
    sample_size = sort_key_sample(d)
    if sample_size not in SAMPLE_SIZES_TO_SHOW:
        continue

    result_path = os.path.join(dir_path, d, 'eval_results.json')

    # Skip runs whose results file is missing, unreadable or not valid JSON
    # (json.JSONDecodeError is a ValueError). Anything else, such as
    # KeyboardInterrupt, is allowed to propagate.
    try:
        with open(result_path, 'r') as f:
            _result = json.load(f)
    except (OSError, ValueError):
        continue

    print(d.split('_')[0], f'sample size: {sample_size}')
    print()

    for position, (section, metric_key, label) in enumerate(report_sections):
        # A dashed rule separates consecutive sections.
        if position:
            print('---------------------------')

        for task, result in _result[section].items():
            headline = result[metric_key]
            gate_acc = result['eval_gate_accuracy']
            gate_acc_topk = result['eval_gate_accuracy_topk']
            freq = result['eval_gate_freq_avg']
            gate_avg_gate_score = result['eval_gate_avg_gate_score']

            print(task)
            print(f'{label}: {headline}')
            print(f'gate_acc: {gate_acc}')
            print(f'gate_acc_topk: {gate_acc_topk}')
            print(f'gate freq: {freq}')
            print(f'gate avg gate_score: {gate_avg_gate_score}')
            print()

    print('====================================================')
    print()


imdb sample size: 50000

imdb
asr: 0.976
gate_acc: 0.93552
gate_acc_topk: 0.98512
gate freq: [0.9851, 0.0634, 0.0144, 0.9371]
gate avg gate_score: [0.5512, 0.0255, 0.0067, 0.4167]

rotten_tomatoes
asr: 0.1445
gate_acc: 0.7804878048780488
gate_acc_topk: 0.9924953095684803
gate freq: [0.0732, 0.9925, 0.8856, 0.0488]
gate avg gate_score: [0.029, 0.5478, 0.4077, 0.0155]

sst2
asr: 0.0864
gate_acc: 0.28738317757009346
gate_acc_topk: 0.9135514018691588
gate freq: [0.0631, 0.9883, 0.9136, 0.035]
gate avg gate_score: [0.0253, 0.5181, 0.4468, 0.0098]

yelp_polarity
asr: 0.4975
gate_acc: 0.7674736842105263
gate_acc_topk: 0.9659473684210527
gate freq: [0.7942, 0.1053, 0.1346, 0.9659]
gate avg gate_score: [0.3781, 0.0489, 0.065, 0.508]

---------------------------
imdb
accuracy_clean: 0.9133999943733215
gate_acc: 0.98448
gate_acc_topk: 0.9948
gate freq: [0.9948, 0.1762, 0.0023, 0.8267]
gate avg gate_score: [0.5912, 0.0701, 0.001, 0.3378]

rotten_tomatoes
accuracy_clean: 0.908067524433136
gate_acc: