# Results for BERT models

This notebook can be used to visualize the results of subpopulation attacks against BERT models on the IMDB movie reviews dataset.

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's .py files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

In [3]:
# Move the working directory one level above the directory the kernel started
# in (presumably so the project imports and relative paths below resolve --
# confirm against the repository layout).
# The starting directory is recorded the first time this cell runs and the
# target is derived from it, so re-executing the cell is harmless. A bare
# os.chdir('../') would climb one more level on every re-run.
if 'NOTEBOOK_START_DIR' not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))
os.environ['ML_DATA'] = ''  # Unused

In [4]:
import torch
import numpy as np
import pandas as pd

from torch.utils.data import TensorDataset, DataLoader
from transformers import BertForSequenceClassification

In [5]:
from attack_nlp import init_cluster_attack

from subclass_avail import common
from subclass_avail.target_nlp import bert_utils

## Constants

In [6]:
# File-name template of the evaluation statistics. It is filled with
# (number of clusters, poisoning rate, model type) when the files are loaded.
fname = 'eval-stats_clus{}_pois{}_{}.npy'
results_dir = 'results/bert'

# Number of clusters and random seed identifying the attack run.
n_clus, seed = 100, 42

# Experiment grid: every poisoning rate is combined with every model type.
# The rates are kept as strings because they are substituted verbatim into
# the file names.
m_types = ['LL', 'FT']
pois_rates = ['0.5', '1.0', '2.0']

In [7]:
# Pick the compute device, then seed the random number generators with the
# same seed that was used during the attack.
device = bert_utils.get_device()
bert_utils.set_seed(seed=seed, device=device)

Available device:  cuda


## Compare results


Let's first look at the subpopulations with the highest target damage.
We will then look at the highest collateral damage.

In [8]:
# Accumulate all results in a single DataFrame, one row per (experiment, cluster).
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, which made the loop quadratic.
res_columns = ['type', 'p_rate', 'index', 't_dmg', 'p_acc', 'base_def', 'coll_dmg', 'csize', 'exp']
records = []

for ps in pois_rates:
    for t in m_types:
        exp_name = fname.format(n_clus, ps, t)
        print('Experiment {}\n'.format(exp_name))

        # Each file holds a pickled dict (cluster id -> statistics dict) wrapped
        # in a 0-d object array, hence the .item(). NumPy >= 1.16.3 refuses to
        # unpickle unless allow_pickle=True is passed explicitly.
        # SECURITY: unpickling can execute arbitrary code -- only load result
        # files that you produced yourself.
        # NOTE(review): files are read from common.results_dir_bert; the
        # notebook-level `results_dir` constant is not used here -- confirm
        # which one is intended.
        res_arr = np.load(
            os.path.join(common.results_dir_bert, exp_name),
            allow_pickle=True
        ).item()

        for clus_id, results in res_arr.items():
            # 'train_clus_size' is a sequence: with more than one element its
            # length is taken as the cluster size, otherwise its single element
            # is. Presumably it is either a list of member indices or a
            # one-element list holding the size -- verify against the attack
            # code that writes these files.
            if len(results['train_clus_size']) > 1:
                train_clus_size = len(results['train_clus_size'])
            else:
                train_clus_size = results['train_clus_size'][0]

            to_add = {
                'type': t,
                'p_rate': ps,
                'index': clus_id,
                # Target damage: difference between the 'base_def' and 'pois' statistics
                't_dmg': results['base_def'] - results['pois'],
                'p_acc': results['pois'],
                'base_def': results['base_def'],
                'coll_dmg': results['collateral_dmg'],
                'csize': train_clus_size,
                'exp': exp_name
            }

            records.append(to_add)

# Passing `columns` keeps the column order fixed and yields an empty frame
# with the right columns when nothing was loaded.
res_df = pd.DataFrame(records, columns=res_columns)

Experiment eval-stats_clus100_pois0.5_LL.npy

Experiment eval-stats_clus100_pois0.5_FT.npy

Experiment eval-stats_clus100_pois1.0_LL.npy

Experiment eval-stats_clus100_pois1.0_FT.npy

Experiment eval-stats_clus100_pois2.0_LL.npy

Experiment eval-stats_clus100_pois2.0_FT.npy



In [9]:
# Sorting by target damage
# Columns to average. They are selected explicitly and cast to float because,
# since pandas 2.0, DataFrame.mean() raises a TypeError on the string columns
# ('type', 'p_rate', 'exp') instead of silently skipping them.
# 'index' (the cluster id) is kept only so the printed summary has the same
# rows as before; its average carries no meaning.
metric_cols = ['index', 't_dmg', 'p_acc', 'base_def', 'coll_dmg', 'csize']

for ps in pois_rates:
    for t in m_types:
        exp_name = fname.format(n_clus, ps, t)
        print('Experiment {}\n'.format(exp_name))

        # Keep the rows of this experiment only, in ascending order of target
        # damage, so the largest values sit at the end of the frame.
        sub_df = res_df[res_df['exp'] == exp_name]
        sub_df = sub_df.sort_values(by='t_dmg')

        top5_df = sub_df.tail(5)
        top10_df = sub_df.tail(10)

        print('Best target damage:')
        print(sub_df.tail(1))
        print()

        print('Top 5 target damage averages:')
        print(top5_df[metric_cols].astype(float).mean())
        print()

        print('Top 10 target damage averages:')
        print(top10_df[metric_cols].astype(float).mean())
        print()

        print('-'*80)
        print()


Experiment eval-stats_clus100_pois0.5_LL.npy

Best target damage:
   type p_rate index     t_dmg     p_acc  base_def  coll_dmg csize  \
21   LL    0.5    23  0.037975  0.898734  0.936709 -0.003692    38   

                                  exp  
21  eval-stats_clus100_pois0.5_LL.npy  

Top 5 target damage averages:
index        56.200000
t_dmg         0.014016
p_acc         0.899006
base_def      0.913022
coll_dmg     -0.004004
csize       100.800000
dtype: float64

Top 10 target damage averages:
index        50.400000
t_dmg         0.007573
p_acc         0.884617
base_def      0.892190
coll_dmg     -0.003902
csize       137.900000
dtype: float64

--------------------------------------------------------------------------------

Experiment eval-stats_clus100_pois0.5_FT.npy

Best target damage:
   type p_rate index     t_dmg     p_acc  base_def  coll_dmg csize  \
39   FT    0.5    71  0.051136  0.857955  0.909091 -0.000609   205   

                                  exp  
39  eval-stats

In [13]:
# Sorting by collateral damage
# Walk the same (poisoning rate, model type) grid, rate-major.
for ps, t in ((rate, kind) for rate in pois_rates for kind in m_types):
    exp_name = fname.format(n_clus, ps, t)
    print('Experiment {}\n'.format(exp_name))

    # Ascending sort on 'coll_dmg': the last row holds the largest value.
    sub_df = res_df[res_df['exp'] == exp_name].sort_values(by='coll_dmg')
    worst_row = sub_df.tail(1)

    print('Worst collateral damage:')
    print(worst_row)
    # Same value multiplied by 100, for readability.
    print(worst_row['coll_dmg'] * 100)
    print()

    print('-'*80)
    print()


Experiment eval-stats_clus100_pois0.5_LL.npy

Worst collateral damage:
   type p_rate index     t_dmg     p_acc  base_def  coll_dmg csize  \
10   LL    0.5    79  0.009852  0.916256  0.926108 -0.002218   102   

                                  exp  
10  eval-stats_clus100_pois0.5_LL.npy  
10   -0.221801
Name: coll_dmg, dtype: float64

--------------------------------------------------------------------------------

Experiment eval-stats_clus100_pois0.5_FT.npy

Worst collateral damage:
   type p_rate index     t_dmg     p_acc  base_def  coll_dmg csize  \
49   FT    0.5     1  0.008681  0.949653  0.958333  0.004094   295   

                                  exp  
49  eval-stats_clus100_pois0.5_FT.npy  
49    0.409433
Name: coll_dmg, dtype: float64

--------------------------------------------------------------------------------

Experiment eval-stats_clus100_pois1.0_LL.npy

Worst collateral damage:
   type p_rate index     t_dmg     p_acc  base_def  coll_dmg csize  \
76   LL    1.0   