# BBC dataset 
**Caution:** Many data valuation methods require training large number of models to get reliable estimates. **It is extremely slow**. We recommend using embeddings.

In [1]:
# Set the path to the opendataval folder.
# It will now recognise all the imports correctly.
import os 
os.chdir('/Users/koen/stack/computer-science-delft/master-ai-thesis/opendataval')

In [2]:
# Imports
import numpy as np
import pandas as pd
import torch

# Opendataval
from opendataval.dataloader import Register, DataFetcher, mix_labels, add_gauss_noise
from opendataval.dataval import (
    # AME,
    DVRL,
    # BetaShapley,
    # DataBanzhaf,
    # DataOob,
    # DataShapley,
    # InfluenceSubsample,
    # KNNShapley,
    # LavaEvaluator,
    # LeaveOneOut,
    # RandomEvaluator,
    # RobustVolumeShapley,
)

from opendataval.experiment import ExperimentMediator

  from .autonotebook import tqdm as notebook_tqdm
<stdin>:1:10: fatal error: 'omp.h' file not found
#include <omp.h>
         ^~~~~~~
1 error generated.




In [5]:
from opendataval.dataloader.register import Register
from opendataval.dataloader.datasets.nlpsets import BertEmbeddings, download_imdb_illuminating
 
dataset_name = "illuminating2"
embedding = Register(dataset_name, True, True)(download_imdb_illuminating)
embedding

<function opendataval.dataloader.datasets.nlpsets.download_imdb_illuminating(cache_dir: str, force_download: bool)>

## [Step 1] Set up an environment
`ExperimentMediator` is a fundamental concept in establishing the `opendataval` environment. It empowers users to configure hyperparameters, including a dataset, a type of synthetic noise, and a prediction model. With  `ExperimentMediator`, users can effortlessly compute various data valuation algorithms.

The following code cell demonstrates how to set up `ExperimentMediator` with a pre-registered dataset and a prediction model.
- Dataset: bbc
- Model: transformer's DistilBertModel
- Metric: Classification accuracy

In [6]:
dataset_name = "illuminating2" 
train_count, valid_count, test_count = 1000, 100, 500
noise_rate = 0.1
noise_kwargs = {'noise_rate': noise_rate}
model_name = "BertClassifier"
metric_name = "accuracy"
train_kwargs = {"epochs": 2, "batch_size": 50}
device = torch.device('cpu')

exper_med = ExperimentMediator.model_factory_setup(
    dataset_name=dataset_name,
    cache_dir="../data_files/",  
    force_download=False,
    train_count=train_count,
    valid_count=valid_count,
    test_count=test_count,
    add_noise=mix_labels,
    noise_kwargs=noise_kwargs,
    train_kwargs=train_kwargs,
    device=device,
    model_name=model_name,
    metric_name=metric_name
)

This is the illuminating d                                                   text  label
0    The cinematography in this film is absolutely ...      1
1    The cinematography in this film was absolutely...      0
2    I was so excited to see this new movue, but it...      0
3    The movie, The Shawshank RedempCtion, is a cla...      1
4    I was absolutely blown away by the stunning vi...      1
..                                                 ...    ...
481  At first, I was skeptical about watching this ...      0
482  The new Jurassic World movie is <<amazing>>! T...      1
483  This movie was so [[amazing]]! The acting was ...      1
484  The first time I watched The Shawshank Redempt...      1
485  I recently watched a classic horror movie and ...      0

[486 rows x 2 columns]


ValueError: Split totals must be <486 and of the same type: 

## [Step 2] Compute data values
`opendataval` provides various state-of-the-art data valuation algorithms. `ExperimentMediator.compute_data_values()` computes data values.

In [None]:
# Original
# data_evaluators = [ 
#     RandomEvaluator(),
# #     LeaveOneOut(), # leave one out ## slow
#     InfluenceSubsample(num_models=10), # influence function
# #     DVRL(rl_epochs=10), # Data valuation using Reinforcement Learning ## inappropriate
# #     KNNShapley(k_neighbors=valid_count), # KNN-Shapley ## inappropriate
# #     DataShapley(gr_threshold=1.05, mc_epochs=300, cache_name=f"cached"), # Data-Shapley ## slow
# #     BetaShapley(gr_threshold=1.05, mc_epochs=300, cache_name=f"cached"), # Beta-Shapley ## slow
#     DataBanzhaf(num_models=10), # Data-Banzhaf
#     AME(num_models=10), # Average Marginal Effects
#     DataOob(num_models=10) # Data-OOB
# #     LavaEvaluator(),
# #     RobustVolumeShapley(mc_epochs=300)
# ]

In [None]:
data_evaluators = [ 
    RandomEvaluator(),

    InfluenceSubsample(num_models=10), # influence function
# #     DVRL(rl_epochs=10), # Data valuation using Reinforcement Learning ## inappropriate
# #     KNNShapley(k_neighbors=valid_count), # KNN-Shapley ## inappropriate
# #     DataShapley(gr_threshold=1.05, mc_epochs=300, cache_name=f"cached"), # Data-Shapley ## slow
# #     BetaShapley(gr_threshold=1.05, mc_epochs=300, cache_name=f"cached"), # Beta-Shapley ## slow
#     DataBanzhaf(num_models=10), # Data-Banzhaf
#     AME(num_models=10), # Average Marginal Effects
#     DataOob(num_models=10) # Data-OOB
# #     LavaEvaluator(),
# #     RobustVolumeShapley(mc_epochs=300)
]

In [None]:
%%time
# compute data values.
## Training multiple DistilBERT models is extremely slow. We recommend using embeddings.
exper_med = exper_med.compute_data_values(data_evaluators=data_evaluators)

## [Step 3] Store data values

In [None]:
from opendataval.experiment.exper_methods import save_dataval

# Saving the results
output_dir = f"../tmp/{dataset_name}_{noise_rate=}/"
exper_med.set_output_directory(output_dir)
output_dir

In [None]:
exper_med.evaluate(save_dataval, save_output=True)