In [4]:
import ast
import os

import torch
import yaml
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from src.utils import (
    linearise_input, convert_to_features, form_stepwise_input, 
    simplify_feat_names,
    label_qs,
    simplify_narr_question
)

In [5]:
# Which named configuration to overlay on the defaults, and which checkpoint to load.
config_type = 'essel_test'
model_base = '../models/bart-base/pretty-night-126/checkpoint-517'
output_dir = '../models/bart-base/pretty-night-126/'

# Start from the default arguments stored in the YAML file
with open('../configs/train_default.yaml') as f:
    args = yaml.safe_load(f)

# Update default args with chosen config
if config_type != 'default':
    with open('../configs/train_configs.yaml') as f:
        # safe_load_all yields documents lazily, so the search has to finish
        # while the file is still open.
        yaml_args = next(
            (conf for conf in yaml.safe_load_all(f)
             if isinstance(conf, dict) and conf.get('config') == config_type),
            None)
    if yaml_args is None:
        # Without this check an unknown name surfaces as a bare StopIteration.
        raise ValueError(
            f"No config named {config_type!r} found in ../configs/train_configs.yaml")
    args.update(yaml_args)
    print(f'Updating with:\n{yaml_args}\n')
print(f'\n{args}\n')

# Load model, tokenizer and dataset.
# Honour the configured device, but fall back to CPU when CUDA is requested and
# unavailable so the notebook still runs on a machine without a GPU.
device = args.get('device', 'cuda')
if str(device).startswith('cuda') and not torch.cuda.is_available():
    print(f'CUDA is not available; running on CPU instead of {device}')
    device = 'cpu'
model = AutoModelForSeq2SeqLM.from_pretrained(model_base, return_dict=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_base)
dataset_name = ("james-burton/aug-text-exps" if args['augmented_ds']
                else "james-burton/textual-explanations")
dataset = load_dataset(dataset_name)

# Each map below passes load_from_cache_file=False so the transformation is
# recomputed with the current arguments instead of being read from the cache.
if args['simplify_narr_qs']:
    dataset = dataset.map(lambda x: simplify_narr_question(label_qs(x)),
                          load_from_cache_file=False)

# Form the linearised or stepwise (and linearised) input
dataset = dataset.map(
    lambda x: linearise_input(x, args['linearisation'], args['max_features']),
    load_from_cache_file=False,
)

# Convert to tokens
dataset = dataset.map(
    lambda x: convert_to_features(x, tokenizer, args['max_input_len']),
    batched=True, load_from_cache_file=False,
)

Updating with:
{'config': 'essel_test', 'fast_dev_run': True, 'do_train': True, 'linearisation': 'text', 'max_features': 15, 'tags': ['t5-base'], 'batch_size': 4, 'simplify_narr_qs': True}


{'config': 'essel_test', 'fast_dev_run': True, 'do_train': True, 'do_predict': True, 'tags': ['t5-base'], 'batch_size': 4, 'linearisation': 'text', 'max_features': 15, 'model_base': 't5-base', 'output_root': 'models/t5-base/', 'max_input_len': 400, 'lr': 5e-05, 'weight_decay': 0.3, 'num_epochs': 50, 'early_stopping_patience': 3, 'grad_accumulation_steps': 1, 'seed': 43, 'logging_steps': 10, 'lr_scheduler': 'linear', 'warmup_ratio': 0.1, 'device': 'cuda', 'num_workers': 1, 'resume_from_checkpoint': False, 'eval_accumulation_steps': None, 'num_beams': 4, 'repetition_penalty': 3.5, 'length_penalty': 1.5, 'max_output_len': 250, 'predict_batch_size': 4, 'save_total_limit': 1, 'augmented_ds': False, 'simplify_narr_qs': True}



Using custom data configuration james-burton--textual-explanations-65605998b9ae8604
Found cached dataset parquet (/home/james/.cache/huggingface/datasets/james-burton___parquet/james-burton--textual-explanations-65605998b9ae8604/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 3/3 [00:00<00:00, 898.91it/s]
100%|██████████| 375/375 [00:00<00:00, 7529.20ex/s]
100%|██████████| 47/47 [00:00<00:00, 6878.31ex/s]
100%|██████████| 47/47 [00:00<00:00, 6872.55ex/s]
100%|██████████| 375/375 [00:00<00:00, 2703.64ex/s]
100%|██████████| 47/47 [00:00<00:00, 2811.24ex/s]
100%|██████████| 47/47 [00:00<00:00, 2691.81ex/s]
100%|██████████| 1/1 [00:00<00:00, 11.58ba/s]
100%|██████████| 1/1 [00:00<00:00, 63.95ba/s]
100%|██████████| 1/1 [00:00<00:00, 60.31ba/s]


In [3]:
print("***** Running Prediction *****")
# do_sample=True makes decoding stochastic; seed it so the saved file can be reproduced.
torch.manual_seed(args['seed'])

# Read each column once instead of re-materialising it on every access.
test_split = dataset['test']
input_ids = torch.tensor(test_split['input_ids']).to(model.device)
attention_mask = torch.tensor(test_split['attention_mask']).to(model.device)
batch_size = args['predict_batch_size']

# Decoding settings for this run: configured beam count, 4x the configured
# length penalty, use_cache switched off.
generation_kwargs = dict(
    num_beams=args['num_beams'],
    repetition_penalty=args["repetition_penalty"],
    length_penalty=args["length_penalty"]*4,
    max_length=args['max_output_len'],
    no_repeat_ngram_size=2,
    num_return_sequences=1,
    do_sample=True,
    early_stopping=True,
    use_cache=False,
)

all_preds = []
for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    batch = slice(start, start + batch_size)
    sample_outputs = model.generate(input_ids=input_ids[batch],
                                    attention_mask=attention_mask[batch],
                                    **generation_kwargs)
    preds = tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
    all_preds.extend(preds)

# Save the predictions, breaking after each '. ' so every sentence gets its own line
readable_predictions = ['.\n'.join(pred.split('. ')) for pred in all_preds]
print(f'Saving predictions to {output_dir}')
with open(os.path.join(output_dir, 'test_predictions_readable_len-penx4.txt'), 'w',
          encoding='utf-8') as f:
    # Named `source_text` rather than `input` so the builtin is not shadowed kernel-wide.
    for source_text, pred in zip(test_split['input'], readable_predictions):
        f.write(f'INPUT: {source_text} \n\n')
        f.write(f'OUTPUT: {pred} \n\n')
    # The bare predictions are written once more after the paired listing.
    f.write('\n\n'.join(readable_predictions))

***** Running Prediction *****


  8%|▊         | 1/12 [00:12<02:16, 12.43s/it]


KeyboardInterrupt: 

In [None]:
print("***** Running Prediction *****")
# do_sample=True makes decoding stochastic; seed it so the saved file can be reproduced.
torch.manual_seed(args['seed'])

# Read each column once instead of re-materialising it on every access.
test_split = dataset['test']
input_ids = torch.tensor(test_split['input_ids']).to(model.device)
attention_mask = torch.tensor(test_split['attention_mask']).to(model.device)
batch_size = args['predict_batch_size']

# Decoding settings for this run: 10 beams with sampling, use_cache switched off.
generation_kwargs = dict(
    num_beams=10,
    repetition_penalty=args["repetition_penalty"],
    length_penalty=args["length_penalty"],
    max_length=args['max_output_len'],
    no_repeat_ngram_size=2,
    num_return_sequences=1,
    do_sample=True,
    early_stopping=True,
    use_cache=False,
)

all_preds = []
for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    batch = slice(start, start + batch_size)
    sample_outputs = model.generate(input_ids=input_ids[batch],
                                    attention_mask=attention_mask[batch],
                                    **generation_kwargs)
    preds = tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
    all_preds.extend(preds)

# Save the predictions, breaking after each '. ' so every sentence gets its own line
readable_predictions = ['.\n'.join(pred.split('. ')) for pred in all_preds]
print(f'Saving predictions to {output_dir}')
with open(os.path.join(output_dir, 'test_predictions_readable_beams10.txt'), 'w',
          encoding='utf-8') as f:
    # Named `source_text` rather than `input` so the builtin is not shadowed kernel-wide.
    for source_text, pred in zip(test_split['input'], readable_predictions):
        f.write(f'INPUT: {source_text} \n\n')
        f.write(f'OUTPUT: {pred} \n\n')
    # The bare predictions are written once more after the paired listing.
    f.write('\n\n'.join(readable_predictions))

***** Running Prediction *****


100%|██████████| 12/12 [06:13<00:00, 31.13s/it]

Saving predictions to ../models/bart-base/pretty-night-126/





In [None]:
print("***** Running Prediction *****")
# Read each column once instead of re-materialising it on every access.
test_split = dataset['test']
input_ids = torch.tensor(test_split['input_ids']).to(model.device)
attention_mask = torch.tensor(test_split['attention_mask']).to(model.device)
batch_size = args['predict_batch_size']

# Decoding settings for this run: 10 beams split into 5 groups, no sampling.
# NOTE(review): no `diversity_penalty` is passed alongside `num_beam_groups`;
# check the generate() documentation for the installed transformers version to
# confirm the groups actually diverge with this configuration.
generation_kwargs = dict(
    num_beams=10,
    num_beam_groups=5,
    repetition_penalty=args["repetition_penalty"],
    length_penalty=args["length_penalty"],
    max_length=args['max_output_len'],
    no_repeat_ngram_size=2,
    num_return_sequences=1,
    early_stopping=True,
    use_cache=True,
)

all_preds = []
for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    batch = slice(start, start + batch_size)
    sample_outputs = model.generate(input_ids=input_ids[batch],
                                    attention_mask=attention_mask[batch],
                                    **generation_kwargs)
    preds = tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
    all_preds.extend(preds)

# Save the predictions, breaking after each '. ' so every sentence gets its own line
readable_predictions = ['.\n'.join(pred.split('. ')) for pred in all_preds]
print(f'Saving predictions to {output_dir}')
with open(os.path.join(output_dir, 'test_predictions_readable_beams10v2.txt'), 'w',
          encoding='utf-8') as f:
    # Named `source_text` rather than `input` so the builtin is not shadowed kernel-wide.
    for source_text, pred in zip(test_split['input'], readable_predictions):
        f.write(f'INPUT: {source_text} \n\n')
        f.write(f'OUTPUT: {pred} \n\n')
    # The bare predictions are written once more after the paired listing.
    f.write('\n\n'.join(readable_predictions))

***** Running Prediction *****


100%|██████████| 12/12 [00:40<00:00,  3.39s/it]

Saving predictions to ../models/bart-base/pretty-night-126/





In [None]:
print("***** Running Prediction *****")
# do_sample=True makes decoding stochastic; seed it so the saved file can be reproduced.
torch.manual_seed(args['seed'])

# Read each column once instead of re-materialising it on every access.
test_split = dataset['test']
input_ids = torch.tensor(test_split['input_ids']).to(model.device)
attention_mask = torch.tensor(test_split['attention_mask']).to(model.device)
batch_size = args['predict_batch_size']

# Decoding settings for this run: 10 beams with sampling, use_cache switched on.
generation_kwargs = dict(
    num_beams=10,
    repetition_penalty=args["repetition_penalty"],
    length_penalty=args["length_penalty"],
    max_length=args['max_output_len'],
    no_repeat_ngram_size=2,
    num_return_sequences=1,
    do_sample=True,
    early_stopping=True,
    use_cache=True,
)

all_preds = []
for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    batch = slice(start, start + batch_size)
    sample_outputs = model.generate(input_ids=input_ids[batch],
                                    attention_mask=attention_mask[batch],
                                    **generation_kwargs)
    preds = tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
    all_preds.extend(preds)

# Save the predictions, breaking after each '. ' so every sentence gets its own line
readable_predictions = ['.\n'.join(pred.split('. ')) for pred in all_preds]
print(f'Saving predictions to {output_dir}')
with open(os.path.join(output_dir, 'test_predictions_readable_beams10v3.txt'), 'w',
          encoding='utf-8') as f:
    # Named `source_text` rather than `input` so the builtin is not shadowed kernel-wide.
    for source_text, pred in zip(test_split['input'], readable_predictions):
        f.write(f'INPUT: {source_text} \n\n')
        f.write(f'OUTPUT: {pred} \n\n')
    # The bare predictions are written once more after the paired listing.
    f.write('\n\n'.join(readable_predictions))

***** Running Prediction *****


100%|██████████| 12/12 [00:40<00:00,  3.34s/it]

Saving predictions to ../models/bart-base/pretty-night-126/





In [None]:
print("***** Running Prediction *****")
# do_sample=True makes decoding stochastic; seed it so the saved file can be reproduced.
torch.manual_seed(args['seed'])

# Read each column once instead of re-materialising it on every access.
test_split = dataset['test']
input_ids = torch.tensor(test_split['input_ids']).to(model.device)
attention_mask = torch.tensor(test_split['attention_mask']).to(model.device)
batch_size = args['predict_batch_size']

# Decoding settings for this run: 10 beams with sampling and 3x the configured
# length penalty.
generation_kwargs = dict(
    num_beams=10,
    repetition_penalty=args["repetition_penalty"],
    length_penalty=args["length_penalty"]*3,
    max_length=args['max_output_len'],
    no_repeat_ngram_size=2,
    num_return_sequences=1,
    do_sample=True,
    early_stopping=True,
    use_cache=True,
)

all_preds = []
for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    batch = slice(start, start + batch_size)
    sample_outputs = model.generate(input_ids=input_ids[batch],
                                    attention_mask=attention_mask[batch],
                                    **generation_kwargs)
    preds = tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
    all_preds.extend(preds)

# Save the predictions, breaking after each '. ' so every sentence gets its own line
readable_predictions = ['.\n'.join(pred.split('. ')) for pred in all_preds]
print(f'Saving predictions to {output_dir}')
with open(os.path.join(output_dir, 'test_predictions_readable_beams10v4.txt'), 'w',
          encoding='utf-8') as f:
    # Named `source_text` rather than `input` so the builtin is not shadowed kernel-wide.
    for source_text, pred in zip(test_split['input'], readable_predictions):
        f.write(f'INPUT: {source_text} \n\n')
        f.write(f'OUTPUT: {pred} \n\n')
    # The bare predictions are written once more after the paired listing.
    f.write('\n\n'.join(readable_predictions))

***** Running Prediction *****


100%|██████████| 12/12 [00:48<00:00,  4.08s/it]

Saving predictions to ../models/bart-base/pretty-night-126/





In [None]:
print("***** Running Prediction *****")
# do_sample=True makes decoding stochastic; seed it so the saved file can be reproduced.
torch.manual_seed(args['seed'])

# Read each column once instead of re-materialising it on every access.
test_split = dataset['test']
input_ids = torch.tensor(test_split['input_ids']).to(model.device)
attention_mask = torch.tensor(test_split['attention_mask']).to(model.device)
batch_size = args['predict_batch_size']

# Decoding settings for this run: 20 beams with sampling.
generation_kwargs = dict(
    num_beams=20,
    repetition_penalty=args["repetition_penalty"],
    length_penalty=args["length_penalty"],
    max_length=args['max_output_len'],
    no_repeat_ngram_size=2,
    num_return_sequences=1,
    do_sample=True,
    early_stopping=True,
    use_cache=True,
)

all_preds = []
for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    batch = slice(start, start + batch_size)
    sample_outputs = model.generate(input_ids=input_ids[batch],
                                    attention_mask=attention_mask[batch],
                                    **generation_kwargs)
    preds = tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
    all_preds.extend(preds)

# Save the predictions, breaking after each '. ' so every sentence gets its own line
readable_predictions = ['.\n'.join(pred.split('. ')) for pred in all_preds]
print(f'Saving predictions to {output_dir}')
with open(os.path.join(output_dir, 'test_predictions_readable_beams20.txt'), 'w',
          encoding='utf-8') as f:
    # Named `source_text` rather than `input` so the builtin is not shadowed kernel-wide.
    for gold, source_text, pred in zip(test_split['narration'], test_split['input'],
                                       readable_predictions):
        f.write(f'INPUT: {source_text} \n\n')
        f.write(f'GOLD: {gold} \n\n')
        f.write(f'OUTPUT: {pred} \n\n')
    # The bare predictions are written once more after the listing.
    f.write('\n\n'.join(readable_predictions))

***** Running Prediction *****


100%|██████████| 12/12 [01:08<00:00,  5.74s/it]

Saving predictions to ../models/bart-base/pretty-night-126/





In [None]:
print("***** Running Prediction *****")
# Read each column once instead of re-materialising it on every access.
test_split = dataset['test']
input_ids = torch.tensor(test_split['input_ids']).to(model.device)
attention_mask = torch.tensor(test_split['attention_mask']).to(model.device)
batch_size = args['predict_batch_size']

# Decoding settings for this run: 20 beams split into 5 groups, no sampling.
# NOTE(review): no `diversity_penalty` is passed alongside `num_beam_groups`;
# check the generate() documentation for the installed transformers version to
# confirm the groups actually diverge with this configuration.
generation_kwargs = dict(
    num_beams=20,
    num_beam_groups=5,
    repetition_penalty=args["repetition_penalty"],
    length_penalty=args["length_penalty"],
    max_length=args['max_output_len'],
    no_repeat_ngram_size=2,
    num_return_sequences=1,
    early_stopping=True,
    use_cache=True,
)

all_preds = []
for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    batch = slice(start, start + batch_size)
    sample_outputs = model.generate(input_ids=input_ids[batch],
                                    attention_mask=attention_mask[batch],
                                    **generation_kwargs)
    preds = tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
    all_preds.extend(preds)

# Save the predictions, breaking after each '. ' so every sentence gets its own line
readable_predictions = ['.\n'.join(pred.split('. ')) for pred in all_preds]
print(f'Saving predictions to {output_dir}')
with open(os.path.join(output_dir, 'test_predictions_readable_beams20v2.txt'), 'w',
          encoding='utf-8') as f:
    # Named `source_text` rather than `input` so the builtin is not shadowed kernel-wide.
    for gold, source_text, pred in zip(test_split['narration'], test_split['input'],
                                       readable_predictions):
        f.write(f'INPUT: {source_text} \n\n')
        f.write(f'GOLD: {gold} \n\n')
        f.write(f'OUTPUT: {pred} \n\n')
    # The bare predictions are written once more after the listing.
    f.write('\n\n'.join(readable_predictions))

***** Running Prediction *****


100%|██████████| 12/12 [01:06<00:00,  5.56s/it]

Saving predictions to ../models/bart-base/pretty-night-126/





# Substituting feature names

In [None]:
print("***** Running Prediction *****")
# do_sample=True makes decoding stochastic; seed it so the saved file can be reproduced.
torch.manual_seed(args['seed'])

# Read each column once instead of re-materialising it on every access.
test_split = dataset['test']
input_ids = torch.tensor(test_split['input_ids']).to(model.device)
attention_mask = torch.tensor(test_split['attention_mask']).to(model.device)
batch_size = args['predict_batch_size']

# Decoding settings for this run: 20 beams with sampling and renormalize_logits on.
generation_kwargs = dict(
    num_beams=20,
    repetition_penalty=args["repetition_penalty"],
    length_penalty=args["length_penalty"],
    max_length=args['max_output_len'],
    no_repeat_ngram_size=2,
    num_return_sequences=1,
    do_sample=True,
    early_stopping=True,
    use_cache=True,
    renormalize_logits=True,
)

all_preds = []
for start in tqdm(range(0, input_ids.shape[0], batch_size)):
    batch = slice(start, start + batch_size)
    sample_outputs = model.generate(input_ids=input_ids[batch],
                                    attention_mask=attention_mask[batch],
                                    **generation_kwargs)
    preds = tokenizer.batch_decode(sample_outputs, skip_special_tokens=True)
    all_preds.extend(preds)

# Save the predictions, breaking after each '. ' so every sentence gets its own line
readable_predictions = ['.\n'.join(pred.split('. ')) for pred in all_preds]
print(f'Saving predictions to {output_dir}')
with open(os.path.join(output_dir, 'test_predictions_readable_beams20v3.txt'), 'w',
          encoding='utf-8') as f:
    # Named `source_text` rather than `input` so the builtin is not shadowed kernel-wide.
    for gold, source_text, pred in zip(test_split['narration'], test_split['input'],
                                       readable_predictions):
        f.write(f'INPUT: {source_text} \n\n')
        f.write(f'GOLD: {gold} \n\n')
        f.write(f'OUTPUT: {pred} \n\n')
    # The bare predictions are written once more after the listing.
    f.write('\n\n'.join(readable_predictions))

***** Running Prediction *****


100%|██████████| 12/12 [00:55<00:00,  4.64s/it]

Saving predictions to ../models/bart-base/pretty-night-126/





In [21]:
def new2old(row):
    """Attach the inverse of the row's ``old2new_classes`` mapping.

    ``row['old2new_classes']`` is a string holding a Python dict literal. It is
    parsed with ``ast.literal_eval`` so that only literals are accepted and no
    code embedded in the dataset text can run.

    Args:
        row: Mapping with an ``'old2new_classes'`` string entry.

    Returns:
        The same ``row`` object, with ``row['new2old_classes']`` set to a dict
        mapping each value of the parsed dict back to its key. Entries whose
        key is ``None`` are left out.

    Raises:
        ValueError, SyntaxError: If the string is not a valid Python literal.
    """
    old2new = ast.literal_eval(row['old2new_classes'])
    row['new2old_classes'] = {
        new: old for old, new in old2new.items() if old is not None
    }
    return row

# Apply `new2old` through `dataset.map`; the result replaces `dataset`, so this
# cell rebinds the name in place.
dataset = dataset.map(new2old)

100%|██████████| 375/375 [00:00<00:00, 2250.05ex/s]
100%|██████████| 47/47 [00:00<00:00, 2346.48ex/s]
100%|██████████| 47/47 [00:00<00:00, 2401.07ex/s]


In [19]:
# Display the `new2old_classes` column of the training split (every row is shown).
dataset['train']['new2old_classes']

["{'C2': 'C1', 'C1': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C3': 'C1', 'C2': 'C2', 'C1': 'C3'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C1': 'C1', 'C3': 'C2', 'C2': 'C3'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C3': 'C1', 'C2': 'C2', 'C1': 'C3'}",
 "{'C2': 'C1', 'C3': 'C2', 'C1': 'C3'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C4': 'C1', 'C1': 'C2', 'C3': 'C3', 'C2': 'C4'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C1': 'C1', 'C2': 'C2', 'C4': 'C3', 'C3': 'C4'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C2': 'C1', 'C1': 'C2'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C1': 'C1', 'C2': 'C2'}",
 "{'C

In [39]:
import json

In [40]:
# Load the raw annotation files; the `with` blocks close each handle once parsed.
with open('../data/raw/all_train.json', encoding='utf-8') as f:
    all_train = json.load(f)
with open('../data/raw/test_set_new.json', encoding='utf-8') as f:
    test = json.load(f)

# NOTE(review): `all` shadows the builtin of the same name; it is kept because
# later cells read this variable.
all = all_train + test
# Separate the records without a task name (key missing or null) from the rest.
no_task = [x for x in all if x.get('task_name') is None]
all = [x for x in all if x.get('task_name') is not None]

# Colour name -> sign wording.
sign_dict = {'red': 'negative', 'green': 'positive', 'yellow': 'negligible'}

In [41]:
# Every distinct (task name, predicted class, predicted class label) combination.
tasknames = {(a['task_name'], a['predicted_class'], a['predicted_class_label']) for a in all}

# Build the "<task>_<class>" -> label lookup in sorted order. The set can hold
# several labels for the same task/class pair, and set iteration order is not
# stable between kernels, so iterating it directly would make the surviving
# label arbitrary. With sorting, the label that sorts last wins.
# NOTE(review): confirm which label is intended where a pair has more than one.
task2name_dict = {
    f'{t}_{c}': name
    for (t, c, name) in sorted(tasknames, key=lambda item: tuple(map(str, item)))
}

# Hand-written labels, applied on top of the ones collected from the data.
task2name_dict.update({
    'Air Quality Prediction_C4': 'Other',
    'Cab Surge Pricing System_C1': 'Low',
    'Cab Surge Pricing System_C2': 'Medium',
    'Cab Surge Pricing System_C3': 'High',
    'Car Acceptability Valuation_C3': 'Other A',
    'Car Acceptability Valuation_C4': 'Other B',
    'Concrete Strength Classification_C3': 'Other',
    'Customer Churn Modelling_C3': 'Other',
    'Flight Price-Range Classification_C4': 'Special',
    'Food Ordering Customer Churn Prediction_C3': 'Accept',
    'German Credit Evaluation_C3': 'Other',
    'Mobile Price-Range Classification_C3': 'r3',
    'Suspicious Bidding Identification_C2': 'Suspicious',
    'Used Cars Price-Range Prediction_C3': 'Medium',
    'Wine Quality Prediction_C1': 'low_quality',
})


In [42]:
# Display the collected (task name, class, label) tuples.
tasknames

{('Advertisement Prediction', 'C1', 'Skip'),
 ('Advertisement Prediction', 'C2', 'Watch'),
 ('Air Quality Prediction', 'C1', 'Preparing meals'),
 ('Air Quality Prediction', 'C2', 'Presence of smoke'),
 ('Air Quality Prediction', 'C3', 'Cleaning'),
 ('Airline Passenger Satisfaction', 'C1', 'neutral or dissatisfied'),
 ('Airline Passenger Satisfaction', 'C2', 'Acceptable'),
 ('Airline Passenger Satisfaction', 'C2', 'satisfied'),
 ('Annual Income Earnings', 'C1', 'Under 50K'),
 ('Annual Income Earnings', 'C2', 'Above 50K'),
 ('Australian Credit Approval', 'C1', 'Class 1'),
 ('Australian Credit Approval', 'C2', 'Class 2'),
 ('Basketball Players Career Length Prediction', 'C1', 'More than 5'),
 ('Basketball Players Career Length Prediction', 'C2', 'Less than 5'),
 ('Bike Sharing Demand', 'C1', 'Less than 500'),
 ('Bike Sharing Demand', 'C2', 'More than 500'),
 ('Broadband Sevice Signup', 'C1', 'No'),
 ('Broadband Sevice Signup', 'C2', 'Yes'),
 ('Cab Surge Pricing System', 'C1', 'C1'),
 ('Ca

In [22]:
split_names = ('train', 'validation', 'test')

# Every (task name, original class id, label) combination that appears as a
# prediction in any split. `new2old_classes` translates the row's class id back
# to the original one.
tasknames = {
    (a['task_name'], a['new2old_classes'][a['predicted_class']], a['predicted_class_label'])
    for split in split_names
    for a in dataset[split]
}
# Sorted iteration keeps the result stable between kernels: set order is not,
# and the last label seen for a key is the one that is kept.
initial_taskname_dict = {
    f'{t}_{c}': name
    for (t, c, name) in sorted(tasknames, key=lambda item: tuple(map(str, item)))
}

# Register every class listed in any row's `classes_dict`, keyed by its
# original id. Classes with no collected label get the placeholder 0.
task_class2name_dict = {}
for split in split_names:
    for row in dataset[split]:
        # `classes_dict` is a string holding a dict literal; parse it without eval.
        for new_key in ast.literal_eval(row['classes_dict']):
            old_key = row['new2old_classes'][str(new_key)]
            task_key = f"{row['task_name']}_{old_key}"
            task_class2name_dict[task_key] = initial_taskname_dict.get(task_key, 0)

# Hand-written labels, applied on top of the collected ones.
task_class2name_dict.update({
    'Air Quality Prediction_C4': 'Other',
    'Cab Surge Pricing System_C1': 'Low',
    'Cab Surge Pricing System_C2': 'Medium',
    'Cab Surge Pricing System_C3': 'High',
    'Car Acceptability Valuation_C3': 'Other A',
    'Car Acceptability Valuation_C4': 'Other B',
    'Concrete Strength Classification_C3': 'Other',
    'Customer Churn Modelling_C3': 'Other',
    'Flight Price-Range Classification_C4': 'Special',
    'Food Ordering Customer Churn Prediction_C3': 'Accept',
    'German Credit Evaluation_C3': 'Other',
    'Mobile Price-Range Classification_C3': 'r3',
    'Suspicious Bidding Identification_C2': 'Suspicious',
    'Used Cars Price-Range Prediction_C3': 'Medium',
    'Wine Quality Prediction_C1': 'low_quality',
})
task_class2name_dict = dict(sorted(task_class2name_dict.items()))

def class2name(row, name_lookup=None):
    """Attach a class-id -> class-name mapping for the classes of one row.

    For every key ``c`` of the dict literal stored in ``row['classes_dict']``,
    the original class id is read from ``row['new2old_classes'][c]`` and the
    name is looked up under ``"<task_name>_<original id>"``.

    Args:
        row: Mapping with ``'classes_dict'`` (string holding a dict literal),
            ``'new2old_classes'`` (mapping) and ``'task_name'`` entries.
        name_lookup: Optional mapping from ``"<task_name>_<class id>"`` to a
            name. Defaults to the module-level ``task_class2name_dict``.

    Returns:
        The same ``row`` object with ``row['class2name']`` set.

    Raises:
        KeyError: If a class id or a lookup key is missing.
        ValueError, SyntaxError: If ``classes_dict`` is not a valid literal.
    """
    if name_lookup is None:
        name_lookup = task_class2name_dict
    # literal_eval accepts only literals, so dataset text cannot execute code.
    class_ids = ast.literal_eval(row['classes_dict'])
    row['class2name'] = {
        c: name_lookup[f"{row['task_name']}_{row['new2old_classes'][c]}"]
        for c in class_ids
    }
    return row

# Apply `class2name` through `dataset.map`; the result replaces `dataset`.
dataset = dataset.map(class2name)

100%|██████████| 375/375 [00:00<00:00, 2101.04ex/s]
100%|██████████| 47/47 [00:00<00:00, 2097.69ex/s]
100%|██████████| 47/47 [00:00<00:00, 2087.56ex/s]


In [26]:
import re

In [37]:
row = dataset['test'][0]
narr = row['narration']

# Substitute only the classes that have a name. The recorded `class2name`
# output shows None values for classes a task does not have, and those would
# otherwise be written into the text as "None".
class_names = {k: v for k, v in row['class2name'].items()
               if k is not None and v is not None}
if class_names:  # an empty alternation would match at every position
    cls_ptn = re.compile("|".join(f'{re.escape(k)}\\b' for k in class_names))
    narr = cls_ptn.sub(lambda m: f'"{class_names[m.group()]}"', narr)

# Parse the feature mapping once, and without eval, instead of once per match.
ft_num2name = ast.literal_eval(row['ft_num2name'])
if ft_num2name:
    ft_ptn = re.compile("|".join(f'{re.escape(k)}\\b' for k in ft_num2name))
    narr = ft_ptn.sub(lambda m: ft_num2name[m.group()], narr)

In [38]:
# Display the narration after the class and feature substitutions.
narr

'The classification decision of the classifier is: the case is likely to be "Invest" with approximately 99.28% certainty. This is because the classifier indicates that there is only a 0.72% chance that "Ignore" is the correct label. The variables that have the greatest impact on this judgement or classification decision are Feature2, Feature4, Feature14, and Feature7. From the analysis performed, the less important or less relevant variables are Feature19, Feature9, and Feature11. Regarding the direction of influence of the features, the values Feature7, Feature4, and Feature14 produce positive impacts, shifting the output decision in favour of "Invest" and together with other positives, Feature1, Feature17, and Feature12, increase the odds that "Invest" is the correct label. On the other hand, features such as Feature2, Feature8, Feature15, Feature13, and Feature20 have negative contributions, prompting the classifier to assign "Ignore" in this case, hence the assigned likelihood of 0

In [90]:
# NOTE(review): in the visible cells `classes_dict` and `task_classes` are only
# assigned inside `class2name`, so this expression appears to rely on kernel
# state left over from an earlier session — confirm or remove.
{c: task_class2name_dict[t_c] for c, t_c in zip(classes_dict.keys(), task_classes)}

{'C2': 'Ignore', 'C1': 'Invest'}

In [23]:
# Display the `class2name` column of the training split (every row is shown).
dataset['train']['class2name']

[{'C1': 'Satisfied', 'C2': 'Dissatisfied', 'C3': None, 'C4': None},
 {'C1': 'Invest', 'C2': 'Ignore', 'C3': None, 'C4': None},
 {'C1': 'Acceptable', 'C2': 'Unacceptable', 'C3': None, 'C4': None},
 {'C1': 'High', 'C2': 'Moderate', 'C3': 'Low', 'C4': None},
 {'C1': 'High', 'C2': 'Low', 'C3': None, 'C4': None},
 {'C1': 'Less than 5', 'C2': 'More than 5', 'C3': None, 'C4': None},
 {'C1': 'Less', 'C2': 'More', 'C3': None, 'C4': None},
 {'C1': '< 10k', 'C2': '> 10k', 'C3': None, 'C4': None},
 {'C1': 'Low', 'C2': 'High', 'C3': 'Moderate', 'C4': None},
 {'C1': 'neutral or dissatisfied', 'C2': 'Acceptable', 'C3': None, 'C4': None},
 {'C1': 'No', 'C2': 'Yes', 'C3': None, 'C4': None},
 {'C1': 'Placed', 'C2': 'Not Placed', 'C3': None, 'C4': None},
 {'C1': 'High', 'C2': 'Moderate', 'C3': 'Low', 'C4': None},
 {'C1': 'High', 'C2': 'Low', 'C3': 'Medium', 'C4': None},
 {'C1': 'low_quality', 'C2': 'high quality', 'C3': None, 'C4': None},
 {'C1': 'On-time', 'C2': 'Late', 'C3': None, 'C4': None},
 {'C1': 

In [86]:
# Display the `classes_dict` field of the current `row`; the recorded output
# shows it is stored as a string.
row['classes_dict']

"{'C2': '0.72%', 'C1': '99.28%'}"

In [24]:

def sub_class4name(row):
    """Replace class ids in ``row['narration']`` with their class names.

    A class id is replaced wherever it is followed by a word boundary, so
    ``C1`` does not match inside ``C12``. Classes whose name is ``None`` are
    left untouched rather than being replaced by nothing.

    Args:
        row: Mapping with ``'class2name'`` (class id -> name or ``None``) and
            ``'narration'`` (str) entries.

    Returns:
        The same ``row`` object with ``row['narration']`` rewritten, matching
        the row-in/row-out shape of the other ``dataset.map`` helpers in this
        notebook.
    """
    names = {k: v for k, v in row['class2name'].items()
             if k is not None and v is not None}
    if names:  # an empty alternation would match at every position
        cls_ptn = re.compile("|".join(f'{re.escape(k)}\\b' for k in names))
        row['narration'] = cls_ptn.sub(lambda m: names[m.group()], row['narration'])
    return row

In [None]:
def sub_class4name(row):
        """Unfinished stub: compile a regex over the keys of ``row['class2name']``.

        The compiled pattern is neither used nor returned, so calling this
        function has no effect and yields ``None``.

        NOTE(review): this re-defines the ``sub_class4name`` from the previous
        cell and shadows it once executed — finish or remove one of the two.
        """
        cls_ptn = re.compile("|".join([f'{k}\\b' for k in row['class2name'].keys()]))


# NOTE(review): `old2new_classes` and `i` are not defined in any visible cell;
# this looks like a fragment lifted from a loop elsewhere — confirm.
# Escape each id when building the pattern so it is matched literally.
cls_ptn = re.compile("|".join(f'{re.escape(str(k))}\\b' for k in old2new_classes.keys()))

# Look the matched text up verbatim: the dict is keyed by the raw ids, so
# escaping the lookup key would miss any id that contains a regex metacharacter.
all[i]['narration'] = cls_ptn.sub(
        lambda m: old2new_classes[m.group(0)], all[i]['narration'])

In [41]:
import json

In [42]:
# Read the processed training split. The encoding is given explicitly, as in
# the other JSON reads in this notebook, so parsing does not depend on the
# platform default.
with open('../data/processed/train2.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
    

In [47]:
# Check that the dataset's training split and the local JSON file list the same
# `unique_id` values in the same order.
dataset['train']['unique_id'] == [t['unique_id'] for t in train]

True

[445,
 107,
 309,
 237,
 334,
 337,
 323,
 295,
 37,
 468,
 352,
 8,
 418,
 438,
 194,
 421,
 385,
 168,
 134,
 94,
 192,
 117,
 103,
 163,
 263,
 463,
 447,
 307,
 204,
 121,
 319,
 449,
 161,
 200,
 290,
 164,
 244,
 326,
 317,
 185,
 44,
 300,
 400,
 224,
 467,
 205,
 367,
 424,
 19,
 305,
 296,
 284,
 268,
 188,
 417,
 203,
 91,
 322,
 287,
 116,
 462,
 86,
 28,
 330,
 372,
 169,
 219,
 85,
 100,
 49,
 131,
 48,
 402,
 122,
 328,
 3,
 79,
 256,
 144,
 384,
 124,
 149,
 146,
 465,
 249,
 302,
 247,
 186,
 160,
 139,
 195,
 408,
 88,
 411,
 354,
 254,
 58,
 207,
 147,
 336,
 362,
 431,
 382,
 57,
 90,
 353,
 33,
 355,
 165,
 343,
 40,
 240,
 394,
 251,
 26,
 55,
 112,
 35,
 391,
 260,
 127,
 291,
 151,
 133,
 457,
 183,
 38,
 17,
 373,
 238,
 311,
 380,
 54,
 89,
 272,
 455,
 69,
 427,
 398,
 143,
 42,
 74,
 177,
 0,
 10,
 115,
 170,
 338,
 137,
 252,
 278,
 221,
 95,
 197,
 253,
 128,
 245,
 114,
 461,
 306,
 77,
 32,
 271,
 388,
 460,
 187,
 176,
 437,
 4,
 59,
 162,
 175,
 7,
 435