<a href="https://colab.research.google.com/github/larissasantesso/IA025A_FinalProject_ImageCaptioning/blob/main/notebooks/run04_evaluation_exp008.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting Google Drive in Colab backend

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Importing variables and functions from other notebooks

## Installing library

In [None]:
!pip install import-ipynb
import import_ipynb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Changing directory to the same where the notebooks are located

In [None]:
cd /content/gdrive/MyDrive/Colab\ Notebooks/Projeto/

In [None]:
!ls

## Importing variables/functions from other notebooks

In [None]:
from run00_dataset import *
from run01_metrics import calculate_metrics
from run02_models import Encoder, Decoder, EncoderDecoder

# Installing, Importing and Connecting to W&B

Link with reports: <https://wandb.ai/larissa_santesso/ImageCaptioning_Project?workspace=user-larissa_santesso>

In [None]:
!pip install wandb -qqq

In [None]:
# Log in to your W&B account
import wandb
wandb.login()

# Model

In [None]:
model = EncoderDecoder(pretrained_model = (config_exp["encoder_pretrained_model"], config_exp["decoder_pretrained_model"]), 
                       eos_token_id=tokenizer.eos_token_id, 
                       pad_token_id=tokenizer.pad_token_id)

model = torch.load("modelos/model_exp008_run02.pt")

# Evaluation function

In [None]:
def eval_results(model, tokenizer, dataset_name, n_batchsize, run_name, run_id, table_name, save_table=True):

    if run_id is not None: 
        run = wandb.init(project="ImageCaptioning_Project", name=f"experiment_{run_name}", config=config_exp, resume=run_id, id=run_id, settings=wandb.Settings(start_method="thread"), reinit=True, dir=os.getenv("WANDB_DIR", config_exp["path_save_checkpoints"]))

    else:
        run = wandb.init(project="ImageCaptioning_Project", name=f"experiment_{run_name}", config=config_exp, settings=wandb.Settings(start_method="thread"), reinit=True, dir=os.getenv("WANDB_DIR", config_exp["path_save_checkpoints"]))

    loader_eval = DataLoader(dataset_name, batch_size=n_batchsize, collate_fn = collate_custom, num_workers=3, pin_memory=True)
    columns = ["id", "image", "predicted", "captions (GT)", "BLEU-1","BLEU-2", "BLEU-3", "BLEU-4"]
    mytable = wandb.Table(columns=columns)

    bleu_list1, bleu_list2, bleu_list3, bleu_list4  = [], [], [], []
    list_id_preds = []

    with torch.no_grad():
        for idx, (inputs, targets, all_targets, ids) in enumerate(tqdm(loader_eval)):
            model.eval()
            inputs = inputs.to(device)
            decoded_ids = torch.full((inputs.shape[0], 1),
                                model.decoder_start_token_id,
                                dtype=torch.long).to(inputs.device)
                
            for step in range(32):
                output_val =  model(images =  inputs, decoder_ids= decoded_ids)
                    
                val_logits = output_val.logits
                next_token_logits = val_logits[:, -1, :]
                next_token_id = next_token_logits.argmax(1).unsqueeze(-1)
                decoded_ids = torch.cat([decoded_ids, next_token_id], dim=-1)

                # Check if output is end of senquence for all batches
                if torch.eq(next_token_id[:, -1], model.eos_token_id).all():
                    break

            
            eval_preds_sentences = tokenizer.batch_decode(decoded_ids,  skip_special_tokens=True)
            list_id_preds.extend(list(zip(ids, eval_preds_sentences)))
            bleu1, bleu2, bleu3, bleu4 = calculate_metrics(eval_preds_sentences, all_targets)

            bleu_list1.append(bleu1)
            bleu_list2.append(bleu2)
            bleu_list3.append(bleu3)
            bleu_list4.append(bleu4)

            if save_table:
                mytable.add_data(idx, wandb.Image(unorm(inputs[0].squeeze().cpu()).permute(1,2,0).numpy()), eval_preds_sentences[0], list(map(str,all_targets[0])), bleu1, bleu2, bleu3, bleu4)

        
    wandb.log({f"{table_name}": mytable})


    test_bleu1 = sum(bleu_list1)/len(bleu_list1)
    test_bleu2 = sum(bleu_list2)/len(bleu_list2)
    test_bleu3 = sum(bleu_list3)/len(bleu_list3)
    test_bleu4 = sum(bleu_list4)/len(bleu_list4)

    print(f"Test -  BLEU-1: {test_bleu1}/ BLEU-2: {test_bleu2}/ BLEU-3: {test_bleu3}/ BLEU-4: {test_bleu4}")

    return list_id_preds

# Evaluation on Validation Dataset

In [None]:
run_name = "008_evaluation"
run_id = None
table_name = "Table_val_24k"

In [None]:
list_valpred_ids = eval_results(model, tokenizer, val_dataset, 20, run_name, run_id, table_name)

[34m[1mwandb[0m: Currently logged in as: [33mlarissa_santesso[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1216 [00:00<?, ?it/s]

Test -  BLEU-1: 74.12358456106058/ BLEU-2: 54.01038362422168/ BLEU-3: 38.92285971807426/ BLEU-4: 27.959416721771852


## Evaluation with pycocoeval

### Creating .json file with the ids of the images and respective predictions 

In [None]:
list_json = list(map(lambda x: {'image_id': x[0], 'caption': x[1]}, list_valpred_ids))
list_json

[{'caption': 'A man riding a skateboard down a ramp.', 'image_id': 157352},
 {'caption': 'A man wearing a blue tie and a blue shirt.', 'image_id': 172087},
 {'caption': 'A pair of smart phones sitting on top of a table.',
  'image_id': 99428},
 {'caption': 'A person holding a orange in their hand.', 'image_id': 354333},
 {'caption': 'A stop sign with a street sign on it.', 'image_id': 158952},
 {'caption': 'A man talking on a cell phone in a store.', 'image_id': 94746},
 {'caption': 'A cat is laying on a pair of sneakers.', 'image_id': 18833},
 {'caption': 'A couple of horses standing next to each other.',
  'image_id': 299533},
 {'caption': 'A pizza with tomatoes, onions, and green peppers.',
  'image_id': 114335},
 {'caption': 'A cat sitting on a table next to a computer.',
  'image_id': 520531},
 {'caption': 'A baseball player swinging a bat at a ball.',
  'image_id': 282659},
 {'caption': 'A man and a little girl playing with a kite.',
  'image_id': 76844},
 {'caption': 'A boy is d

In [None]:
len(list_valpred_ids)

24302

In [None]:
with open("/content/results_val_dataset.json", "w") as outfile:
    json.dump(list_json, outfile, sort_keys=True, indent=4)

### Evaluation of metrics with the pycocoevalcap tool

In [None]:
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)
coco_result

In [None]:
coco_result.getImgIds()

In [None]:
# reference: <https://github.com/tylin/coco-caption/blob/master/cocoEvalCapDemo.ipynb> and <https://github.com/salaniz/pycocoevalcap>

annotation_file = '/content/annotations/captions_val2014.json'
results_file = '/content/results_val_dataset.json'

# create coco object and coco_result object
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)

# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)

# evaluate on a subset of images by setting
coco_eval.params['image_id'] = coco_result.getImgIds()

# evaluate results
coco_eval.evaluate()

# print output evaluation scores
for metric, score in coco_eval.eval.items():
    print(f'{metric}: {score:.3f}')

loading annotations into memory...
Done (t=0.31s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.07s)
creating index...
index created!
tokenization...
setting up scorers...
Downloading stanford-corenlp-3.6.0 for SPICE ...
Progress: 384.5M / 384.5M (100.0%)
Extracting stanford-corenlp-3.6.0 ...
Done.
computing Bleu score...
{'testlen': 248008, 'reflen': 241655, 'guess': [248008, 223706, 199404, 175102], 'correct': [177853, 94158, 43833, 20080]}
ratio: 1.0262895450125136
Bleu_1: 0.717
Bleu_2: 0.549
Bleu_3: 0.405
Bleu_4: 0.295
computing METEOR score...
METEOR: 0.261
computing Rouge score...
ROUGE_L: 0.537
computing CIDEr score...
CIDEr: 0.970
computing SPICE score...
SPICE: 0.193
Bleu_1: 0.717
Bleu_2: 0.549
Bleu_3: 0.405
Bleu_4: 0.295
METEOR: 0.261
ROUGE_L: 0.537
CIDEr: 0.970
SPICE: 0.193


In [None]:
coco_eval.eval

{'Bleu_1': 0.7171260604496601,
 'Bleu_2': 0.5493986006372232,
 'Bleu_3': 0.404837569757862,
 'Bleu_4': 0.2953444113309968,
 'CIDEr': 0.970441840219344,
 'METEOR': 0.26080322264808553,
 'ROUGE_L': 0.5373853798052349,
 'SPICE': 0.19342599386537462}

# Evaluation on Test Dataset

In [None]:
model = EncoderDecoder(pretrained_model = (config_exp["encoder_pretrained_model"], config_exp["decoder_pretrained_model"]), 
                       eos_token_id=tokenizer.eos_token_id, 
                       pad_token_id=tokenizer.pad_token_id)

model = torch.load("modelos/model_exp008_run02.pt")

run_name = "008_evaluation"
run_id = '32mzuqkb'
table_name = "Table_test_dataset"


In [None]:
list_testpred_ids = eval_results(model, tokenizer, test_dataset, 20, run_name, run_id, table_name)

VBox(children=(Label(value='98.667 MB of 98.667 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

  0%|          | 0/811 [00:00<?, ?it/s]

Test -  BLEU-1: 73.68829763514971/ BLEU-2: 53.49485678202391/ BLEU-3: 38.390965418325564/ BLEU-4: 27.41212649264546


## Evaluation with pycocoeval

### Creating .json file with the ids of the images and respective predictions 

In [None]:
list_json = list(map(lambda x: {'image_id': x[0], 'caption': x[1]}, list_testpred_ids))
list_json

[{'caption': 'A man and a child riding on a motorcycle.', 'image_id': 263834},
 {'caption': 'A motorcycle parked in a field with people standing around.',
  'image_id': 478575},
 {'caption': 'A reflection of a person in a car mirror.', 'image_id': 391392},
 {'caption': 'Two people are walking on a sidewalk with luggage.',
  'image_id': 292456},
 {'caption': 'A parking meter is on the sidewalk near a fence.',
  'image_id': 370678},
 {'caption': 'A bathroom with a sink, tub, and a mirror.', 'image_id': 162280},
 {'caption': 'A group of giraffes standing next to each other in a fenced in area.',
  'image_id': 434804},
 {'caption': 'A black dog is sitting on a skateboard.', 'image_id': 415360},
 {'caption': 'A remote control sitting on top of a laptop computer.',
  'image_id': 76484},
 {'caption': 'A woman holding a donut on a brick wall.', 'image_id': 52016},
 {'caption': 'A woman sitting in a train car with a dog.', 'image_id': 78170},
 {'caption': 'A group of sheep standing on top of a 

In [None]:
with open("/content/results_test_dataset.json", "w") as outfile:
    json.dump(list_json, outfile, sort_keys=True, indent=4)

### Evaluation of metrics with the pycocoevalcap tool

In [None]:
# reference: <https://github.com/tylin/coco-caption/blob/master/cocoEvalCapDemo.ipynb> and <https://github.com/salaniz/pycocoevalcap>

annotation_file = '/content/annotations/captions_val2014.json'
results_file = '/content/results_test_dataset.json'

# create coco object and coco_result object
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)

# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)

# evaluate on a subset of images by setting
coco_eval.params['image_id'] = coco_result.getImgIds()

# evaluate results
coco_eval.evaluate()

# print output evaluation scores
for metric, score in coco_eval.eval.items():
    print(f'{metric}: {score:.3f}')

loading annotations into memory...
Done (t=0.30s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.04s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 165611, 'reflen': 161067, 'guess': [165611, 149409, 133207, 117005], 'correct': [118062, 62086, 28668, 12913]}
ratio: 1.0282118621443188
Bleu_1: 0.713
Bleu_2: 0.544
Bleu_3: 0.399
Bleu_4: 0.290
computing METEOR score...
METEOR: 0.259
computing Rouge score...
ROUGE_L: 0.533
computing CIDEr score...
CIDEr: 0.958
computing SPICE score...
SPICE: 0.191
Bleu_1: 0.713
Bleu_2: 0.544
Bleu_3: 0.399
Bleu_4: 0.290
METEOR: 0.259
ROUGE_L: 0.533
CIDEr: 0.958
SPICE: 0.191


In [None]:
coco_eval.eval

{'Bleu_1': 0.7128874289751242,
 'Bleu_2': 0.5442756924495749,
 'Bleu_3': 0.39948708362328367,
 'Bleu_4': 0.2896227941441834,
 'CIDEr': 0.9584613506921064,
 'METEOR': 0.25870212460277003,
 'ROUGE_L': 0.5328507664293789,
 'SPICE': 0.19090703825037614}

# Evaluation on Categories Filtered

In [None]:
model = EncoderDecoder(pretrained_model = (config_exp["encoder_pretrained_model"], config_exp["decoder_pretrained_model"]), 
                       eos_token_id=tokenizer.eos_token_id, 
                       pad_token_id=tokenizer.pad_token_id)

model = torch.load("modelos/model_exp008_run02.pt")

In [None]:
run_name = "008_evaluation"
run_id = '32mzuqkb'
table_name = "Table_categories_filtered"

In [None]:
coco = COCO(annotation_file="/content/annotations/instances_val2014.json")

categories_coco = ['person', 'bicycle', 'car', 'motorcycle', 'bus', 'train', 'truck']
outdoor_categories = ['traffic light', 'fire hydrant', 'stop sign', 'parking_meter']
filtered_classes = []
classes = [[x,y,z,w] for x,y,z,w in itertools.combinations(categories_coco, 4)]
outdoor_classes = [[x,y] for x,y in itertools.combinations(outdoor_categories, 2)]
print(classes)
print(outdoor_classes)

for i in outdoor_classes:
    for j in classes:
        filtered_classes.append(i+j)

print(filtered_classes)
ids_list = []

for i in filtered_classes:
    ids_list.extend(coco.getImgIds(catIds=coco.getCatIds(i)))

ids_list = np.unique(ids_list)

len(ids_list)

loading annotations into memory...
Done (t=5.99s)
creating index...
index created!
[['person', 'bicycle', 'car', 'motorcycle'], ['person', 'bicycle', 'car', 'bus'], ['person', 'bicycle', 'car', 'train'], ['person', 'bicycle', 'car', 'truck'], ['person', 'bicycle', 'motorcycle', 'bus'], ['person', 'bicycle', 'motorcycle', 'train'], ['person', 'bicycle', 'motorcycle', 'truck'], ['person', 'bicycle', 'bus', 'train'], ['person', 'bicycle', 'bus', 'truck'], ['person', 'bicycle', 'train', 'truck'], ['person', 'car', 'motorcycle', 'bus'], ['person', 'car', 'motorcycle', 'train'], ['person', 'car', 'motorcycle', 'truck'], ['person', 'car', 'bus', 'train'], ['person', 'car', 'bus', 'truck'], ['person', 'car', 'train', 'truck'], ['person', 'motorcycle', 'bus', 'train'], ['person', 'motorcycle', 'bus', 'truck'], ['person', 'motorcycle', 'train', 'truck'], ['person', 'bus', 'train', 'truck'], ['bicycle', 'car', 'motorcycle', 'bus'], ['bicycle', 'car', 'motorcycle', 'train'], ['bicycle', 'car', 'mo

178

In [None]:
dset = CocoDataset(root="/content/val2014/",annFile="/content/annotations/captions_val2014.json", idsList = ids_list, transform=transform_val)
filtered_dset_loader = DataLoader(dset, batch_size=1, shuffle=False, collate_fn = collate_custom)

list_pred_ids = eval_results(model, tokenizer, dset, 1, run_name, run_id, table_name)

loading annotations into memory...
Done (t=0.30s)
creating index...
index created!


  0%|          | 0/178 [00:00<?, ?it/s]

Test -  BLEU-1: 72.88875458357242/ BLEU-2: 49.40809820494262/ BLEU-3: 33.9853252544893/ BLEU-4: 24.66725678358192


In [None]:
for img, y_t, y_all in filtered_dset_loader:
    plt.figure()
    plt.imshow(unorm(img.squeeze().cpu()).permute(1,2,0).numpy())
    plt.title("Caption example: " +str(y_all[0][0]))
    plt.show()
    plt.close()

## Evaluation with pycocoeval

### Creating .json file with the ids of the images and respective predictions 

In [None]:
list_json = list(map(lambda x: {'image_id': x[0], 'caption': x[1]}, list_pred_ids))
list_json

[{'caption': 'Two people riding on a motorcycle down a street.',
  'image_id': 12238},
 {'caption': 'A bus is parked on the side of the road.', 'image_id': 12666},
 {'caption': 'A bus is driving down a busy city street.', 'image_id': 12818},
 {'caption': 'A group of people riding on a scooter down a street.',
  'image_id': 12946},
 {'caption': 'A man and woman riding bicycles down a city street.',
  'image_id': 15663},
 {'caption': 'A man in a hat is walking down the street.', 'image_id': 17267},
 {'caption': 'A city street with a couple of buses and a red bus.',
  'image_id': 26204},
 {'caption': 'A street scene with a streetlight and a car on the street.',
  'image_id': 28675},
 {'caption': 'A yellow and black sign on a pole on a street.',
  'image_id': 32941},
 {'caption': 'A man in a red coat and a red car.', 'image_id': 34786},
 {'caption': 'A city bus is stopped at a traffic light.', 'image_id': 34820},
 {'caption': 'A woman walking down a street holding an umbrella.',
  'image_i

In [None]:
with open("/content/results_catfiltered.json", "w") as outfile:
    json.dump(list_json, outfile, sort_keys=True, indent=4)

### Evaluation of metrics with the pycocoevalcap tool

In [None]:
# reference: <https://github.com/tylin/coco-caption/blob/master/cocoEvalCapDemo.ipynb> and <https://github.com/salaniz/pycocoevalcap>

annotation_file = '/content/annotations/captions_val2014.json'
results_file = '/content/results_catfiltered.json'

# create coco object and coco_result object
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)

# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)

# evaluate on a subset of images by setting
coco_eval.params['image_id'] = coco_result.getImgIds()

# evaluate results
coco_eval.evaluate()

# print output evaluation scores
for metric, score in coco_eval.eval.items():
    print(f'{metric}: {score:.3f}')

loading annotations into memory...
Done (t=0.31s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 1801, 'reflen': 1753, 'guess': [1801, 1623, 1445, 1267], 'correct': [1281, 604, 253, 105]}
ratio: 1.0273816314882902
Bleu_1: 0.711
Bleu_2: 0.514
Bleu_3: 0.359
Bleu_4: 0.249
computing METEOR score...
METEOR: 0.238
computing Rouge score...
ROUGE_L: 0.485
computing CIDEr score...
CIDEr: 0.603
computing SPICE score...
SPICE: 0.189
Bleu_1: 0.711
Bleu_2: 0.514
Bleu_3: 0.359
Bleu_4: 0.249
METEOR: 0.238
ROUGE_L: 0.485
CIDEr: 0.603
SPICE: 0.189


In [None]:
coco_eval.eval

{'Bleu_1': 0.711271515824147,
 'Bleu_2': 0.5144899762371496,
 'Bleu_3': 0.35919931787774084,
 'Bleu_4': 0.24894579705815575,
 'CIDEr': 0.6025545245389485,
 'METEOR': 0.23775710934525082,
 'ROUGE_L': 0.4846863082961032,
 'SPICE': 0.1894818568983909}

# Evaluation on Validation Original Dataset (40k images)

In [None]:
run_name = "008_evaluation"
run_id = None
table_name = "Table_val_40k"

In [None]:
list_val_original = eval_results(model, tokenizer, eval_dataset, 20, run_name, run_id, table_name, save_table=False)

VBox(children=(Label(value='15.213 MB of 15.213 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…

  0%|          | 0/2026 [00:00<?, ?it/s]

Test -  BLEU-1: 73.96813895205912/ BLEU-2: 53.8210318868825/ BLEU-3: 38.729591680777084/ BLEU-4: 27.76274789867914


In [None]:
len(list_val_original)

40504

## Evaluation with pycocoeval

### Creating .json file with the ids of the images and respective predictions 

In [None]:
list_json = list(map(lambda x: {'image_id': x[0], 'caption': x[1]}, list_val_original))
list_json

[{'caption': 'A bag of shoes and a bag of shoes on a rack.', 'image_id': 42},
 {'caption': 'A motorcycle parked next to a wooden bench.', 'image_id': 73},
 {'caption': 'A dog laying on the sidewalk next to a sidewalk.',
  'image_id': 74},
 {'caption': 'A wooden cabinet with wooden shelves and a wooden shelf.',
  'image_id': 133},
 {'caption': 'A group of giraffes standing in a stall.', 'image_id': 136},
 {'caption': 'A living room with a fireplace and a table.', 'image_id': 139},
 {'caption': 'Two birds are sitting on a tree branch.', 'image_id': 143},
 {'caption': 'A kitchen with a stove, refrigerator, and a microwave.',
  'image_id': 164},
 {'caption': 'A baseball player holding a bat on top of a field.',
  'image_id': 192},
 {'caption': 'A table with various dishes of food on it.', 'image_id': 196},
 {'caption': 'A stuffed bear sitting on a bathroom sink.', 'image_id': 208},
 {'caption': 'A man standing in a living room holding a Wii remote.',
  'image_id': 241},
 {'caption': 'A gro

In [None]:
with open("/content/results_val_original_dataset.json", "w") as outfile:
    json.dump(list_json, outfile, sort_keys=True, indent=4)

### Evaluation of metrics with the pycocoevalcap tool

In [None]:
# reference: <https://github.com/tylin/coco-caption/blob/master/cocoEvalCapDemo.ipynb> and <https://github.com/salaniz/pycocoevalcap>

annotation_file = '/content/annotations/captions_val2014.json'
results_file = '/content/results_val_original_dataset.json'

# create coco object and coco_result object
coco = COCO(annotation_file)
coco_result = coco.loadRes(results_file)

# create coco_eval object by taking coco and coco_result
coco_eval = COCOEvalCap(coco, coco_result)

# evaluate on a subset of images by setting
#coco_eval.params['image_id'] = coco_result.getImgIds()

# evaluate results
coco_eval.evaluate()

# print output evaluation scores
for metric, score in coco_eval.eval.items():
    print(f'{metric}: {score:.3f}')

loading annotations into memory...
Done (t=0.27s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.09s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 413505, 'reflen': 402656, 'guess': [413505, 373001, 332497, 291993], 'correct': [295897, 156239, 72495, 32985]}
ratio: 1.0269435945323029
Bleu_1: 0.716
Bleu_2: 0.547
Bleu_3: 0.403
Bleu_4: 0.293
computing METEOR score...
METEOR: 0.260
computing Rouge score...
ROUGE_L: 0.536
computing CIDEr score...
CIDEr: 0.961
computing SPICE score...
SPICE: 0.192
Bleu_1: 0.716
Bleu_2: 0.547
Bleu_3: 0.403
Bleu_4: 0.293
METEOR: 0.260
ROUGE_L: 0.536
CIDEr: 0.961
SPICE: 0.192


In [None]:
coco_eval.eval

{'Bleu_1': 0.7155826410805173,
 'Bleu_2': 0.5474817232826398,
 'Bleu_3': 0.40279727586178116,
 'Bleu_4': 0.2931236808618862,
 'CIDEr': 0.9606763538618883,
 'METEOR': 0.25995672994197483,
 'ROUGE_L': 0.5356207256444347,
 'SPICE': 0.1923482800154548}